Diffstat (limited to 'kernel')
-rw-r--r--  kernel/backtracetest.c | 1
-rw-r--r--  kernel/bpf/Makefile | 8
-rw-r--r--  kernel/bpf/bpf_local_storage.c | 4
-rw-r--r--  kernel/bpf/bpf_lsm.c | 1
-rw-r--r--  kernel/bpf/bpf_struct_ops.c | 77
-rw-r--r--  kernel/bpf/btf.c | 509
-rw-r--r--  kernel/bpf/core.c | 8
-rw-r--r--  kernel/bpf/cpumap.c | 35
-rw-r--r--  kernel/bpf/crypto.c | 42
-rw-r--r--  kernel/bpf/devmap.c | 57
-rw-r--r--  kernel/bpf/helpers.c | 263
-rw-r--r--  kernel/bpf/log.c | 6
-rw-r--r--  kernel/bpf/memalloc.c | 9
-rw-r--r--  kernel/bpf/syscall.c | 44
-rw-r--r--  kernel/bpf/task_iter.c | 9
-rw-r--r--  kernel/bpf/verifier.c | 328
-rw-r--r--  kernel/cgroup/cgroup.c | 36
-rw-r--r--  kernel/cgroup/cpuset.c | 197
-rw-r--r--  kernel/cgroup/misc.c | 80
-rw-r--r--  kernel/cgroup/pids.c | 129
-rw-r--r--  kernel/cgroup/rstat.c | 37
-rw-r--r--  kernel/configs/hardening.config | 1
-rw-r--r--  kernel/cpu.c | 19
-rw-r--r--  kernel/crash_reserve.c | 1
-rw-r--r--  kernel/debug/kdb/kdb_bt.c | 2
-rw-r--r--  kernel/debug/kdb/kdb_io.c | 6
-rw-r--r--  kernel/debug/kdb/kdb_main.c | 18
-rw-r--r--  kernel/debug/kdb/kdb_private.h | 2
-rw-r--r--  kernel/delayacct.c | 2
-rw-r--r--  kernel/dma/direct.c | 10
-rw-r--r--  kernel/dma/direct.h | 9
-rw-r--r--  kernel/dma/map_benchmark.c | 16
-rw-r--r--  kernel/dma/mapping.c | 2
-rw-r--r--  kernel/dma/swiotlb.c | 68
-rw-r--r--  kernel/events/callchain.c | 47
-rw-r--r--  kernel/events/core.c | 188
-rw-r--r--  kernel/events/internal.h | 6
-rw-r--r--  kernel/events/ring_buffer.c | 7
-rw-r--r--  kernel/events/uprobes.c | 35
-rw-r--r--  kernel/exit.c | 86
-rw-r--r--  kernel/fork.c | 64
-rw-r--r--  kernel/hung_task.c | 4
-rw-r--r--  kernel/irq/debugfs.c | 10
-rw-r--r--  kernel/irq/devres.c | 41
-rw-r--r--  kernel/irq/generic-chip.c | 111
-rw-r--r--  kernel/irq/internals.h | 10
-rw-r--r--  kernel/irq/irq_sim.c | 60
-rw-r--r--  kernel/irq/irqdomain.c | 290
-rw-r--r--  kernel/irq/manage.c | 2
-rw-r--r--  kernel/irq/msi.c | 95
-rw-r--r--  kernel/irq/proc.c | 7
-rw-r--r--  kernel/jump_label.c | 74
-rw-r--r--  kernel/kallsyms.c | 5
-rw-r--r--  kernel/kallsyms_internal.h | 6
-rw-r--r--  kernel/kcsan/kcsan_test.c | 1
-rw-r--r--  kernel/kexec_core.c | 2
-rw-r--r--  kernel/kprobes.c | 2
-rw-r--r--  kernel/latencytop.c | 2
-rw-r--r--  kernel/livepatch/core.c | 17
-rw-r--r--  kernel/locking/lockdep.c | 10
-rw-r--r--  kernel/locking/locktorture.c | 1
-rw-r--r--  kernel/locking/qspinlock.c | 2
-rw-r--r--  kernel/locking/rwsem.c | 6
-rw-r--r--  kernel/locking/spinlock.c | 8
-rw-r--r--  kernel/module/main.c | 5
-rw-r--r--  kernel/panic.c | 116
-rw-r--r--  kernel/pid_namespace.c | 19
-rw-r--r--  kernel/pid_sysctl.h | 2
-rw-r--r--  kernel/power/swap.c | 5
-rw-r--r--  kernel/printk/console_cmdline.h | 1
-rw-r--r--  kernel/printk/internal.h | 2
-rw-r--r--  kernel/printk/printk.c | 111
-rw-r--r--  kernel/printk/sysctl.c | 2
-rw-r--r--  kernel/rcu/rcuscale.c | 1
-rw-r--r--  kernel/rcu/rcutorture.c | 49
-rw-r--r--  kernel/rcu/refscale.c | 1
-rw-r--r--  kernel/rcu/srcutiny.c | 3
-rw-r--r--  kernel/rcu/srcutree.c | 13
-rw-r--r--  kernel/rcu/sync.c | 12
-rw-r--r--  kernel/rcu/tasks.h | 26
-rw-r--r--  kernel/rcu/tree.c | 92
-rw-r--r--  kernel/rcu/tree.h | 2
-rw-r--r--  kernel/rcu/tree_exp.h | 24
-rw-r--r--  kernel/rcu/tree_nocb.h | 147
-rw-r--r--  kernel/rcu/tree_plugin.h | 14
-rw-r--r--  kernel/rcu/tree_stall.h | 4
-rw-r--r--  kernel/resource.c | 68
-rw-r--r--  kernel/resource_kunit.c | 1
-rw-r--r--  kernel/scftorture.c | 3
-rw-r--r--  kernel/sched/build_policy.c | 1
-rw-r--r--  kernel/sched/clock.c | 4
-rw-r--r--  kernel/sched/core.c | 1901
-rw-r--r--  kernel/sched/core_sched.c | 2
-rw-r--r--  kernel/sched/cputime.c | 14
-rw-r--r--  kernel/sched/deadline.c | 15
-rw-r--r--  kernel/sched/fair.c | 30
-rw-r--r--  kernel/sched/idle.c | 12
-rw-r--r--  kernel/sched/loadavg.c | 4
-rw-r--r--  kernel/sched/pelt.c | 4
-rw-r--r--  kernel/sched/psi.c | 81
-rw-r--r--  kernel/sched/rt.c | 30
-rw-r--r--  kernel/sched/sched.h | 435
-rw-r--r--  kernel/sched/stats.h | 13
-rw-r--r--  kernel/sched/syscalls.c | 1699
-rw-r--r--  kernel/sched/topology.c | 14
-rw-r--r--  kernel/sched/wait_bit.c | 4
-rw-r--r--  kernel/seccomp.c | 32
-rw-r--r--  kernel/signal.c | 8
-rw-r--r--  kernel/smp.c | 5
-rw-r--r--  kernel/stackleak.c | 2
-rw-r--r--  kernel/sys_ni.c | 4
-rw-r--r--  kernel/sysctl-test.c | 50
-rw-r--r--  kernel/sysctl.c | 95
-rw-r--r--  kernel/task_work.c | 58
-rw-r--r--  kernel/time/clocksource-wdtest.c | 1
-rw-r--r--  kernel/time/test_udelay.c | 1
-rw-r--r--  kernel/time/tick-broadcast.c | 23
-rw-r--r--  kernel/time/tick-sched.c | 22
-rw-r--r--  kernel/time/time_test.c | 1
-rw-r--r--  kernel/time/timekeeping.c | 131
-rw-r--r--  kernel/time/timer.c | 2
-rw-r--r--  kernel/torture.c | 1
-rw-r--r--  kernel/trace/bpf_trace.c | 15
-rw-r--r--  kernel/trace/fgraph.c | 1054
-rw-r--r--  kernel/trace/ftrace.c | 691
-rw-r--r--  kernel/trace/ftrace_internal.h | 18
-rw-r--r--  kernel/trace/pid_list.c | 5
-rw-r--r--  kernel/trace/trace.c | 2
-rw-r--r--  kernel/trace/trace.h | 93
-rw-r--r--  kernel/trace/trace_events_user.c | 2
-rw-r--r--  kernel/trace/trace_functions.c | 15
-rw-r--r--  kernel/trace/trace_functions_graph.c | 96
-rw-r--r--  kernel/trace/trace_irqsoff.c | 10
-rw-r--r--  kernel/trace/trace_kprobe.c | 192
-rw-r--r--  kernel/trace/trace_osnoise.c | 4
-rw-r--r--  kernel/trace/trace_sched_wakeup.c | 10
-rw-r--r--  kernel/trace/trace_selftest.c | 259
-rw-r--r--  kernel/trace/trace_stack.c | 2
-rw-r--r--  kernel/tsacct.c | 2
-rw-r--r--  kernel/umh.c | 2
-rw-r--r--  kernel/utsname_sysctl.c | 4
-rw-r--r--  kernel/vmcore_info.c | 4
-rw-r--r--  kernel/watchdog.c | 12
-rw-r--r--  kernel/watchdog_perf.c | 11
-rw-r--r--  kernel/workqueue.c | 352
145 files changed, 7291 insertions(+), 4221 deletions(-)
diff --git a/kernel/backtracetest.c b/kernel/backtracetest.c
index a4181234232b..2dfe66b9ed76 100644
--- a/kernel/backtracetest.c
+++ b/kernel/backtracetest.c
@@ -74,5 +74,6 @@ static void exitf(void)
module_init(backtrace_regression_test);
module_exit(exitf);
+MODULE_DESCRIPTION("Simple stack backtrace regression test module");
MODULE_LICENSE("GPL");
MODULE_AUTHOR("Arjan van de Ven <arjan@linux.intel.com>");
diff --git a/kernel/bpf/Makefile b/kernel/bpf/Makefile
index 7eb9ad3a3ae6..0291eef9ce92 100644
--- a/kernel/bpf/Makefile
+++ b/kernel/bpf/Makefile
@@ -50,5 +50,11 @@ endif
obj-$(CONFIG_BPF_PRELOAD) += preload/
obj-$(CONFIG_BPF_SYSCALL) += relo_core.o
-$(obj)/relo_core.o: $(srctree)/tools/lib/bpf/relo_core.c FORCE
+obj-$(CONFIG_BPF_SYSCALL) += btf_iter.o
+obj-$(CONFIG_BPF_SYSCALL) += btf_relocate.o
+
+# Some source files are common to libbpf.
+vpath %.c $(srctree)/kernel/bpf:$(srctree)/tools/lib/bpf
+
+$(obj)/%.o: %.c FORCE
$(call if_changed_rule,cc_o_c)
diff --git a/kernel/bpf/bpf_local_storage.c b/kernel/bpf/bpf_local_storage.c
index 976cb258a0ed..c938dea5ddbf 100644
--- a/kernel/bpf/bpf_local_storage.c
+++ b/kernel/bpf/bpf_local_storage.c
@@ -782,8 +782,8 @@ bpf_local_storage_map_alloc(union bpf_attr *attr,
nbuckets = max_t(u32, 2, nbuckets);
smap->bucket_log = ilog2(nbuckets);
- smap->buckets = bpf_map_kvcalloc(&smap->map, sizeof(*smap->buckets),
- nbuckets, GFP_USER | __GFP_NOWARN);
+ smap->buckets = bpf_map_kvcalloc(&smap->map, nbuckets,
+ sizeof(*smap->buckets), GFP_USER | __GFP_NOWARN);
if (!smap->buckets) {
err = -ENOMEM;
goto free_smap;
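The bpf_local_storage.c hunk above only swaps the bpf_map_kvcalloc() arguments into the (n, size) order used by calloc() and kvcalloc(). A minimal userspace sketch of that convention, assuming nothing beyond the C library; xcalloc and the sizes are illustrative only:

  #include <stdint.h>
  #include <stdio.h>
  #include <stdlib.h>

  /* calloc-style wrapper: element count first, element size second,
   * refusing requests whose total size would overflow.
   */
  static void *xcalloc(size_t n, size_t size)
  {
          if (size && n > SIZE_MAX / size)
                  return NULL;
          return calloc(n, size);
  }

  int main(void)
  {
          /* e.g. 32 hash buckets of 64 bytes each, as with smap->buckets */
          void *buckets = xcalloc(32, 64);

          printf("allocated: %s\n", buckets ? "yes" : "no");
          free(buckets);
          return 0;
  }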
diff --git a/kernel/bpf/bpf_lsm.c b/kernel/bpf/bpf_lsm.c
index 68240c3c6e7d..08a338e1f231 100644
--- a/kernel/bpf/bpf_lsm.c
+++ b/kernel/bpf/bpf_lsm.c
@@ -280,6 +280,7 @@ BTF_ID(func, bpf_lsm_cred_prepare)
BTF_ID(func, bpf_lsm_file_ioctl)
BTF_ID(func, bpf_lsm_file_lock)
BTF_ID(func, bpf_lsm_file_open)
+BTF_ID(func, bpf_lsm_file_post_open)
BTF_ID(func, bpf_lsm_file_receive)
BTF_ID(func, bpf_lsm_inode_create)
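The new BTF_ID entry above adds file_post_open to the list of LSM hooks reachable from BPF (the surrounding entries belong to what appears to be the sleepable-hook set). A hedged sketch of a program that could attach there, assuming a libbpf-style build with a vmlinux.h generated from a kernel that has this hook; the program name and body are illustrative only:

  #include "vmlinux.h"
  #include <bpf/bpf_helpers.h>
  #include <bpf/bpf_tracing.h>

  char LICENSE[] SEC("license") = "GPL";

  /* Sleepable LSM program observing successful file opens; the hook's full
   * argument list is elided and only the first argument is used, which
   * BPF_PROG() permits.
   */
  SEC("lsm.s/file_post_open")
  int BPF_PROG(observe_post_open, struct file *file)
  {
          bpf_printk("file_post_open");
          return 0;       /* post-op hooks do not veto the operation */
  }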
diff --git a/kernel/bpf/bpf_struct_ops.c b/kernel/bpf/bpf_struct_ops.c
index 86c7884abaf8..0d515ec57aa5 100644
--- a/kernel/bpf/bpf_struct_ops.c
+++ b/kernel/bpf/bpf_struct_ops.c
@@ -12,6 +12,7 @@
#include <linux/mutex.h>
#include <linux/btf_ids.h>
#include <linux/rcupdate_wait.h>
+#include <linux/poll.h>
struct bpf_struct_ops_value {
struct bpf_struct_ops_common_value common;
@@ -56,6 +57,7 @@ struct bpf_struct_ops_map {
struct bpf_struct_ops_link {
struct bpf_link link;
struct bpf_map __rcu *map;
+ wait_queue_head_t wait_hup;
};
static DEFINE_MUTEX(update_mutex);
@@ -571,7 +573,7 @@ int bpf_struct_ops_prepare_trampoline(struct bpf_tramp_links *tlinks,
}
size = arch_prepare_bpf_trampoline(NULL, image + image_off,
- image + PAGE_SIZE,
+ image + image_off + size,
model, flags, tlinks, stub_func);
if (size <= 0) {
if (image != *_image)
@@ -757,7 +759,7 @@ static long bpf_struct_ops_map_update_elem(struct bpf_map *map, void *key,
goto unlock;
}
- err = st_ops->reg(kdata);
+ err = st_ops->reg(kdata, NULL);
if (likely(!err)) {
/* This refcnt increment on the map here after
* 'st_ops->reg()' is secure since the state of the
@@ -805,7 +807,7 @@ static long bpf_struct_ops_map_delete_elem(struct bpf_map *map, void *key)
BPF_STRUCT_OPS_STATE_TOBEFREE);
switch (prev_state) {
case BPF_STRUCT_OPS_STATE_INUSE:
- st_map->st_ops_desc->st_ops->unreg(&st_map->kvalue.data);
+ st_map->st_ops_desc->st_ops->unreg(&st_map->kvalue.data, NULL);
bpf_map_put(map);
return 0;
case BPF_STRUCT_OPS_STATE_TOBEFREE:
@@ -1057,10 +1059,7 @@ static void bpf_struct_ops_map_link_dealloc(struct bpf_link *link)
st_map = (struct bpf_struct_ops_map *)
rcu_dereference_protected(st_link->map, true);
if (st_map) {
- /* st_link->map can be NULL if
- * bpf_struct_ops_link_create() fails to register.
- */
- st_map->st_ops_desc->st_ops->unreg(&st_map->kvalue.data);
+ st_map->st_ops_desc->st_ops->unreg(&st_map->kvalue.data, link);
bpf_map_put(&st_map->map);
}
kfree(st_link);
@@ -1075,7 +1074,8 @@ static void bpf_struct_ops_map_link_show_fdinfo(const struct bpf_link *link,
st_link = container_of(link, struct bpf_struct_ops_link, link);
rcu_read_lock();
map = rcu_dereference(st_link->map);
- seq_printf(seq, "map_id:\t%d\n", map->id);
+ if (map)
+ seq_printf(seq, "map_id:\t%d\n", map->id);
rcu_read_unlock();
}
@@ -1088,7 +1088,8 @@ static int bpf_struct_ops_map_link_fill_link_info(const struct bpf_link *link,
st_link = container_of(link, struct bpf_struct_ops_link, link);
rcu_read_lock();
map = rcu_dereference(st_link->map);
- info->struct_ops.map_id = map->id;
+ if (map)
+ info->struct_ops.map_id = map->id;
rcu_read_unlock();
return 0;
}
@@ -1113,6 +1114,10 @@ static int bpf_struct_ops_map_link_update(struct bpf_link *link, struct bpf_map
mutex_lock(&update_mutex);
old_map = rcu_dereference_protected(st_link->map, lockdep_is_held(&update_mutex));
+ if (!old_map) {
+ err = -ENOLINK;
+ goto err_out;
+ }
if (expected_old_map && old_map != expected_old_map) {
err = -EPERM;
goto err_out;
@@ -1125,7 +1130,7 @@ static int bpf_struct_ops_map_link_update(struct bpf_link *link, struct bpf_map
goto err_out;
}
- err = st_map->st_ops_desc->st_ops->update(st_map->kvalue.data, old_st_map->kvalue.data);
+ err = st_map->st_ops_desc->st_ops->update(st_map->kvalue.data, old_st_map->kvalue.data, link);
if (err)
goto err_out;
@@ -1139,11 +1144,53 @@ err_out:
return err;
}
+static int bpf_struct_ops_map_link_detach(struct bpf_link *link)
+{
+ struct bpf_struct_ops_link *st_link = container_of(link, struct bpf_struct_ops_link, link);
+ struct bpf_struct_ops_map *st_map;
+ struct bpf_map *map;
+
+ mutex_lock(&update_mutex);
+
+ map = rcu_dereference_protected(st_link->map, lockdep_is_held(&update_mutex));
+ if (!map) {
+ mutex_unlock(&update_mutex);
+ return 0;
+ }
+ st_map = container_of(map, struct bpf_struct_ops_map, map);
+
+ st_map->st_ops_desc->st_ops->unreg(&st_map->kvalue.data, link);
+
+ RCU_INIT_POINTER(st_link->map, NULL);
+ /* Pair with bpf_map_get() in bpf_struct_ops_link_create() or
+ * bpf_map_inc() in bpf_struct_ops_map_link_update().
+ */
+ bpf_map_put(&st_map->map);
+
+ mutex_unlock(&update_mutex);
+
+ wake_up_interruptible_poll(&st_link->wait_hup, EPOLLHUP);
+
+ return 0;
+}
+
+static __poll_t bpf_struct_ops_map_link_poll(struct file *file,
+ struct poll_table_struct *pts)
+{
+ struct bpf_struct_ops_link *st_link = file->private_data;
+
+ poll_wait(file, &st_link->wait_hup, pts);
+
+ return rcu_access_pointer(st_link->map) ? 0 : EPOLLHUP;
+}
+
static const struct bpf_link_ops bpf_struct_ops_map_lops = {
.dealloc = bpf_struct_ops_map_link_dealloc,
+ .detach = bpf_struct_ops_map_link_detach,
.show_fdinfo = bpf_struct_ops_map_link_show_fdinfo,
.fill_link_info = bpf_struct_ops_map_link_fill_link_info,
.update_map = bpf_struct_ops_map_link_update,
+ .poll = bpf_struct_ops_map_link_poll,
};
int bpf_struct_ops_link_create(union bpf_attr *attr)
@@ -1176,13 +1223,21 @@ int bpf_struct_ops_link_create(union bpf_attr *attr)
if (err)
goto err_out;
- err = st_map->st_ops_desc->st_ops->reg(st_map->kvalue.data);
+ init_waitqueue_head(&link->wait_hup);
+
+ /* Hold the update_mutex such that the subsystem cannot
+ * do link->ops->detach() before the link is fully initialized.
+ */
+ mutex_lock(&update_mutex);
+ err = st_map->st_ops_desc->st_ops->reg(st_map->kvalue.data, &link->link);
if (err) {
+ mutex_unlock(&update_mutex);
bpf_link_cleanup(&link_primer);
link = NULL;
goto err_out;
}
RCU_INIT_POINTER(link->map, map);
+ mutex_unlock(&update_mutex);
return bpf_link_settle(&link_primer);
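With the .detach and .poll link operations added above, user space can watch a struct_ops link fd and learn when the kernel side detaches it: the poll handler reports EPOLLHUP once st_link->map has been cleared. A minimal userspace sketch, assuming link_fd comes from bpf_link_create() or a struct_ops skeleton and is passed in by number here:

  #include <poll.h>
  #include <stdio.h>
  #include <stdlib.h>

  /* Block until the kernel detaches the struct_ops link behind link_fd. */
  static int wait_for_detach(int link_fd)
  {
          struct pollfd pfd = { .fd = link_fd, .events = POLLHUP };

          if (poll(&pfd, 1, -1) < 0)
                  return -1;
          if (pfd.revents & POLLHUP)
                  printf("struct_ops link detached by the kernel\n");
          return 0;
  }

  int main(int argc, char **argv)
  {
          if (argc < 2)
                  return 0;
          return wait_for_detach(atoi(argv[1]));
  }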
diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c
index 821063660d9f..520f49f422fe 100644
--- a/kernel/bpf/btf.c
+++ b/kernel/bpf/btf.c
@@ -274,6 +274,7 @@ struct btf {
u32 start_str_off; /* first string offset (0 for base BTF) */
char name[MODULE_NAME_LEN];
bool kernel_btf;
+ __u32 *base_id_map; /* map from distilled base BTF -> vmlinux BTF ids */
};
enum verifier_phase {
@@ -414,7 +415,7 @@ const char *btf_type_str(const struct btf_type *t)
struct btf_show {
u64 flags;
void *target; /* target of show operation (seq file, buffer) */
- void (*showfn)(struct btf_show *show, const char *fmt, va_list args);
+ __printf(2, 0) void (*showfn)(struct btf_show *show, const char *fmt, va_list args);
const struct btf *btf;
/* below are used during iteration */
struct {
@@ -530,6 +531,11 @@ static bool btf_type_is_decl_tag_target(const struct btf_type *t)
btf_type_is_var(t) || btf_type_is_typedef(t);
}
+bool btf_is_vmlinux(const struct btf *btf)
+{
+ return btf->kernel_btf && !btf->base_btf;
+}
+
u32 btf_nr_types(const struct btf *btf)
{
u32 total = 0;
@@ -772,7 +778,7 @@ static bool __btf_name_char_ok(char c, bool first)
return true;
}
-static const char *btf_str_by_offset(const struct btf *btf, u32 offset)
+const char *btf_str_by_offset(const struct btf *btf, u32 offset)
{
while (offset < btf->start_str_off)
btf = btf->base_btf;
@@ -1670,14 +1676,8 @@ static void btf_free_kfunc_set_tab(struct btf *btf)
if (!tab)
return;
- /* For module BTF, we directly assign the sets being registered, so
- * there is nothing to free except kfunc_set_tab.
- */
- if (btf_is_module(btf))
- goto free_tab;
for (hook = 0; hook < ARRAY_SIZE(tab->sets); hook++)
kfree(tab->sets[hook]);
-free_tab:
kfree(tab);
btf->kfunc_set_tab = NULL;
}
@@ -1735,7 +1735,12 @@ static void btf_free(struct btf *btf)
kvfree(btf->types);
kvfree(btf->resolved_sizes);
kvfree(btf->resolved_ids);
- kvfree(btf->data);
+ /* vmlinux does not allocate btf->data, it simply points it at
+ * __start_BTF.
+ */
+ if (!btf_is_vmlinux(btf))
+ kvfree(btf->data);
+ kvfree(btf->base_id_map);
kfree(btf);
}
@@ -1764,6 +1769,23 @@ void btf_put(struct btf *btf)
}
}
+struct btf *btf_base_btf(const struct btf *btf)
+{
+ return btf->base_btf;
+}
+
+const struct btf_header *btf_header(const struct btf *btf)
+{
+ return &btf->hdr;
+}
+
+void btf_set_base_btf(struct btf *btf, const struct btf *base_btf)
+{
+ btf->base_btf = (struct btf *)base_btf;
+ btf->start_id = btf_nr_types(base_btf);
+ btf->start_str_off = base_btf->hdr.str_len;
+}
+
static int env_resolve_init(struct btf_verifier_env *env)
{
struct btf *btf = env->btf;
@@ -3442,10 +3464,12 @@ btf_find_graph_root(const struct btf *btf, const struct btf_type *pt,
goto end; \
}
-static int btf_get_field_type(const char *name, u32 field_mask, u32 *seen_mask,
+static int btf_get_field_type(const struct btf *btf, const struct btf_type *var_type,
+ u32 field_mask, u32 *seen_mask,
int *align, int *sz)
{
int type = 0;
+ const char *name = __btf_name_by_offset(btf, var_type->name_off);
if (field_mask & BPF_SPIN_LOCK) {
if (!strcmp(name, "bpf_spin_lock")) {
@@ -3481,7 +3505,7 @@ static int btf_get_field_type(const char *name, u32 field_mask, u32 *seen_mask,
field_mask_test_name(BPF_REFCOUNT, "bpf_refcount");
/* Only return BPF_KPTR when all other types with matchable names fail */
- if (field_mask & BPF_KPTR) {
+ if (field_mask & BPF_KPTR && !__btf_type_is_struct(var_type)) {
type = BPF_KPTR_REF;
goto end;
}
@@ -3494,140 +3518,232 @@ end:
#undef field_mask_test_name
+/* Repeat a number of fields for a specified number of times.
+ *
+ * Copy the fields starting from the first field and repeat them for
+ * repeat_cnt times. The fields are repeated by adding the offset of each
+ * field with
+ * (i + 1) * elem_size
+ * where i is the repeat index and elem_size is the size of an element.
+ */
+static int btf_repeat_fields(struct btf_field_info *info,
+ u32 field_cnt, u32 repeat_cnt, u32 elem_size)
+{
+ u32 i, j;
+ u32 cur;
+
+ /* Ensure not repeating fields that should not be repeated. */
+ for (i = 0; i < field_cnt; i++) {
+ switch (info[i].type) {
+ case BPF_KPTR_UNREF:
+ case BPF_KPTR_REF:
+ case BPF_KPTR_PERCPU:
+ case BPF_LIST_HEAD:
+ case BPF_RB_ROOT:
+ break;
+ default:
+ return -EINVAL;
+ }
+ }
+
+ cur = field_cnt;
+ for (i = 0; i < repeat_cnt; i++) {
+ memcpy(&info[cur], &info[0], field_cnt * sizeof(info[0]));
+ for (j = 0; j < field_cnt; j++)
+ info[cur++].off += (i + 1) * elem_size;
+ }
+
+ return 0;
+}
+
static int btf_find_struct_field(const struct btf *btf,
const struct btf_type *t, u32 field_mask,
- struct btf_field_info *info, int info_cnt)
+ struct btf_field_info *info, int info_cnt,
+ u32 level);
+
+/* Find special fields in the struct type of a field.
+ *
+ * This function is used to find fields of special types that are not a
+ * global variable or a direct field of a struct type. It also handles the
+ * repetition if it is the element type of an array.
+ */
+static int btf_find_nested_struct(const struct btf *btf, const struct btf_type *t,
+ u32 off, u32 nelems,
+ u32 field_mask, struct btf_field_info *info,
+ int info_cnt, u32 level)
{
- int ret, idx = 0, align, sz, field_type;
- const struct btf_member *member;
+ int ret, err, i;
+
+ level++;
+ if (level >= MAX_RESOLVE_DEPTH)
+ return -E2BIG;
+
+ ret = btf_find_struct_field(btf, t, field_mask, info, info_cnt, level);
+
+ if (ret <= 0)
+ return ret;
+
+ /* Shift the offsets of the nested struct fields to the offsets
+ * related to the container.
+ */
+ for (i = 0; i < ret; i++)
+ info[i].off += off;
+
+ if (nelems > 1) {
+ err = btf_repeat_fields(info, ret, nelems - 1, t->size);
+ if (err == 0)
+ ret *= nelems;
+ else
+ ret = err;
+ }
+
+ return ret;
+}
+
+static int btf_find_field_one(const struct btf *btf,
+ const struct btf_type *var,
+ const struct btf_type *var_type,
+ int var_idx,
+ u32 off, u32 expected_size,
+ u32 field_mask, u32 *seen_mask,
+ struct btf_field_info *info, int info_cnt,
+ u32 level)
+{
+ int ret, align, sz, field_type;
struct btf_field_info tmp;
+ const struct btf_array *array;
+ u32 i, nelems = 1;
+
+ /* Walk into array types to find the element type and the number of
+ * elements in the (flattened) array.
+ */
+ for (i = 0; i < MAX_RESOLVE_DEPTH && btf_type_is_array(var_type); i++) {
+ array = btf_array(var_type);
+ nelems *= array->nelems;
+ var_type = btf_type_by_id(btf, array->type);
+ }
+ if (i == MAX_RESOLVE_DEPTH)
+ return -E2BIG;
+ if (nelems == 0)
+ return 0;
+
+ field_type = btf_get_field_type(btf, var_type,
+ field_mask, seen_mask, &align, &sz);
+ /* Look into variables of struct types */
+ if (!field_type && __btf_type_is_struct(var_type)) {
+ sz = var_type->size;
+ if (expected_size && expected_size != sz * nelems)
+ return 0;
+ ret = btf_find_nested_struct(btf, var_type, off, nelems, field_mask,
+ &info[0], info_cnt, level);
+ return ret;
+ }
+
+ if (field_type == 0)
+ return 0;
+ if (field_type < 0)
+ return field_type;
+
+ if (expected_size && expected_size != sz * nelems)
+ return 0;
+ if (off % align)
+ return 0;
+
+ switch (field_type) {
+ case BPF_SPIN_LOCK:
+ case BPF_TIMER:
+ case BPF_WORKQUEUE:
+ case BPF_LIST_NODE:
+ case BPF_RB_NODE:
+ case BPF_REFCOUNT:
+ ret = btf_find_struct(btf, var_type, off, sz, field_type,
+ info_cnt ? &info[0] : &tmp);
+ if (ret < 0)
+ return ret;
+ break;
+ case BPF_KPTR_UNREF:
+ case BPF_KPTR_REF:
+ case BPF_KPTR_PERCPU:
+ ret = btf_find_kptr(btf, var_type, off, sz,
+ info_cnt ? &info[0] : &tmp);
+ if (ret < 0)
+ return ret;
+ break;
+ case BPF_LIST_HEAD:
+ case BPF_RB_ROOT:
+ ret = btf_find_graph_root(btf, var, var_type,
+ var_idx, off, sz,
+ info_cnt ? &info[0] : &tmp,
+ field_type);
+ if (ret < 0)
+ return ret;
+ break;
+ default:
+ return -EFAULT;
+ }
+
+ if (ret == BTF_FIELD_IGNORE)
+ return 0;
+ if (nelems > info_cnt)
+ return -E2BIG;
+ if (nelems > 1) {
+ ret = btf_repeat_fields(info, 1, nelems - 1, sz);
+ if (ret < 0)
+ return ret;
+ }
+ return nelems;
+}
+
+static int btf_find_struct_field(const struct btf *btf,
+ const struct btf_type *t, u32 field_mask,
+ struct btf_field_info *info, int info_cnt,
+ u32 level)
+{
+ int ret, idx = 0;
+ const struct btf_member *member;
u32 i, off, seen_mask = 0;
for_each_member(i, t, member) {
const struct btf_type *member_type = btf_type_by_id(btf,
member->type);
- field_type = btf_get_field_type(__btf_name_by_offset(btf, member_type->name_off),
- field_mask, &seen_mask, &align, &sz);
- if (field_type == 0)
- continue;
- if (field_type < 0)
- return field_type;
-
off = __btf_member_bit_offset(t, member);
if (off % 8)
/* valid C code cannot generate such BTF */
return -EINVAL;
off /= 8;
- if (off % align)
- continue;
- switch (field_type) {
- case BPF_SPIN_LOCK:
- case BPF_TIMER:
- case BPF_WORKQUEUE:
- case BPF_LIST_NODE:
- case BPF_RB_NODE:
- case BPF_REFCOUNT:
- ret = btf_find_struct(btf, member_type, off, sz, field_type,
- idx < info_cnt ? &info[idx] : &tmp);
- if (ret < 0)
- return ret;
- break;
- case BPF_KPTR_UNREF:
- case BPF_KPTR_REF:
- case BPF_KPTR_PERCPU:
- ret = btf_find_kptr(btf, member_type, off, sz,
- idx < info_cnt ? &info[idx] : &tmp);
- if (ret < 0)
- return ret;
- break;
- case BPF_LIST_HEAD:
- case BPF_RB_ROOT:
- ret = btf_find_graph_root(btf, t, member_type,
- i, off, sz,
- idx < info_cnt ? &info[idx] : &tmp,
- field_type);
- if (ret < 0)
- return ret;
- break;
- default:
- return -EFAULT;
- }
-
- if (ret == BTF_FIELD_IGNORE)
- continue;
- if (idx >= info_cnt)
- return -E2BIG;
- ++idx;
+ ret = btf_find_field_one(btf, t, member_type, i,
+ off, 0,
+ field_mask, &seen_mask,
+ &info[idx], info_cnt - idx, level);
+ if (ret < 0)
+ return ret;
+ idx += ret;
}
return idx;
}
static int btf_find_datasec_var(const struct btf *btf, const struct btf_type *t,
u32 field_mask, struct btf_field_info *info,
- int info_cnt)
+ int info_cnt, u32 level)
{
- int ret, idx = 0, align, sz, field_type;
+ int ret, idx = 0;
const struct btf_var_secinfo *vsi;
- struct btf_field_info tmp;
u32 i, off, seen_mask = 0;
for_each_vsi(i, t, vsi) {
const struct btf_type *var = btf_type_by_id(btf, vsi->type);
const struct btf_type *var_type = btf_type_by_id(btf, var->type);
- field_type = btf_get_field_type(__btf_name_by_offset(btf, var_type->name_off),
- field_mask, &seen_mask, &align, &sz);
- if (field_type == 0)
- continue;
- if (field_type < 0)
- return field_type;
-
off = vsi->offset;
- if (vsi->size != sz)
- continue;
- if (off % align)
- continue;
-
- switch (field_type) {
- case BPF_SPIN_LOCK:
- case BPF_TIMER:
- case BPF_WORKQUEUE:
- case BPF_LIST_NODE:
- case BPF_RB_NODE:
- case BPF_REFCOUNT:
- ret = btf_find_struct(btf, var_type, off, sz, field_type,
- idx < info_cnt ? &info[idx] : &tmp);
- if (ret < 0)
- return ret;
- break;
- case BPF_KPTR_UNREF:
- case BPF_KPTR_REF:
- case BPF_KPTR_PERCPU:
- ret = btf_find_kptr(btf, var_type, off, sz,
- idx < info_cnt ? &info[idx] : &tmp);
- if (ret < 0)
- return ret;
- break;
- case BPF_LIST_HEAD:
- case BPF_RB_ROOT:
- ret = btf_find_graph_root(btf, var, var_type,
- -1, off, sz,
- idx < info_cnt ? &info[idx] : &tmp,
- field_type);
- if (ret < 0)
- return ret;
- break;
- default:
- return -EFAULT;
- }
-
- if (ret == BTF_FIELD_IGNORE)
- continue;
- if (idx >= info_cnt)
- return -E2BIG;
- ++idx;
+ ret = btf_find_field_one(btf, var, var_type, -1, off, vsi->size,
+ field_mask, &seen_mask,
+ &info[idx], info_cnt - idx,
+ level);
+ if (ret < 0)
+ return ret;
+ idx += ret;
}
return idx;
}
@@ -3637,9 +3753,9 @@ static int btf_find_field(const struct btf *btf, const struct btf_type *t,
int info_cnt)
{
if (__btf_type_is_struct(t))
- return btf_find_struct_field(btf, t, field_mask, info, info_cnt);
+ return btf_find_struct_field(btf, t, field_mask, info, info_cnt, 0);
else if (btf_type_is_datasec(t))
- return btf_find_datasec_var(btf, t, field_mask, info, info_cnt);
+ return btf_find_datasec_var(btf, t, field_mask, info, info_cnt, 0);
return -EINVAL;
}
@@ -5726,6 +5842,15 @@ static int find_kern_ctx_type_id(enum bpf_prog_type prog_type)
return ctx_type->type;
}
+bool btf_is_projection_of(const char *pname, const char *tname)
+{
+ if (strcmp(pname, "__sk_buff") == 0 && strcmp(tname, "sk_buff") == 0)
+ return true;
+ if (strcmp(pname, "xdp_md") == 0 && strcmp(tname, "xdp_buff") == 0)
+ return true;
+ return false;
+}
+
bool btf_is_prog_ctx_type(struct bpf_verifier_log *log, const struct btf *btf,
const struct btf_type *t, enum bpf_prog_type prog_type,
int arg)
@@ -5788,9 +5913,7 @@ again:
* int socket_filter_bpf_prog(struct __sk_buff *skb)
* { // no fields of skb are ever used }
*/
- if (strcmp(ctx_tname, "__sk_buff") == 0 && strcmp(tname, "sk_buff") == 0)
- return true;
- if (strcmp(ctx_tname, "xdp_md") == 0 && strcmp(tname, "xdp_buff") == 0)
+ if (btf_is_projection_of(ctx_tname, tname))
return true;
if (strcmp(ctx_tname, tname)) {
/* bpf_user_pt_regs_t is a typedef, so resolve it to
@@ -5982,23 +6105,15 @@ int get_kern_ctx_btf_id(struct bpf_verifier_log *log, enum bpf_prog_type prog_ty
BTF_ID_LIST(bpf_ctx_convert_btf_id)
BTF_ID(struct, bpf_ctx_convert)
-struct btf *btf_parse_vmlinux(void)
+static struct btf *btf_parse_base(struct btf_verifier_env *env, const char *name,
+ void *data, unsigned int data_size)
{
- struct btf_verifier_env *env = NULL;
- struct bpf_verifier_log *log;
struct btf *btf = NULL;
int err;
if (!IS_ENABLED(CONFIG_DEBUG_INFO_BTF))
return ERR_PTR(-ENOENT);
- env = kzalloc(sizeof(*env), GFP_KERNEL | __GFP_NOWARN);
- if (!env)
- return ERR_PTR(-ENOMEM);
-
- log = &env->log;
- log->level = BPF_LOG_KERNEL;
-
btf = kzalloc(sizeof(*btf), GFP_KERNEL | __GFP_NOWARN);
if (!btf) {
err = -ENOMEM;
@@ -6006,10 +6121,10 @@ struct btf *btf_parse_vmlinux(void)
}
env->btf = btf;
- btf->data = __start_BTF;
- btf->data_size = __stop_BTF - __start_BTF;
+ btf->data = data;
+ btf->data_size = data_size;
btf->kernel_btf = true;
- snprintf(btf->name, sizeof(btf->name), "vmlinux");
+ snprintf(btf->name, sizeof(btf->name), "%s", name);
err = btf_parse_hdr(env);
if (err)
@@ -6029,20 +6144,11 @@ struct btf *btf_parse_vmlinux(void)
if (err)
goto errout;
- /* btf_parse_vmlinux() runs under bpf_verifier_lock */
- bpf_ctx_convert.t = btf_type_by_id(btf, bpf_ctx_convert_btf_id[0]);
-
refcount_set(&btf->refcnt, 1);
- err = btf_alloc_id(btf);
- if (err)
- goto errout;
-
- btf_verifier_env_free(env);
return btf;
errout:
- btf_verifier_env_free(env);
if (btf) {
kvfree(btf->types);
kfree(btf);
@@ -6050,19 +6156,61 @@ errout:
return ERR_PTR(err);
}
+struct btf *btf_parse_vmlinux(void)
+{
+ struct btf_verifier_env *env = NULL;
+ struct bpf_verifier_log *log;
+ struct btf *btf;
+ int err;
+
+ env = kzalloc(sizeof(*env), GFP_KERNEL | __GFP_NOWARN);
+ if (!env)
+ return ERR_PTR(-ENOMEM);
+
+ log = &env->log;
+ log->level = BPF_LOG_KERNEL;
+ btf = btf_parse_base(env, "vmlinux", __start_BTF, __stop_BTF - __start_BTF);
+ if (IS_ERR(btf))
+ goto err_out;
+
+ /* btf_parse_vmlinux() runs under bpf_verifier_lock */
+ bpf_ctx_convert.t = btf_type_by_id(btf, bpf_ctx_convert_btf_id[0]);
+ err = btf_alloc_id(btf);
+ if (err) {
+ btf_free(btf);
+ btf = ERR_PTR(err);
+ }
+err_out:
+ btf_verifier_env_free(env);
+ return btf;
+}
+
+/* If .BTF_ids section was created with distilled base BTF, both base and
+ * split BTF ids will need to be mapped to actual base/split ids for
+ * BTF now that it has been relocated.
+ */
+static __u32 btf_relocate_id(const struct btf *btf, __u32 id)
+{
+ if (!btf->base_btf || !btf->base_id_map)
+ return id;
+ return btf->base_id_map[id];
+}
+
#ifdef CONFIG_DEBUG_INFO_BTF_MODULES
-static struct btf *btf_parse_module(const char *module_name, const void *data, unsigned int data_size)
+static struct btf *btf_parse_module(const char *module_name, const void *data,
+ unsigned int data_size, void *base_data,
+ unsigned int base_data_size)
{
+ struct btf *btf = NULL, *vmlinux_btf, *base_btf = NULL;
struct btf_verifier_env *env = NULL;
struct bpf_verifier_log *log;
- struct btf *btf = NULL, *base_btf;
- int err;
+ int err = 0;
- base_btf = bpf_get_btf_vmlinux();
- if (IS_ERR(base_btf))
- return base_btf;
- if (!base_btf)
+ vmlinux_btf = bpf_get_btf_vmlinux();
+ if (IS_ERR(vmlinux_btf))
+ return vmlinux_btf;
+ if (!vmlinux_btf)
return ERR_PTR(-EINVAL);
env = kzalloc(sizeof(*env), GFP_KERNEL | __GFP_NOWARN);
@@ -6072,6 +6220,16 @@ static struct btf *btf_parse_module(const char *module_name, const void *data, u
log = &env->log;
log->level = BPF_LOG_KERNEL;
+ if (base_data) {
+ base_btf = btf_parse_base(env, ".BTF.base", base_data, base_data_size);
+ if (IS_ERR(base_btf)) {
+ err = PTR_ERR(base_btf);
+ goto errout;
+ }
+ } else {
+ base_btf = vmlinux_btf;
+ }
+
btf = kzalloc(sizeof(*btf), GFP_KERNEL | __GFP_NOWARN);
if (!btf) {
err = -ENOMEM;
@@ -6111,12 +6269,22 @@ static struct btf *btf_parse_module(const char *module_name, const void *data, u
if (err)
goto errout;
+ if (base_btf != vmlinux_btf) {
+ err = btf_relocate(btf, vmlinux_btf, &btf->base_id_map);
+ if (err)
+ goto errout;
+ btf_free(base_btf);
+ base_btf = vmlinux_btf;
+ }
+
btf_verifier_env_free(env);
refcount_set(&btf->refcnt, 1);
return btf;
errout:
btf_verifier_env_free(env);
+ if (base_btf != vmlinux_btf)
+ btf_free(base_btf);
if (btf) {
kvfree(btf->data);
kvfree(btf->types);
@@ -6693,7 +6861,7 @@ int btf_struct_access(struct bpf_verifier_log *log,
for (i = 0; i < rec->cnt; i++) {
struct btf_field *field = &rec->fields[i];
u32 offset = field->offset;
- if (off < offset + btf_field_type_size(field->type) && offset < off + size) {
+ if (off < offset + field->size && offset < off + size) {
bpf_log(log,
"direct access to %s is disallowed\n",
btf_field_type_name(field->type));
@@ -7370,8 +7538,8 @@ static void btf_type_show(const struct btf *btf, u32 type_id, void *obj,
btf_type_ops(t)->show(btf, t, type_id, obj, 0, show);
}
-static void btf_seq_show(struct btf_show *show, const char *fmt,
- va_list args)
+__printf(2, 0) static void btf_seq_show(struct btf_show *show, const char *fmt,
+ va_list args)
{
seq_vprintf((struct seq_file *)show->target, fmt, args);
}
@@ -7404,8 +7572,8 @@ struct btf_show_snprintf {
int len; /* length we would have written */
};
-static void btf_snprintf_show(struct btf_show *show, const char *fmt,
- va_list args)
+__printf(2, 0) static void btf_snprintf_show(struct btf_show *show, const char *fmt,
+ va_list args)
{
struct btf_show_snprintf *ssnprintf = (struct btf_show_snprintf *)show;
int len;
@@ -7669,7 +7837,8 @@ static int btf_module_notify(struct notifier_block *nb, unsigned long op,
err = -ENOMEM;
goto out;
}
- btf = btf_parse_module(mod->name, mod->btf_data, mod->btf_data_size);
+ btf = btf_parse_module(mod->name, mod->btf_data, mod->btf_data_size,
+ mod->btf_base_data, mod->btf_base_data_size);
if (IS_ERR(btf)) {
kfree(btf_mod);
if (!IS_ENABLED(CONFIG_MODULE_ALLOW_BTF_MISMATCH)) {
@@ -7993,7 +8162,7 @@ static int btf_populate_kfunc_set(struct btf *btf, enum btf_kfunc_hook hook,
bool add_filter = !!kset->filter;
struct btf_kfunc_set_tab *tab;
struct btf_id_set8 *set;
- u32 set_cnt;
+ u32 set_cnt, i;
int ret;
if (hook >= BTF_KFUNC_HOOK_MAX) {
@@ -8039,21 +8208,15 @@ static int btf_populate_kfunc_set(struct btf *btf, enum btf_kfunc_hook hook,
goto end;
}
- /* We don't need to allocate, concatenate, and sort module sets, because
- * only one is allowed per hook. Hence, we can directly assign the
- * pointer and return.
- */
- if (!vmlinux_set) {
- tab->sets[hook] = add_set;
- goto do_add_filter;
- }
-
/* In case of vmlinux sets, there may be more than one set being
* registered per hook. To create a unified set, we allocate a new set
* and concatenate all individual sets being registered. While each set
* is individually sorted, they may become unsorted when concatenated,
* hence re-sorting the final set again is required to make binary
* searching the set using btf_id_set8_contains function work.
+ *
+ * For module sets, we need to allocate as we may need to relocate
+ * BTF ids.
*/
set_cnt = set ? set->cnt : 0;
@@ -8083,11 +8246,14 @@ static int btf_populate_kfunc_set(struct btf *btf, enum btf_kfunc_hook hook,
/* Concatenate the two sets */
memcpy(set->pairs + set->cnt, add_set->pairs, add_set->cnt * sizeof(set->pairs[0]));
+ /* Now that the set is copied, update with relocated BTF ids */
+ for (i = set->cnt; i < set->cnt + add_set->cnt; i++)
+ set->pairs[i].id = btf_relocate_id(btf, set->pairs[i].id);
+
set->cnt += add_set->cnt;
sort(set->pairs, set->cnt, sizeof(set->pairs[0]), btf_id_cmp_func, NULL);
-do_add_filter:
if (add_filter) {
hook_filter = &tab->hook_filters[hook];
hook_filter->filters[hook_filter->nr_filters++] = kset->filter;
@@ -8207,7 +8373,7 @@ static int __register_btf_kfunc_id_set(enum btf_kfunc_hook hook,
return PTR_ERR(btf);
for (i = 0; i < kset->set->cnt; i++) {
- ret = btf_check_kfunc_protos(btf, kset->set->pairs[i].id,
+ ret = btf_check_kfunc_protos(btf, btf_relocate_id(btf, kset->set->pairs[i].id),
kset->set->pairs[i].flags);
if (ret)
goto err_out;
@@ -8271,7 +8437,7 @@ static int btf_check_dtor_kfuncs(struct btf *btf, const struct btf_id_dtor_kfunc
u32 nr_args, i;
for (i = 0; i < cnt; i++) {
- dtor_btf_id = dtors[i].kfunc_btf_id;
+ dtor_btf_id = btf_relocate_id(btf, dtors[i].kfunc_btf_id);
dtor_func = btf_type_by_id(btf, dtor_btf_id);
if (!dtor_func || !btf_type_is_func(dtor_func))
@@ -8306,7 +8472,7 @@ int register_btf_id_dtor_kfuncs(const struct btf_id_dtor_kfunc *dtors, u32 add_c
{
struct btf_id_dtor_kfunc_tab *tab;
struct btf *btf;
- u32 tab_cnt;
+ u32 tab_cnt, i;
int ret;
btf = btf_get_module_btf(owner);
@@ -8357,6 +8523,13 @@ int register_btf_id_dtor_kfuncs(const struct btf_id_dtor_kfunc *dtors, u32 add_c
btf->dtor_kfunc_tab = tab;
memcpy(tab->dtors + tab->cnt, dtors, add_cnt * sizeof(tab->dtors[0]));
+
+ /* remap BTF ids based on BTF relocation (if any) */
+ for (i = tab_cnt; i < tab_cnt + add_cnt; i++) {
+ tab->dtors[i].btf_id = btf_relocate_id(btf, tab->dtors[i].btf_id);
+ tab->dtors[i].kfunc_btf_id = btf_relocate_id(btf, tab->dtors[i].kfunc_btf_id);
+ }
+
tab->cnt += add_cnt;
sort(tab->dtors, tab->cnt, sizeof(tab->dtors[0]), btf_id_cmp_func, NULL);
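One practical effect of the btf_find_field_one()/btf_repeat_fields() rework above is that special BTF-managed fields no longer have to be direct, top-level members of a map value. A hedged BPF-side sketch of a layout this is meant to enable, assuming a libbpf build with vmlinux.h; struct and map names are illustrative, and note that btf_repeat_fields() only lets kptr, list-head and rbtree-root fields be repeated through arrays:

  #include "vmlinux.h"
  #include <bpf/bpf_helpers.h>

  char LICENSE[] SEC("license") = "GPL";

  struct slot {
          struct task_struct __kptr *task;        /* kptr inside a nested struct */
          __u64 ts;
  };

  struct value {
          struct bpf_spin_lock lock;      /* direct special field, as before */
          struct slot slots[4];           /* kptrs found via array flattening */
  };

  struct {
          __uint(type, BPF_MAP_TYPE_ARRAY);
          __uint(max_entries, 1);
          __type(key, __u32);
          __type(value, struct value);
  } values SEC(".maps");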
diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index 695a0fb2cd4d..7ee62e38faf0 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -1173,8 +1173,7 @@ bpf_jit_binary_pack_alloc(unsigned int proglen, u8 **image_ptr,
}
/* Copy JITed text from rw_header to its final location, the ro_header. */
-int bpf_jit_binary_pack_finalize(struct bpf_prog *prog,
- struct bpf_binary_header *ro_header,
+int bpf_jit_binary_pack_finalize(struct bpf_binary_header *ro_header,
struct bpf_binary_header *rw_header)
{
void *ptr;
@@ -2742,8 +2741,7 @@ static void bpf_free_used_maps(struct bpf_prog_aux *aux)
kfree(aux->used_maps);
}
-void __bpf_free_used_btfs(struct bpf_prog_aux *aux,
- struct btf_mod_pair *used_btfs, u32 len)
+void __bpf_free_used_btfs(struct btf_mod_pair *used_btfs, u32 len)
{
#ifdef CONFIG_BPF_SYSCALL
struct btf_mod_pair *btf_mod;
@@ -2760,7 +2758,7 @@ void __bpf_free_used_btfs(struct bpf_prog_aux *aux,
static void bpf_free_used_btfs(struct bpf_prog_aux *aux)
{
- __bpf_free_used_btfs(aux, aux->used_btfs, aux->used_btf_cnt);
+ __bpf_free_used_btfs(aux->used_btfs, aux->used_btf_cnt);
kfree(aux->used_btfs);
}
diff --git a/kernel/bpf/cpumap.c b/kernel/bpf/cpumap.c
index a8e34416e960..fbdf5a1aabfe 100644
--- a/kernel/bpf/cpumap.c
+++ b/kernel/bpf/cpumap.c
@@ -79,8 +79,6 @@ struct bpf_cpu_map {
struct bpf_cpu_map_entry __rcu **cpu_map;
};
-static DEFINE_PER_CPU(struct list_head, cpu_map_flush_list);
-
static struct bpf_map *cpu_map_alloc(union bpf_attr *attr)
{
u32 value_size = attr->value_size;
@@ -240,12 +238,14 @@ static int cpu_map_bpf_prog_run(struct bpf_cpu_map_entry *rcpu, void **frames,
int xdp_n, struct xdp_cpumap_stats *stats,
struct list_head *list)
{
+ struct bpf_net_context __bpf_net_ctx, *bpf_net_ctx;
int nframes;
if (!rcpu->prog)
return xdp_n;
rcu_read_lock_bh();
+ bpf_net_ctx = bpf_net_ctx_set(&__bpf_net_ctx);
nframes = cpu_map_bpf_prog_run_xdp(rcpu, frames, xdp_n, stats);
@@ -255,6 +255,7 @@ static int cpu_map_bpf_prog_run(struct bpf_cpu_map_entry *rcpu, void **frames,
if (unlikely(!list_empty(list)))
cpu_map_bpf_prog_run_skb(rcpu, list, stats);
+ bpf_net_ctx_clear(bpf_net_ctx);
rcu_read_unlock_bh(); /* resched point, may call do_softirq() */
return nframes;
@@ -706,7 +707,6 @@ static void bq_flush_to_queue(struct xdp_bulk_queue *bq)
*/
static void bq_enqueue(struct bpf_cpu_map_entry *rcpu, struct xdp_frame *xdpf)
{
- struct list_head *flush_list = this_cpu_ptr(&cpu_map_flush_list);
struct xdp_bulk_queue *bq = this_cpu_ptr(rcpu->bulkq);
if (unlikely(bq->count == CPU_MAP_BULK_SIZE))
@@ -723,8 +723,11 @@ static void bq_enqueue(struct bpf_cpu_map_entry *rcpu, struct xdp_frame *xdpf)
*/
bq->q[bq->count++] = xdpf;
- if (!bq->flush_node.prev)
+ if (!bq->flush_node.prev) {
+ struct list_head *flush_list = bpf_net_ctx_get_cpu_map_flush_list();
+
list_add(&bq->flush_node, flush_list);
+ }
}
int cpu_map_enqueue(struct bpf_cpu_map_entry *rcpu, struct xdp_frame *xdpf,
@@ -756,9 +759,8 @@ trace:
return ret;
}
-void __cpu_map_flush(void)
+void __cpu_map_flush(struct list_head *flush_list)
{
- struct list_head *flush_list = this_cpu_ptr(&cpu_map_flush_list);
struct xdp_bulk_queue *bq, *tmp;
list_for_each_entry_safe(bq, tmp, flush_list, flush_node) {
@@ -768,24 +770,3 @@ void __cpu_map_flush(void)
wake_up_process(bq->obj->kthread);
}
}
-
-#ifdef CONFIG_DEBUG_NET
-bool cpu_map_check_flush(void)
-{
- if (list_empty(this_cpu_ptr(&cpu_map_flush_list)))
- return false;
- __cpu_map_flush();
- return true;
-}
-#endif
-
-static int __init cpu_map_init(void)
-{
- int cpu;
-
- for_each_possible_cpu(cpu)
- INIT_LIST_HEAD(&per_cpu(cpu_map_flush_list, cpu));
- return 0;
-}
-
-subsys_initcall(cpu_map_init);
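The cpumap hunks above (and the matching devmap hunks below) replace a boot-initialized per-CPU flush list with one carried in a bpf_net_context that is installed around each BPF run via bpf_net_ctx_set()/bpf_net_ctx_clear(). A loose userspace analogy of that scoping pattern, offered only as a sketch; every name here is illustrative and a thread-local pointer stands in for the per-task context used by the kernel:

  #include <stdio.h>

  struct run_ctx {
          int pending;                    /* stands in for the flush list */
  };

  static __thread struct run_ctx *cur_ctx;

  static void process_batch(void)
  {
          struct run_ctx ctx = { 0 };     /* on stack, like __bpf_net_ctx */

          cur_ctx = &ctx;                 /* bpf_net_ctx_set() */
          cur_ctx->pending++;             /* bq_enqueue() adding work */
          printf("flushing %d queue(s)\n", cur_ctx->pending);
          cur_ctx = NULL;                 /* bpf_net_ctx_clear() */
  }

  int main(void)
  {
          process_batch();
          return 0;
  }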
diff --git a/kernel/bpf/crypto.c b/kernel/bpf/crypto.c
index 2bee4af91e38..94854cd9c4cc 100644
--- a/kernel/bpf/crypto.c
+++ b/kernel/bpf/crypto.c
@@ -275,7 +275,7 @@ static int bpf_crypto_crypt(const struct bpf_crypto_ctx *ctx,
if (__bpf_dynptr_is_rdonly(dst))
return -EINVAL;
- siv_len = __bpf_dynptr_size(siv);
+ siv_len = siv ? __bpf_dynptr_size(siv) : 0;
src_len = __bpf_dynptr_size(src);
dst_len = __bpf_dynptr_size(dst);
if (!src_len || !dst_len)
@@ -303,36 +303,44 @@ static int bpf_crypto_crypt(const struct bpf_crypto_ctx *ctx,
/**
* bpf_crypto_decrypt() - Decrypt buffer using configured context and IV provided.
- * @ctx: The crypto context being used. The ctx must be a trusted pointer.
- * @src: bpf_dynptr to the encrypted data. Must be a trusted pointer.
- * @dst: bpf_dynptr to the buffer where to store the result. Must be a trusted pointer.
- * @siv: bpf_dynptr to IV data and state data to be used by decryptor.
+ * @ctx: The crypto context being used. The ctx must be a trusted pointer.
+ * @src: bpf_dynptr to the encrypted data. Must be a trusted pointer.
+ * @dst: bpf_dynptr to the buffer where to store the result. Must be a trusted pointer.
+ * @siv__nullable: bpf_dynptr to IV data and state data to be used by decryptor. May be NULL.
*
* Decrypts provided buffer using IV data and the crypto context. Crypto context must be configured.
*/
__bpf_kfunc int bpf_crypto_decrypt(struct bpf_crypto_ctx *ctx,
- const struct bpf_dynptr_kern *src,
- const struct bpf_dynptr_kern *dst,
- const struct bpf_dynptr_kern *siv)
+ const struct bpf_dynptr *src,
+ const struct bpf_dynptr *dst,
+ const struct bpf_dynptr *siv__nullable)
{
- return bpf_crypto_crypt(ctx, src, dst, siv, true);
+ const struct bpf_dynptr_kern *src_kern = (struct bpf_dynptr_kern *)src;
+ const struct bpf_dynptr_kern *dst_kern = (struct bpf_dynptr_kern *)dst;
+ const struct bpf_dynptr_kern *siv_kern = (struct bpf_dynptr_kern *)siv__nullable;
+
+ return bpf_crypto_crypt(ctx, src_kern, dst_kern, siv_kern, true);
}
/**
* bpf_crypto_encrypt() - Encrypt buffer using configured context and IV provided.
- * @ctx: The crypto context being used. The ctx must be a trusted pointer.
- * @src: bpf_dynptr to the plain data. Must be a trusted pointer.
- * @dst: bpf_dynptr to buffer where to store the result. Must be a trusted pointer.
- * @siv: bpf_dynptr to IV data and state data to be used by decryptor.
+ * @ctx: The crypto context being used. The ctx must be a trusted pointer.
+ * @src: bpf_dynptr to the plain data. Must be a trusted pointer.
+ * @dst: bpf_dynptr to the buffer where to store the result. Must be a trusted pointer.
+ * @siv__nullable: bpf_dynptr to IV data and state data to be used by encryptor. May be NULL.
*
* Encrypts provided buffer using IV data and the crypto context. Crypto context must be configured.
*/
__bpf_kfunc int bpf_crypto_encrypt(struct bpf_crypto_ctx *ctx,
- const struct bpf_dynptr_kern *src,
- const struct bpf_dynptr_kern *dst,
- const struct bpf_dynptr_kern *siv)
+ const struct bpf_dynptr *src,
+ const struct bpf_dynptr *dst,
+ const struct bpf_dynptr *siv__nullable)
{
- return bpf_crypto_crypt(ctx, src, dst, siv, false);
+ const struct bpf_dynptr_kern *src_kern = (struct bpf_dynptr_kern *)src;
+ const struct bpf_dynptr_kern *dst_kern = (struct bpf_dynptr_kern *)dst;
+ const struct bpf_dynptr_kern *siv_kern = (struct bpf_dynptr_kern *)siv__nullable;
+
+ return bpf_crypto_crypt(ctx, src_kern, dst_kern, siv_kern, false);
}
__bpf_kfunc_end_defs();
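The __nullable suffix on the siv parameter above lets BPF callers omit the IV entirely instead of passing a zero-sized dynptr. A hedged BPF-side fragment showing such a call, assuming vmlinux.h from a kernel with these kfuncs and an already-created bpf_crypto_ctx (context setup is omitted):

  #include "vmlinux.h"
  #include <bpf/bpf_helpers.h>

  char LICENSE[] SEC("license") = "GPL";

  extern int bpf_crypto_encrypt(struct bpf_crypto_ctx *ctx,
                                const struct bpf_dynptr *src,
                                const struct bpf_dynptr *dst,
                                const struct bpf_dynptr *siv__nullable) __ksym;

  /* Encrypt with a cipher that takes no IV (e.g. ECB): pass NULL as siv. */
  static __always_inline int encrypt_no_iv(struct bpf_crypto_ctx *ctx,
                                           const struct bpf_dynptr *src,
                                           const struct bpf_dynptr *dst)
  {
          return bpf_crypto_encrypt(ctx, src, dst, NULL);
  }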
diff --git a/kernel/bpf/devmap.c b/kernel/bpf/devmap.c
index 7f3b34452243..9e0e3b0a18e4 100644
--- a/kernel/bpf/devmap.c
+++ b/kernel/bpf/devmap.c
@@ -83,7 +83,6 @@ struct bpf_dtab {
u32 n_buckets;
};
-static DEFINE_PER_CPU(struct list_head, dev_flush_list);
static DEFINE_SPINLOCK(dev_map_lock);
static LIST_HEAD(dev_map_list);
@@ -107,7 +106,7 @@ static inline struct hlist_head *dev_map_index_hash(struct bpf_dtab *dtab,
return &dtab->dev_index_head[idx & (dtab->n_buckets - 1)];
}
-static int dev_map_init_map(struct bpf_dtab *dtab, union bpf_attr *attr)
+static int dev_map_alloc_check(union bpf_attr *attr)
{
u32 valsize = attr->value_size;
@@ -121,23 +120,28 @@ static int dev_map_init_map(struct bpf_dtab *dtab, union bpf_attr *attr)
attr->map_flags & ~DEV_CREATE_FLAG_MASK)
return -EINVAL;
+ if (attr->map_type == BPF_MAP_TYPE_DEVMAP_HASH) {
+ /* Hash table size must be power of 2; roundup_pow_of_two()
+ * can overflow into UB on 32-bit arches
+ */
+ if (attr->max_entries > 1UL << 31)
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
+static int dev_map_init_map(struct bpf_dtab *dtab, union bpf_attr *attr)
+{
/* Lookup returns a pointer straight to dev->ifindex, so make sure the
* verifier prevents writes from the BPF side
*/
attr->map_flags |= BPF_F_RDONLY_PROG;
-
-
bpf_map_init_from_attr(&dtab->map, attr);
if (attr->map_type == BPF_MAP_TYPE_DEVMAP_HASH) {
- /* hash table size must be power of 2; roundup_pow_of_two() can
- * overflow into UB on 32-bit arches, so check that first
- */
- if (dtab->map.max_entries > 1UL << 31)
- return -EINVAL;
-
+ /* Hash table size must be power of 2 */
dtab->n_buckets = roundup_pow_of_two(dtab->map.max_entries);
-
dtab->dev_index_head = dev_map_create_hash(dtab->n_buckets,
dtab->map.numa_node);
if (!dtab->dev_index_head)
@@ -196,7 +200,14 @@ static void dev_map_free(struct bpf_map *map)
list_del_rcu(&dtab->list);
spin_unlock(&dev_map_lock);
- bpf_clear_redirect_map(map);
+ /* bpf_redirect_info->map is assigned in __bpf_xdp_redirect_map()
+ * during NAPI callback and cleared after the XDP redirect. There is no
+ * explicit RCU read section which protects bpf_redirect_info->map but
+ * local_bh_disable() also marks the beginning of an RCU section. This
+ * makes the complete softirq callback RCU protected. Thus after the
+ * following synchronize_rcu() there is no bpf_redirect_info->map == map
+ * assignment.
+ */
synchronize_rcu();
/* Make sure prior __dev_map_entry_free() have completed. */
@@ -406,9 +417,8 @@ out:
* driver before returning from its napi->poll() routine. See the comment above
* xdp_do_flush() in filter.c.
*/
-void __dev_flush(void)
+void __dev_flush(struct list_head *flush_list)
{
- struct list_head *flush_list = this_cpu_ptr(&dev_flush_list);
struct xdp_dev_bulk_queue *bq, *tmp;
list_for_each_entry_safe(bq, tmp, flush_list, flush_node) {
@@ -419,16 +429,6 @@ void __dev_flush(void)
}
}
-#ifdef CONFIG_DEBUG_NET
-bool dev_check_flush(void)
-{
- if (list_empty(this_cpu_ptr(&dev_flush_list)))
- return false;
- __dev_flush();
- return true;
-}
-#endif
-
/* Elements are kept alive by RCU; either by rcu_read_lock() (from syscall) or
* by local_bh_disable() (from XDP calls inside NAPI). The
* rcu_read_lock_bh_held() below makes lockdep accept both.
@@ -453,7 +453,6 @@ static void *__dev_map_lookup_elem(struct bpf_map *map, u32 key)
static void bq_enqueue(struct net_device *dev, struct xdp_frame *xdpf,
struct net_device *dev_rx, struct bpf_prog *xdp_prog)
{
- struct list_head *flush_list = this_cpu_ptr(&dev_flush_list);
struct xdp_dev_bulk_queue *bq = this_cpu_ptr(dev->xdp_bulkq);
if (unlikely(bq->count == DEV_MAP_BULK_SIZE))
@@ -467,6 +466,8 @@ static void bq_enqueue(struct net_device *dev, struct xdp_frame *xdpf,
* are only ever modified together.
*/
if (!bq->dev_rx) {
+ struct list_head *flush_list = bpf_net_ctx_get_dev_flush_list();
+
bq->dev_rx = dev_rx;
bq->xdp_prog = xdp_prog;
list_add(&bq->flush_node, flush_list);
@@ -1040,6 +1041,7 @@ static u64 dev_map_mem_usage(const struct bpf_map *map)
BTF_ID_LIST_SINGLE(dev_map_btf_ids, struct, bpf_dtab)
const struct bpf_map_ops dev_map_ops = {
.map_meta_equal = bpf_map_meta_equal,
+ .map_alloc_check = dev_map_alloc_check,
.map_alloc = dev_map_alloc,
.map_free = dev_map_free,
.map_get_next_key = dev_map_get_next_key,
@@ -1054,6 +1056,7 @@ const struct bpf_map_ops dev_map_ops = {
const struct bpf_map_ops dev_map_hash_ops = {
.map_meta_equal = bpf_map_meta_equal,
+ .map_alloc_check = dev_map_alloc_check,
.map_alloc = dev_map_alloc,
.map_free = dev_map_free,
.map_get_next_key = dev_map_hash_get_next_key,
@@ -1153,15 +1156,11 @@ static struct notifier_block dev_map_notifier = {
static int __init dev_map_init(void)
{
- int cpu;
-
/* Assure tracepoint shadow struct _bpf_dtab_netdev is in sync */
BUILD_BUG_ON(offsetof(struct bpf_dtab_netdev, dev) !=
offsetof(struct _bpf_dtab_netdev, dev));
register_netdevice_notifier(&dev_map_notifier);
- for_each_possible_cpu(cpu)
- INIT_LIST_HEAD(&per_cpu(dev_flush_list, cpu));
return 0;
}
diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c
index 2a69a9a36c0f..b5f0adae8293 100644
--- a/kernel/bpf/helpers.c
+++ b/kernel/bpf/helpers.c
@@ -1084,7 +1084,10 @@ struct bpf_async_cb {
struct bpf_prog *prog;
void __rcu *callback_fn;
void *value;
- struct rcu_head rcu;
+ union {
+ struct rcu_head rcu;
+ struct work_struct delete_work;
+ };
u64 flags;
};
@@ -1107,6 +1110,7 @@ struct bpf_async_cb {
struct bpf_hrtimer {
struct bpf_async_cb cb;
struct hrtimer timer;
+ atomic_t cancelling;
};
struct bpf_work {
@@ -1219,6 +1223,21 @@ static void bpf_wq_delete_work(struct work_struct *work)
kfree_rcu(w, cb.rcu);
}
+static void bpf_timer_delete_work(struct work_struct *work)
+{
+ struct bpf_hrtimer *t = container_of(work, struct bpf_hrtimer, cb.delete_work);
+
+ /* Cancel the timer and wait for callback to complete if it was running.
+ * If hrtimer_cancel() can be safely called it's safe to call
+ * kfree_rcu(t) right after for both preallocated and non-preallocated
+ * maps. The async->cb = NULL was already done and no code path can see
+ * address 't' anymore. A timer, if armed for an existing bpf_hrtimer
+ * before bpf_timer_cancel_and_free(), will have been cancelled.
+ */
+ hrtimer_cancel(&t->timer);
+ kfree_rcu(t, cb.rcu);
+}
+
static int __bpf_async_init(struct bpf_async_kern *async, struct bpf_map *map, u64 flags,
enum bpf_async_type type)
{
@@ -1262,6 +1281,8 @@ static int __bpf_async_init(struct bpf_async_kern *async, struct bpf_map *map, u
clockid = flags & (MAX_CLOCKS - 1);
t = (struct bpf_hrtimer *)cb;
+ atomic_set(&t->cancelling, 0);
+ INIT_WORK(&t->cb.delete_work, bpf_timer_delete_work);
hrtimer_init(&t->timer, clockid, HRTIMER_MODE_REL_SOFT);
t->timer.function = bpf_timer_cb;
cb->value = (void *)async - map->record->timer_off;
@@ -1440,7 +1461,8 @@ static void drop_prog_refcnt(struct bpf_async_cb *async)
BPF_CALL_1(bpf_timer_cancel, struct bpf_async_kern *, timer)
{
- struct bpf_hrtimer *t;
+ struct bpf_hrtimer *t, *cur_t;
+ bool inc = false;
int ret = 0;
if (in_nmi())
@@ -1452,14 +1474,41 @@ BPF_CALL_1(bpf_timer_cancel, struct bpf_async_kern *, timer)
ret = -EINVAL;
goto out;
}
- if (this_cpu_read(hrtimer_running) == t) {
+
+ cur_t = this_cpu_read(hrtimer_running);
+ if (cur_t == t) {
/* If bpf callback_fn is trying to bpf_timer_cancel()
* its own timer the hrtimer_cancel() will deadlock
- * since it waits for callback_fn to finish
+ * since it waits for callback_fn to finish.
+ */
+ ret = -EDEADLK;
+ goto out;
+ }
+
+ /* Only account in-flight cancellations when invoked from a timer
+ * callback, since we want to avoid waiting only if other _callbacks_
+ * are waiting on us, to avoid introducing lockups. Non-callback paths
+ * are ok, since nobody would synchronously wait for their completion.
+ */
+ if (!cur_t)
+ goto drop;
+ atomic_inc(&t->cancelling);
+ /* Need full barrier after relaxed atomic_inc */
+ smp_mb__after_atomic();
+ inc = true;
+ if (atomic_read(&cur_t->cancelling)) {
+ /* We're cancelling timer t, while some other timer callback is
+ * attempting to cancel us. In such a case, it might be possible
+ * that timer t belongs to the other callback, or some other
+ * callback waiting upon it (creating transitive dependencies
+ * upon us), and we will enter a deadlock if we continue
+ * cancelling and waiting for it synchronously, since it might
+ * do the same. Bail!
*/
ret = -EDEADLK;
goto out;
}
+drop:
drop_prog_refcnt(&t->cb);
out:
__bpf_spin_unlock_irqrestore(&timer->lock);
@@ -1467,6 +1516,8 @@ out:
* if it was running.
*/
ret = ret ?: hrtimer_cancel(&t->timer);
+ if (inc)
+ atomic_dec(&t->cancelling);
rcu_read_unlock();
return ret;
}
@@ -1512,25 +1563,39 @@ void bpf_timer_cancel_and_free(void *val)
if (!t)
return;
- /* Cancel the timer and wait for callback to complete if it was running.
- * If hrtimer_cancel() can be safely called it's safe to call kfree(t)
- * right after for both preallocated and non-preallocated maps.
- * The async->cb = NULL was already done and no code path can
- * see address 't' anymore.
- *
- * Check that bpf_map_delete/update_elem() wasn't called from timer
- * callback_fn. In such case don't call hrtimer_cancel() (since it will
- * deadlock) and don't call hrtimer_try_to_cancel() (since it will just
- * return -1). Though callback_fn is still running on this cpu it's
+ /* We check that bpf_map_delete/update_elem() was called from timer
+ * callback_fn. In such case we don't call hrtimer_cancel() (since it
+ * will deadlock) and don't call hrtimer_try_to_cancel() (since it will
+ * just return -1). Though callback_fn is still running on this cpu it's
* safe to do kfree(t) because bpf_timer_cb() read everything it needed
* from 't'. The bpf subprog callback_fn won't be able to access 't',
* since async->cb = NULL was already done. The timer will be
* effectively cancelled because bpf_timer_cb() will return
* HRTIMER_NORESTART.
+ *
+ * However, it is possible the timer callback_fn calling us armed the
+ * timer _before_ calling us, such that failing to cancel it here will
+ * cause it to possibly use struct hrtimer after freeing bpf_hrtimer.
+ * Therefore, we _need_ to cancel any outstanding timers before we do
+ * kfree_rcu, even though no more timers can be armed.
+ *
+ * Moreover, we need to schedule work even if timer does not belong to
+ * the calling callback_fn, as on two different CPUs, we can end up in a
+ * situation where both sides run in parallel, try to cancel one
+ * another, and we end up waiting on both sides in hrtimer_cancel
+ * without making forward progress, since timer1 depends on timer2
+ * callback to finish, and vice versa.
+ *
+ * CPU 1 (timer1_cb) CPU 2 (timer2_cb)
+ * bpf_timer_cancel_and_free(timer2) bpf_timer_cancel_and_free(timer1)
+ *
+ * To avoid these issues, punt to workqueue context when we are in a
+ * timer callback.
*/
- if (this_cpu_read(hrtimer_running) != t)
- hrtimer_cancel(&t->timer);
- kfree_rcu(t, cb.rcu);
+ if (this_cpu_read(hrtimer_running))
+ queue_work(system_unbound_wq, &t->cb.delete_work);
+ else
+ bpf_timer_delete_work(&t->cb.delete_work);
}
/* This function is called by map_delete/update_elem for individual element and
@@ -2433,7 +2498,7 @@ __bpf_kfunc struct task_struct *bpf_task_from_pid(s32 pid)
/**
* bpf_dynptr_slice() - Obtain a read-only pointer to the dynptr data.
- * @ptr: The dynptr whose data slice to retrieve
+ * @p: The dynptr whose data slice to retrieve
* @offset: Offset into the dynptr
* @buffer__opt: User-provided buffer to copy contents into. May be NULL
* @buffer__szk: Size (in bytes) of the buffer if present. This is the
@@ -2459,9 +2524,10 @@ __bpf_kfunc struct task_struct *bpf_task_from_pid(s32 pid)
* provided buffer, with its contents containing the data, if unable to obtain
* direct pointer)
*/
-__bpf_kfunc void *bpf_dynptr_slice(const struct bpf_dynptr_kern *ptr, u32 offset,
+__bpf_kfunc void *bpf_dynptr_slice(const struct bpf_dynptr *p, u32 offset,
void *buffer__opt, u32 buffer__szk)
{
+ const struct bpf_dynptr_kern *ptr = (struct bpf_dynptr_kern *)p;
enum bpf_dynptr_type type;
u32 len = buffer__szk;
int err;
@@ -2503,7 +2569,7 @@ __bpf_kfunc void *bpf_dynptr_slice(const struct bpf_dynptr_kern *ptr, u32 offset
/**
* bpf_dynptr_slice_rdwr() - Obtain a writable pointer to the dynptr data.
- * @ptr: The dynptr whose data slice to retrieve
+ * @p: The dynptr whose data slice to retrieve
* @offset: Offset into the dynptr
* @buffer__opt: User-provided buffer to copy contents into. May be NULL
* @buffer__szk: Size (in bytes) of the buffer if present. This is the
@@ -2543,9 +2609,11 @@ __bpf_kfunc void *bpf_dynptr_slice(const struct bpf_dynptr_kern *ptr, u32 offset
* provided buffer, with its contents containing the data, if unable to obtain
* direct pointer)
*/
-__bpf_kfunc void *bpf_dynptr_slice_rdwr(const struct bpf_dynptr_kern *ptr, u32 offset,
+__bpf_kfunc void *bpf_dynptr_slice_rdwr(const struct bpf_dynptr *p, u32 offset,
void *buffer__opt, u32 buffer__szk)
{
+ const struct bpf_dynptr_kern *ptr = (struct bpf_dynptr_kern *)p;
+
if (!ptr->data || __bpf_dynptr_is_rdonly(ptr))
return NULL;
@@ -2571,11 +2639,12 @@ __bpf_kfunc void *bpf_dynptr_slice_rdwr(const struct bpf_dynptr_kern *ptr, u32 o
* will be copied out into the buffer and the user will need to call
* bpf_dynptr_write() to commit changes.
*/
- return bpf_dynptr_slice(ptr, offset, buffer__opt, buffer__szk);
+ return bpf_dynptr_slice(p, offset, buffer__opt, buffer__szk);
}
-__bpf_kfunc int bpf_dynptr_adjust(struct bpf_dynptr_kern *ptr, u32 start, u32 end)
+__bpf_kfunc int bpf_dynptr_adjust(const struct bpf_dynptr *p, u32 start, u32 end)
{
+ struct bpf_dynptr_kern *ptr = (struct bpf_dynptr_kern *)p;
u32 size;
if (!ptr->data || start > end)
@@ -2592,36 +2661,45 @@ __bpf_kfunc int bpf_dynptr_adjust(struct bpf_dynptr_kern *ptr, u32 start, u32 en
return 0;
}
-__bpf_kfunc bool bpf_dynptr_is_null(struct bpf_dynptr_kern *ptr)
+__bpf_kfunc bool bpf_dynptr_is_null(const struct bpf_dynptr *p)
{
+ struct bpf_dynptr_kern *ptr = (struct bpf_dynptr_kern *)p;
+
return !ptr->data;
}
-__bpf_kfunc bool bpf_dynptr_is_rdonly(struct bpf_dynptr_kern *ptr)
+__bpf_kfunc bool bpf_dynptr_is_rdonly(const struct bpf_dynptr *p)
{
+ struct bpf_dynptr_kern *ptr = (struct bpf_dynptr_kern *)p;
+
if (!ptr->data)
return false;
return __bpf_dynptr_is_rdonly(ptr);
}
-__bpf_kfunc __u32 bpf_dynptr_size(const struct bpf_dynptr_kern *ptr)
+__bpf_kfunc __u32 bpf_dynptr_size(const struct bpf_dynptr *p)
{
+ struct bpf_dynptr_kern *ptr = (struct bpf_dynptr_kern *)p;
+
if (!ptr->data)
return -EINVAL;
return __bpf_dynptr_size(ptr);
}
-__bpf_kfunc int bpf_dynptr_clone(struct bpf_dynptr_kern *ptr,
- struct bpf_dynptr_kern *clone__uninit)
+__bpf_kfunc int bpf_dynptr_clone(const struct bpf_dynptr *p,
+ struct bpf_dynptr *clone__uninit)
{
+ struct bpf_dynptr_kern *clone = (struct bpf_dynptr_kern *)clone__uninit;
+ struct bpf_dynptr_kern *ptr = (struct bpf_dynptr_kern *)p;
+
if (!ptr->data) {
- bpf_dynptr_set_null(clone__uninit);
+ bpf_dynptr_set_null(clone);
return -EINVAL;
}
- *clone__uninit = *ptr;
+ *clone = *ptr;
return 0;
}
@@ -2721,7 +2799,7 @@ __bpf_kfunc int bpf_wq_start(struct bpf_wq *wq, unsigned int flags)
}
__bpf_kfunc int bpf_wq_set_callback_impl(struct bpf_wq *wq,
- int (callback_fn)(void *map, int *key, struct bpf_wq *wq),
+ int (callback_fn)(void *map, int *key, void *value),
unsigned int flags,
void *aux__ign)
{
@@ -2744,6 +2822,122 @@ __bpf_kfunc void bpf_preempt_enable(void)
preempt_enable();
}
+struct bpf_iter_bits {
+ __u64 __opaque[2];
+} __aligned(8);
+
+struct bpf_iter_bits_kern {
+ union {
+ unsigned long *bits;
+ unsigned long bits_copy;
+ };
+ u32 nr_bits;
+ int bit;
+} __aligned(8);
+
+/**
+ * bpf_iter_bits_new() - Initialize a new bits iterator for a given memory area
+ * @it: The new bpf_iter_bits to be created
+ * @unsafe_ptr__ign: A pointer pointing to a memory area to be iterated over
+ * @nr_words: The size of the specified memory area, measured in 8-byte units.
+ * Due to the bpf memory allocator's size limit, it can't be greater than 512.
+ *
+ * This function initializes a new bpf_iter_bits structure for iterating over
+ * a memory area which is specified by the @unsafe_ptr__ign and @nr_words. It
+ * copies the data of the memory area to the newly created bpf_iter_bits @it for
+ * subsequent iteration operations.
+ *
+ * On success, 0 is returned. On failure, a negative error code is returned.
+ */
+__bpf_kfunc int
+bpf_iter_bits_new(struct bpf_iter_bits *it, const u64 *unsafe_ptr__ign, u32 nr_words)
+{
+ struct bpf_iter_bits_kern *kit = (void *)it;
+ u32 nr_bytes = nr_words * sizeof(u64);
+ u32 nr_bits = BYTES_TO_BITS(nr_bytes);
+ int err;
+
+ BUILD_BUG_ON(sizeof(struct bpf_iter_bits_kern) != sizeof(struct bpf_iter_bits));
+ BUILD_BUG_ON(__alignof__(struct bpf_iter_bits_kern) !=
+ __alignof__(struct bpf_iter_bits));
+
+ kit->nr_bits = 0;
+ kit->bits_copy = 0;
+ kit->bit = -1;
+
+ if (!unsafe_ptr__ign || !nr_words)
+ return -EINVAL;
+
+ /* Optimization for u64 mask */
+ if (nr_bits == 64) {
+ err = bpf_probe_read_kernel_common(&kit->bits_copy, nr_bytes, unsafe_ptr__ign);
+ if (err)
+ return -EFAULT;
+
+ kit->nr_bits = nr_bits;
+ return 0;
+ }
+
+ /* Fallback to memalloc */
+ kit->bits = bpf_mem_alloc(&bpf_global_ma, nr_bytes);
+ if (!kit->bits)
+ return -ENOMEM;
+
+ err = bpf_probe_read_kernel_common(kit->bits, nr_bytes, unsafe_ptr__ign);
+ if (err) {
+ bpf_mem_free(&bpf_global_ma, kit->bits);
+ return err;
+ }
+
+ kit->nr_bits = nr_bits;
+ return 0;
+}
+
+/**
+ * bpf_iter_bits_next() - Get the next bit in a bpf_iter_bits
+ * @it: The bpf_iter_bits to be checked
+ *
+ * This function returns a pointer to a number representing the index of the
+ * next set bit in the bit mask.
+ *
+ * If there are no further bits available, it returns NULL.
+ */
+__bpf_kfunc int *bpf_iter_bits_next(struct bpf_iter_bits *it)
+{
+ struct bpf_iter_bits_kern *kit = (void *)it;
+ u32 nr_bits = kit->nr_bits;
+ const unsigned long *bits;
+ int bit;
+
+ if (nr_bits == 0)
+ return NULL;
+
+ bits = nr_bits == 64 ? &kit->bits_copy : kit->bits;
+ bit = find_next_bit(bits, nr_bits, kit->bit + 1);
+ if (bit >= nr_bits) {
+ kit->nr_bits = 0;
+ return NULL;
+ }
+
+ kit->bit = bit;
+ return &kit->bit;
+}
+
+/**
+ * bpf_iter_bits_destroy() - Destroy a bpf_iter_bits
+ * @it: The bpf_iter_bits to be destroyed
+ *
+ * Destroy the resource associated with the bpf_iter_bits.
+ */
+__bpf_kfunc void bpf_iter_bits_destroy(struct bpf_iter_bits *it)
+{
+ struct bpf_iter_bits_kern *kit = (void *)it;
+
+ if (kit->nr_bits <= 64)
+ return;
+ bpf_mem_free(&bpf_global_ma, kit->bits);
+}
+
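
From the BPF side, the three kfuncs above plug into the usual open-coded iterator pattern. A minimal sketch, assuming a libbpf-style build where vmlinux.h supplies struct bpf_iter_bits and __ksym comes from bpf_helpers.h; count_bits() and its mask argument are hypothetical. Note that a failed _new() leaves nr_bits at 0, so _next() simply returns NULL and no separate error check is needed:

extern int bpf_iter_bits_new(struct bpf_iter_bits *it,
			     const __u64 *unsafe_ptr__ign, __u32 nr_words) __ksym;
extern int *bpf_iter_bits_next(struct bpf_iter_bits *it) __ksym;
extern void bpf_iter_bits_destroy(struct bpf_iter_bits *it) __ksym;

static int count_bits(const __u64 *mask, __u32 nr_words)
{
	struct bpf_iter_bits it;
	int *bit, cnt = 0;

	bpf_iter_bits_new(&it, mask, nr_words);	/* on failure, _next() returns NULL */
	while ((bit = bpf_iter_bits_next(&it)))
		cnt++;				/* *bit is the index of the next set bit */
	bpf_iter_bits_destroy(&it);
	return cnt;
}
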
__bpf_kfunc_end_defs();
BTF_KFUNCS_START(generic_btf_ids)
@@ -2826,6 +3020,9 @@ BTF_ID_FLAGS(func, bpf_wq_set_callback_impl)
BTF_ID_FLAGS(func, bpf_wq_start)
BTF_ID_FLAGS(func, bpf_preempt_disable)
BTF_ID_FLAGS(func, bpf_preempt_enable)
+BTF_ID_FLAGS(func, bpf_iter_bits_new, KF_ITER_NEW)
+BTF_ID_FLAGS(func, bpf_iter_bits_next, KF_ITER_NEXT | KF_RET_NULL)
+BTF_ID_FLAGS(func, bpf_iter_bits_destroy, KF_ITER_DESTROY)
BTF_KFUNCS_END(common_btf_ids)
static const struct btf_kfunc_id_set common_kfunc_set = {
@@ -2867,7 +3064,9 @@ late_initcall(kfunc_init);
*/
const void *__bpf_dynptr_data(const struct bpf_dynptr_kern *ptr, u32 len)
{
- return bpf_dynptr_slice(ptr, 0, NULL, len);
+ const struct bpf_dynptr *p = (struct bpf_dynptr *)ptr;
+
+ return bpf_dynptr_slice(p, 0, NULL, len);
}
/* Get a pointer to dynptr data up to len bytes for read write access. If
diff --git a/kernel/bpf/log.c b/kernel/bpf/log.c
index 4bd8f17a9f24..5aebfc3051e3 100644
--- a/kernel/bpf/log.c
+++ b/kernel/bpf/log.c
@@ -91,7 +91,7 @@ void bpf_verifier_vlog(struct bpf_verifier_log *log, const char *fmt,
goto fail;
} else {
u64 new_end, new_start;
- u32 buf_start, buf_end, new_n;
+ u32 buf_start, buf_end;
new_end = log->end_pos + n;
if (new_end - log->start_pos >= log->len_total)
@@ -708,7 +708,9 @@ static void print_reg_state(struct bpf_verifier_env *env,
verbose(env, "%s", btf_type_name(reg->btf, reg->btf_id));
verbose(env, "(");
if (reg->id)
- verbose_a("id=%d", reg->id);
+ verbose_a("id=%d", reg->id & ~BPF_ADD_CONST);
+ if (reg->id & BPF_ADD_CONST)
+ verbose(env, "%+d", reg->off);
if (reg->ref_obj_id)
verbose_a("ref_obj_id=%d", reg->ref_obj_id);
if (type_is_non_owning_ref(reg->type))
diff --git a/kernel/bpf/memalloc.c b/kernel/bpf/memalloc.c
index a546aba46d5d..dec892ded031 100644
--- a/kernel/bpf/memalloc.c
+++ b/kernel/bpf/memalloc.c
@@ -155,12 +155,9 @@ static void *__alloc(struct bpf_mem_cache *c, int node, gfp_t flags)
static struct mem_cgroup *get_memcg(const struct bpf_mem_cache *c)
{
-#ifdef CONFIG_MEMCG_KMEM
+#ifdef CONFIG_MEMCG
if (c->objcg)
return get_mem_cgroup_from_objcg(c->objcg);
-#endif
-
-#ifdef CONFIG_MEMCG
return root_mem_cgroup;
#else
return NULL;
@@ -534,7 +531,7 @@ int bpf_mem_alloc_init(struct bpf_mem_alloc *ma, int size, bool percpu)
size += LLIST_NODE_SZ; /* room for llist_node */
unit_size = size;
-#ifdef CONFIG_MEMCG_KMEM
+#ifdef CONFIG_MEMCG
if (memcg_bpf_enabled())
objcg = get_obj_cgroup_from_current();
#endif
@@ -556,7 +553,7 @@ int bpf_mem_alloc_init(struct bpf_mem_alloc *ma, int size, bool percpu)
pcc = __alloc_percpu_gfp(sizeof(*cc), 8, GFP_KERNEL);
if (!pcc)
return -ENOMEM;
-#ifdef CONFIG_MEMCG_KMEM
+#ifdef CONFIG_MEMCG
objcg = get_obj_cgroup_from_current();
#endif
ma->objcg = objcg;
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index f45ed6adc092..bf6c5f685ea2 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -385,7 +385,7 @@ void bpf_map_free_id(struct bpf_map *map)
spin_unlock_irqrestore(&map_idr_lock, flags);
}
-#ifdef CONFIG_MEMCG_KMEM
+#ifdef CONFIG_MEMCG
static void bpf_map_save_memcg(struct bpf_map *map)
{
/* Currently if a map is created by a process belonging to the root
@@ -486,7 +486,7 @@ int bpf_map_alloc_pages(const struct bpf_map *map, gfp_t gfp, int nid,
unsigned long i, j;
struct page *pg;
int ret = 0;
-#ifdef CONFIG_MEMCG_KMEM
+#ifdef CONFIG_MEMCG
struct mem_cgroup *memcg, *old_memcg;
memcg = bpf_map_get_memcg(map);
@@ -505,7 +505,7 @@ int bpf_map_alloc_pages(const struct bpf_map *map, gfp_t gfp, int nid,
break;
}
-#ifdef CONFIG_MEMCG_KMEM
+#ifdef CONFIG_MEMCG
set_active_memcg(old_memcg);
mem_cgroup_put(memcg);
#endif
@@ -3151,6 +3151,13 @@ static void bpf_link_show_fdinfo(struct seq_file *m, struct file *filp)
}
#endif
+static __poll_t bpf_link_poll(struct file *file, struct poll_table_struct *pts)
+{
+ struct bpf_link *link = file->private_data;
+
+ return link->ops->poll(file, pts);
+}
+
static const struct file_operations bpf_link_fops = {
#ifdef CONFIG_PROC_FS
.show_fdinfo = bpf_link_show_fdinfo,
@@ -3160,6 +3167,16 @@ static const struct file_operations bpf_link_fops = {
.write = bpf_dummy_write,
};
+static const struct file_operations bpf_link_fops_poll = {
+#ifdef CONFIG_PROC_FS
+ .show_fdinfo = bpf_link_show_fdinfo,
+#endif
+ .release = bpf_link_release,
+ .read = bpf_dummy_read,
+ .write = bpf_dummy_write,
+ .poll = bpf_link_poll,
+};
+
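
A link implementation opts into the poll-capable fops simply by providing a ->poll callback in its struct bpf_link_ops; bpf_link_prime()/bpf_link_new_fd() below then pick bpf_link_fops_poll automatically. A hedged sketch only: my_link, its waitqueue, data_ready and the release/dealloc callbacks are hypothetical.

#include <linux/bpf.h>
#include <linux/poll.h>

struct my_link {				/* hypothetical link flavor */
	struct bpf_link link;
	wait_queue_head_t wq;
	bool data_ready;
};

static __poll_t my_link_poll(struct file *file, struct poll_table_struct *pts)
{
	struct my_link *ml = container_of(file->private_data,
					  struct my_link, link);

	poll_wait(file, &ml->wq, pts);		/* hypothetical waitqueue */
	return READ_ONCE(ml->data_ready) ? EPOLLIN | EPOLLRDNORM : 0;
}

static const struct bpf_link_ops my_link_ops = {
	.release	= my_link_release,	/* hypothetical */
	.dealloc	= my_link_dealloc,	/* hypothetical */
	.poll		= my_link_poll,		/* selects bpf_link_fops_poll */
};
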
static int bpf_link_alloc_id(struct bpf_link *link)
{
int id;
@@ -3202,7 +3219,9 @@ int bpf_link_prime(struct bpf_link *link, struct bpf_link_primer *primer)
return id;
}
- file = anon_inode_getfile("bpf_link", &bpf_link_fops, link, O_CLOEXEC);
+ file = anon_inode_getfile("bpf_link",
+ link->ops->poll ? &bpf_link_fops_poll : &bpf_link_fops,
+ link, O_CLOEXEC);
if (IS_ERR(file)) {
bpf_link_free_id(id);
put_unused_fd(fd);
@@ -3230,7 +3249,9 @@ int bpf_link_settle(struct bpf_link_primer *primer)
int bpf_link_new_fd(struct bpf_link *link)
{
- return anon_inode_getfd("bpf-link", &bpf_link_fops, link, O_CLOEXEC);
+ return anon_inode_getfd("bpf-link",
+ link->ops->poll ? &bpf_link_fops_poll : &bpf_link_fops,
+ link, O_CLOEXEC);
}
struct bpf_link *bpf_link_get_from_fd(u32 ufd)
@@ -3240,7 +3261,7 @@ struct bpf_link *bpf_link_get_from_fd(u32 ufd)
if (!f.file)
return ERR_PTR(-EBADF);
- if (f.file->f_op != &bpf_link_fops) {
+ if (f.file->f_op != &bpf_link_fops && f.file->f_op != &bpf_link_fops_poll) {
fdput(f);
return ERR_PTR(-EINVAL);
}
@@ -4972,7 +4993,7 @@ static int bpf_obj_get_info_by_fd(const union bpf_attr *attr,
uattr);
else if (f.file->f_op == &btf_fops)
err = bpf_btf_get_info_by_fd(f.file, f.file->private_data, attr, uattr);
- else if (f.file->f_op == &bpf_link_fops)
+ else if (f.file->f_op == &bpf_link_fops || f.file->f_op == &bpf_link_fops_poll)
err = bpf_link_get_info_by_fd(f.file, f.file->private_data,
attr, uattr);
else
@@ -5107,7 +5128,7 @@ static int bpf_task_fd_query(const union bpf_attr *attr,
if (!file)
return -EBADF;
- if (file->f_op == &bpf_link_fops) {
+ if (file->f_op == &bpf_link_fops || file->f_op == &bpf_link_fops_poll) {
struct bpf_link *link = file->private_data;
if (link->ops == &bpf_raw_tp_link_lops) {
@@ -5417,10 +5438,11 @@ static int link_detach(union bpf_attr *attr)
return ret;
}
-static struct bpf_link *bpf_link_inc_not_zero(struct bpf_link *link)
+struct bpf_link *bpf_link_inc_not_zero(struct bpf_link *link)
{
return atomic64_fetch_add_unless(&link->refcnt, 1, 0) ? link : ERR_PTR(-ENOENT);
}
+EXPORT_SYMBOL(bpf_link_inc_not_zero);
struct bpf_link *bpf_link_by_id(u32 id)
{
@@ -5961,7 +5983,7 @@ const struct bpf_prog_ops bpf_syscall_prog_ops = {
};
#ifdef CONFIG_SYSCTL
-static int bpf_stats_handler(struct ctl_table *table, int write,
+static int bpf_stats_handler(const struct ctl_table *table, int write,
void *buffer, size_t *lenp, loff_t *ppos)
{
struct static_key *key = (struct static_key *)table->data;
@@ -5996,7 +6018,7 @@ void __weak unpriv_ebpf_notify(int new_state)
{
}
-static int bpf_unpriv_handler(struct ctl_table *table, int write,
+static int bpf_unpriv_handler(const struct ctl_table *table, int write,
void *buffer, size_t *lenp, loff_t *ppos)
{
int ret, unpriv_enable = *(int *)table->data;
diff --git a/kernel/bpf/task_iter.c b/kernel/bpf/task_iter.c
index ec4e97c61eef..02aa9db8d796 100644
--- a/kernel/bpf/task_iter.c
+++ b/kernel/bpf/task_iter.c
@@ -261,6 +261,7 @@ task_file_seq_get_next(struct bpf_iter_seq_task_file_info *info)
u32 saved_tid = info->tid;
struct task_struct *curr_task;
unsigned int curr_fd = info->fd;
+ struct file *f;
/* If this function returns a non-NULL file object,
* it held a reference to the task/file.
@@ -286,12 +287,8 @@ again:
}
rcu_read_lock();
- for (;; curr_fd++) {
- struct file *f;
- f = task_lookup_next_fdget_rcu(curr_task, &curr_fd);
- if (!f)
- break;
-
+ f = task_lookup_next_fdget_rcu(curr_task, &curr_fd);
+ if (f) {
/* set info->fd */
info->fd = curr_fd;
rcu_read_unlock();
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 214a9fa8c6fb..4cb5441ad75f 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -2982,8 +2982,10 @@ static int check_subprogs(struct bpf_verifier_env *env)
if (code == (BPF_JMP | BPF_CALL) &&
insn[i].src_reg == 0 &&
- insn[i].imm == BPF_FUNC_tail_call)
+ insn[i].imm == BPF_FUNC_tail_call) {
subprog[cur_subprog].has_tail_call = true;
+ subprog[cur_subprog].tail_call_reachable = true;
+ }
if (BPF_CLASS(code) == BPF_LD &&
(BPF_MODE(code) == BPF_ABS || BPF_MODE(code) == BPF_IND))
subprog[cur_subprog].has_ld_abs = true;
@@ -3215,7 +3217,8 @@ static int insn_def_regno(const struct bpf_insn *insn)
case BPF_ST:
return -1;
case BPF_STX:
- if (BPF_MODE(insn->code) == BPF_ATOMIC &&
+ if ((BPF_MODE(insn->code) == BPF_ATOMIC ||
+ BPF_MODE(insn->code) == BPF_PROBE_ATOMIC) &&
(insn->imm & BPF_FETCH)) {
if (insn->imm == BPF_CMPXCHG)
return BPF_REG_0;
@@ -3991,7 +3994,7 @@ static bool idset_contains(struct bpf_idset *s, u32 id)
u32 i;
for (i = 0; i < s->count; ++i)
- if (s->ids[i] == id)
+ if (s->ids[i] == (id & ~BPF_ADD_CONST))
return true;
return false;
@@ -4001,7 +4004,7 @@ static int idset_push(struct bpf_idset *s, u32 id)
{
if (WARN_ON_ONCE(s->count >= ARRAY_SIZE(s->ids)))
return -EFAULT;
- s->ids[s->count++] = id;
+ s->ids[s->count++] = id & ~BPF_ADD_CONST;
return 0;
}
@@ -4438,8 +4441,20 @@ static bool __is_pointer_value(bool allow_ptr_leaks,
static void assign_scalar_id_before_mov(struct bpf_verifier_env *env,
struct bpf_reg_state *src_reg)
{
- if (src_reg->type == SCALAR_VALUE && !src_reg->id &&
- !tnum_is_const(src_reg->var_off))
+ if (src_reg->type != SCALAR_VALUE)
+ return;
+
+ if (src_reg->id & BPF_ADD_CONST) {
+ /*
+		 * The verifier is processing an rX = rY insn and
+		 * rY->id already carries a linked-register marker.
+		 * Clear it, since multiple rX += const are not supported.
+ */
+ src_reg->id = 0;
+ src_reg->off = 0;
+ }
+
+ if (!src_reg->id && !tnum_is_const(src_reg->var_off))
/* Ensure that src_reg has a valid ID that will be copied to
* dst_reg and then will be used by find_equal_scalars() to
* propagate min/max range.
@@ -5449,7 +5464,7 @@ static int check_map_access(struct bpf_verifier_env *env, u32 regno,
* this program. To check that [x1, x2) overlaps with [y1, y2),
* it is sufficient to check x1 < y2 && y1 < x2.
*/
- if (reg->smin_value + off < p + btf_field_type_size(field->type) &&
+ if (reg->smin_value + off < p + field->size &&
p < reg->umax_value + off + size) {
switch (field->type) {
case BPF_KPTR_UNREF:
@@ -7715,6 +7730,13 @@ static int process_dynptr_func(struct bpf_verifier_env *env, int regno, int insn
struct bpf_reg_state *regs = cur_regs(env), *reg = &regs[regno];
int err;
+ if (reg->type != PTR_TO_STACK && reg->type != CONST_PTR_TO_DYNPTR) {
+ verbose(env,
+ "arg#%d expected pointer to stack or const struct bpf_dynptr\n",
+ regno);
+ return -EINVAL;
+ }
+
/* MEM_UNINIT and MEM_RDONLY are exclusive, when applied to an
* ARG_PTR_TO_DYNPTR (or ARG_PTR_TO_DYNPTR | DYNPTR_TYPE_*):
*/
@@ -9464,6 +9486,10 @@ static int btf_check_func_arg_match(struct bpf_verifier_env *env, int subprog,
return -EINVAL;
}
} else if (arg->arg_type == (ARG_PTR_TO_DYNPTR | MEM_RDONLY)) {
+ ret = check_func_arg_reg_off(env, reg, regno, ARG_PTR_TO_DYNPTR);
+ if (ret)
+ return ret;
+
ret = process_dynptr_func(env, regno, -1, arg->arg_type, 0);
if (ret)
return ret;
@@ -10917,7 +10943,7 @@ enum {
};
BTF_ID_LIST(kf_arg_btf_ids)
-BTF_ID(struct, bpf_dynptr_kern)
+BTF_ID(struct, bpf_dynptr)
BTF_ID(struct, bpf_list_head)
BTF_ID(struct, bpf_list_node)
BTF_ID(struct, bpf_rb_root)
@@ -11190,6 +11216,9 @@ get_kfunc_ptr_arg_type(struct bpf_verifier_env *env,
if (btf_is_prog_ctx_type(&env->log, meta->btf, t, resolve_prog_type(env->prog), argno))
return KF_ARG_PTR_TO_CTX;
+ if (is_kfunc_arg_nullable(meta->btf, &args[argno]) && register_is_null(reg))
+ return KF_ARG_PTR_TO_NULL;
+
if (is_kfunc_arg_alloc_obj(meta->btf, &args[argno]))
return KF_ARG_PTR_TO_ALLOC_BTF_ID;
@@ -11235,9 +11264,6 @@ get_kfunc_ptr_arg_type(struct bpf_verifier_env *env,
if (is_kfunc_arg_callback(env, meta->btf, &args[argno]))
return KF_ARG_PTR_TO_CALLBACK;
- if (is_kfunc_arg_nullable(meta->btf, &args[argno]) && register_is_null(reg))
- return KF_ARG_PTR_TO_NULL;
-
if (argno + 1 < nargs &&
(is_kfunc_arg_mem_size(meta->btf, &args[argno + 1], &regs[regno + 1]) ||
is_kfunc_arg_const_mem_size(meta->btf, &args[argno + 1], &regs[regno + 1])))
@@ -11268,6 +11294,8 @@ static int process_kf_arg_ptr_to_btf_id(struct bpf_verifier_env *env,
bool strict_type_match = false;
const struct btf *reg_btf;
const char *reg_ref_tname;
+ bool taking_projection;
+ bool struct_same;
u32 reg_ref_id;
if (base_type(reg->type) == PTR_TO_BTF_ID) {
@@ -11307,11 +11335,19 @@ static int process_kf_arg_ptr_to_btf_id(struct bpf_verifier_env *env,
btf_type_ids_nocast_alias(&env->log, reg_btf, reg_ref_id, meta->btf, ref_id))
strict_type_match = true;
- WARN_ON_ONCE(is_kfunc_trusted_args(meta) && reg->off);
+ WARN_ON_ONCE(is_kfunc_release(meta) &&
+ (reg->off || !tnum_is_const(reg->var_off) ||
+ reg->var_off.value));
reg_ref_t = btf_type_skip_modifiers(reg_btf, reg_ref_id, &reg_ref_id);
reg_ref_tname = btf_name_by_offset(reg_btf, reg_ref_t->name_off);
- if (!btf_struct_ids_match(&env->log, reg_btf, reg_ref_id, reg->off, meta->btf, ref_id, strict_type_match)) {
+ struct_same = btf_struct_ids_match(&env->log, reg_btf, reg_ref_id, reg->off, meta->btf, ref_id, strict_type_match);
+	/* If the kfunc accepts a projection type (e.g. __sk_buff), it cannot
+	 * actually use it -- it must cast to the underlying type. So we allow
+	 * the caller to pass in the underlying type.
+ */
+ taking_projection = btf_is_projection_of(ref_tname, reg_ref_tname);
+ if (!taking_projection && !struct_same) {
verbose(env, "kernel function %s args#%d expected pointer to %s %s but R%d has a pointer to %s %s\n",
meta->func_name, argno, btf_type_str(ref_t), ref_tname, argno + 1,
btf_type_str(reg_ref_t), reg_ref_tname);
@@ -11651,7 +11687,7 @@ __process_kf_arg_ptr_to_graph_node(struct bpf_verifier_env *env,
node_off = reg->off + reg->var_off.value;
field = reg_find_field_offset(reg, node_off, node_field_type);
- if (!field || field->offset != node_off) {
+ if (!field) {
verbose(env, "%s not found at offset=%u\n", node_type_name, node_off);
return -EINVAL;
}
@@ -11883,12 +11919,8 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_
return -EINVAL;
}
}
-
fallthrough;
case KF_ARG_PTR_TO_CTX:
- /* Trusted arguments have the same offset checks as release arguments */
- arg_type |= OBJ_RELEASE;
- break;
case KF_ARG_PTR_TO_DYNPTR:
case KF_ARG_PTR_TO_ITER:
case KF_ARG_PTR_TO_LIST_HEAD:
@@ -11901,7 +11933,6 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_
case KF_ARG_PTR_TO_REFCOUNTED_KPTR:
case KF_ARG_PTR_TO_CONST_STR:
case KF_ARG_PTR_TO_WORKQUEUE:
- /* Trusted by default */
break;
default:
WARN_ON_ONCE(1);
@@ -11957,12 +11988,6 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_
enum bpf_arg_type dynptr_arg_type = ARG_PTR_TO_DYNPTR;
int clone_ref_obj_id = 0;
- if (reg->type != PTR_TO_STACK &&
- reg->type != CONST_PTR_TO_DYNPTR) {
- verbose(env, "arg#%d expected pointer to stack or dynptr_ptr\n", i);
- return -EINVAL;
- }
-
if (reg->type == CONST_PTR_TO_DYNPTR)
dynptr_arg_type |= MEM_RDONLY;
@@ -12701,56 +12726,6 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
return 0;
}
-static bool signed_add_overflows(s64 a, s64 b)
-{
- /* Do the add in u64, where overflow is well-defined */
- s64 res = (s64)((u64)a + (u64)b);
-
- if (b < 0)
- return res > a;
- return res < a;
-}
-
-static bool signed_add32_overflows(s32 a, s32 b)
-{
- /* Do the add in u32, where overflow is well-defined */
- s32 res = (s32)((u32)a + (u32)b);
-
- if (b < 0)
- return res > a;
- return res < a;
-}
-
-static bool signed_add16_overflows(s16 a, s16 b)
-{
- /* Do the add in u16, where overflow is well-defined */
- s16 res = (s16)((u16)a + (u16)b);
-
- if (b < 0)
- return res > a;
- return res < a;
-}
-
-static bool signed_sub_overflows(s64 a, s64 b)
-{
- /* Do the sub in u64, where overflow is well-defined */
- s64 res = (s64)((u64)a - (u64)b);
-
- if (b < 0)
- return res < a;
- return res > a;
-}
-
-static bool signed_sub32_overflows(s32 a, s32 b)
-{
- /* Do the sub in u32, where overflow is well-defined */
- s32 res = (s32)((u32)a - (u32)b);
-
- if (b < 0)
- return res < a;
- return res > a;
-}
-
static bool check_reg_sane_offset(struct bpf_verifier_env *env,
const struct bpf_reg_state *reg,
enum bpf_reg_type type)
@@ -13232,21 +13207,15 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env,
* added into the variable offset, and we copy the fixed offset
* from ptr_reg.
*/
- if (signed_add_overflows(smin_ptr, smin_val) ||
- signed_add_overflows(smax_ptr, smax_val)) {
+ if (check_add_overflow(smin_ptr, smin_val, &dst_reg->smin_value) ||
+ check_add_overflow(smax_ptr, smax_val, &dst_reg->smax_value)) {
dst_reg->smin_value = S64_MIN;
dst_reg->smax_value = S64_MAX;
- } else {
- dst_reg->smin_value = smin_ptr + smin_val;
- dst_reg->smax_value = smax_ptr + smax_val;
}
- if (umin_ptr + umin_val < umin_ptr ||
- umax_ptr + umax_val < umax_ptr) {
+ if (check_add_overflow(umin_ptr, umin_val, &dst_reg->umin_value) ||
+ check_add_overflow(umax_ptr, umax_val, &dst_reg->umax_value)) {
dst_reg->umin_value = 0;
dst_reg->umax_value = U64_MAX;
- } else {
- dst_reg->umin_value = umin_ptr + umin_val;
- dst_reg->umax_value = umax_ptr + umax_val;
}
dst_reg->var_off = tnum_add(ptr_reg->var_off, off_reg->var_off);
dst_reg->off = ptr_reg->off;
@@ -13289,14 +13258,11 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env,
/* A new variable offset is created. If the subtrahend is known
* nonnegative, then any reg->range we had before is still good.
*/
- if (signed_sub_overflows(smin_ptr, smax_val) ||
- signed_sub_overflows(smax_ptr, smin_val)) {
+ if (check_sub_overflow(smin_ptr, smax_val, &dst_reg->smin_value) ||
+ check_sub_overflow(smax_ptr, smin_val, &dst_reg->smax_value)) {
/* Overflow possible, we know nothing */
dst_reg->smin_value = S64_MIN;
dst_reg->smax_value = S64_MAX;
- } else {
- dst_reg->smin_value = smin_ptr - smax_val;
- dst_reg->smax_value = smax_ptr - smin_val;
}
if (umin_ptr < umax_val) {
/* Overflow possible, we know nothing */
@@ -13349,71 +13315,56 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env,
static void scalar32_min_max_add(struct bpf_reg_state *dst_reg,
struct bpf_reg_state *src_reg)
{
- s32 smin_val = src_reg->s32_min_value;
- s32 smax_val = src_reg->s32_max_value;
- u32 umin_val = src_reg->u32_min_value;
- u32 umax_val = src_reg->u32_max_value;
+ s32 *dst_smin = &dst_reg->s32_min_value;
+ s32 *dst_smax = &dst_reg->s32_max_value;
+ u32 *dst_umin = &dst_reg->u32_min_value;
+ u32 *dst_umax = &dst_reg->u32_max_value;
- if (signed_add32_overflows(dst_reg->s32_min_value, smin_val) ||
- signed_add32_overflows(dst_reg->s32_max_value, smax_val)) {
- dst_reg->s32_min_value = S32_MIN;
- dst_reg->s32_max_value = S32_MAX;
- } else {
- dst_reg->s32_min_value += smin_val;
- dst_reg->s32_max_value += smax_val;
+ if (check_add_overflow(*dst_smin, src_reg->s32_min_value, dst_smin) ||
+ check_add_overflow(*dst_smax, src_reg->s32_max_value, dst_smax)) {
+ *dst_smin = S32_MIN;
+ *dst_smax = S32_MAX;
}
- if (dst_reg->u32_min_value + umin_val < umin_val ||
- dst_reg->u32_max_value + umax_val < umax_val) {
- dst_reg->u32_min_value = 0;
- dst_reg->u32_max_value = U32_MAX;
- } else {
- dst_reg->u32_min_value += umin_val;
- dst_reg->u32_max_value += umax_val;
+ if (check_add_overflow(*dst_umin, src_reg->u32_min_value, dst_umin) ||
+ check_add_overflow(*dst_umax, src_reg->u32_max_value, dst_umax)) {
+ *dst_umin = 0;
+ *dst_umax = U32_MAX;
}
}
static void scalar_min_max_add(struct bpf_reg_state *dst_reg,
struct bpf_reg_state *src_reg)
{
- s64 smin_val = src_reg->smin_value;
- s64 smax_val = src_reg->smax_value;
- u64 umin_val = src_reg->umin_value;
- u64 umax_val = src_reg->umax_value;
+ s64 *dst_smin = &dst_reg->smin_value;
+ s64 *dst_smax = &dst_reg->smax_value;
+ u64 *dst_umin = &dst_reg->umin_value;
+ u64 *dst_umax = &dst_reg->umax_value;
- if (signed_add_overflows(dst_reg->smin_value, smin_val) ||
- signed_add_overflows(dst_reg->smax_value, smax_val)) {
- dst_reg->smin_value = S64_MIN;
- dst_reg->smax_value = S64_MAX;
- } else {
- dst_reg->smin_value += smin_val;
- dst_reg->smax_value += smax_val;
+ if (check_add_overflow(*dst_smin, src_reg->smin_value, dst_smin) ||
+ check_add_overflow(*dst_smax, src_reg->smax_value, dst_smax)) {
+ *dst_smin = S64_MIN;
+ *dst_smax = S64_MAX;
}
- if (dst_reg->umin_value + umin_val < umin_val ||
- dst_reg->umax_value + umax_val < umax_val) {
- dst_reg->umin_value = 0;
- dst_reg->umax_value = U64_MAX;
- } else {
- dst_reg->umin_value += umin_val;
- dst_reg->umax_value += umax_val;
+ if (check_add_overflow(*dst_umin, src_reg->umin_value, dst_umin) ||
+ check_add_overflow(*dst_umax, src_reg->umax_value, dst_umax)) {
+ *dst_umin = 0;
+ *dst_umax = U64_MAX;
}
}
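
The open-coded signed_add_overflows()/signed_sub_overflows() helpers removed above are replaced by check_add_overflow()/check_sub_overflow() from <linux/overflow.h>, which store the wrapped result through the destination pointer and return true on overflow, so only the overflow branch needs explicit clamping. A standalone sketch of the idiom; range_add() is illustrative, not kernel code:

#include <linux/limits.h>
#include <linux/overflow.h>
#include <linux/types.h>

static void range_add(s64 *min, s64 *max, s64 add_min, s64 add_max)
{
	/* The sums are written into *min/*max unconditionally; on overflow
	 * the range degrades to "unknown", mirroring scalar_min_max_add().
	 */
	if (check_add_overflow(*min, add_min, min) ||
	    check_add_overflow(*max, add_max, max)) {
		*min = S64_MIN;
		*max = S64_MAX;
	}
}
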
static void scalar32_min_max_sub(struct bpf_reg_state *dst_reg,
struct bpf_reg_state *src_reg)
{
- s32 smin_val = src_reg->s32_min_value;
- s32 smax_val = src_reg->s32_max_value;
+ s32 *dst_smin = &dst_reg->s32_min_value;
+ s32 *dst_smax = &dst_reg->s32_max_value;
u32 umin_val = src_reg->u32_min_value;
u32 umax_val = src_reg->u32_max_value;
- if (signed_sub32_overflows(dst_reg->s32_min_value, smax_val) ||
- signed_sub32_overflows(dst_reg->s32_max_value, smin_val)) {
+ if (check_sub_overflow(*dst_smin, src_reg->s32_max_value, dst_smin) ||
+ check_sub_overflow(*dst_smax, src_reg->s32_min_value, dst_smax)) {
/* Overflow possible, we know nothing */
- dst_reg->s32_min_value = S32_MIN;
- dst_reg->s32_max_value = S32_MAX;
- } else {
- dst_reg->s32_min_value -= smax_val;
- dst_reg->s32_max_value -= smin_val;
+ *dst_smin = S32_MIN;
+ *dst_smax = S32_MAX;
}
if (dst_reg->u32_min_value < umax_val) {
/* Overflow possible, we know nothing */
@@ -13429,19 +13380,16 @@ static void scalar32_min_max_sub(struct bpf_reg_state *dst_reg,
static void scalar_min_max_sub(struct bpf_reg_state *dst_reg,
struct bpf_reg_state *src_reg)
{
- s64 smin_val = src_reg->smin_value;
- s64 smax_val = src_reg->smax_value;
+ s64 *dst_smin = &dst_reg->smin_value;
+ s64 *dst_smax = &dst_reg->smax_value;
u64 umin_val = src_reg->umin_value;
u64 umax_val = src_reg->umax_value;
- if (signed_sub_overflows(dst_reg->smin_value, smax_val) ||
- signed_sub_overflows(dst_reg->smax_value, smin_val)) {
+ if (check_sub_overflow(*dst_smin, src_reg->smax_value, dst_smin) ||
+ check_sub_overflow(*dst_smax, src_reg->smin_value, dst_smax)) {
/* Overflow possible, we know nothing */
- dst_reg->smin_value = S64_MIN;
- dst_reg->smax_value = S64_MAX;
- } else {
- dst_reg->smin_value -= smax_val;
- dst_reg->smax_value -= smin_val;
+ *dst_smin = S64_MIN;
+ *dst_smax = S64_MAX;
}
if (dst_reg->umin_value < umax_val) {
/* Overflow possible, we know nothing */
@@ -14047,6 +13995,7 @@ static int adjust_reg_min_max_vals(struct bpf_verifier_env *env,
struct bpf_func_state *state = vstate->frame[vstate->curframe];
struct bpf_reg_state *regs = state->regs, *dst_reg, *src_reg;
struct bpf_reg_state *ptr_reg = NULL, off_reg = {0};
+ bool alu32 = (BPF_CLASS(insn->code) != BPF_ALU64);
u8 opcode = BPF_OP(insn->code);
int err;
@@ -14069,11 +14018,7 @@ static int adjust_reg_min_max_vals(struct bpf_verifier_env *env,
if (dst_reg->type != SCALAR_VALUE)
ptr_reg = dst_reg;
- else
- /* Make sure ID is cleared otherwise dst_reg min/max could be
- * incorrectly propagated into other registers by find_equal_scalars()
- */
- dst_reg->id = 0;
+
if (BPF_SRC(insn->code) == BPF_X) {
src_reg = &regs[insn->src_reg];
if (src_reg->type != SCALAR_VALUE) {
@@ -14137,7 +14082,43 @@ static int adjust_reg_min_max_vals(struct bpf_verifier_env *env,
verbose(env, "verifier internal error: no src_reg\n");
return -EINVAL;
}
- return adjust_scalar_min_max_vals(env, insn, dst_reg, *src_reg);
+ err = adjust_scalar_min_max_vals(env, insn, dst_reg, *src_reg);
+ if (err)
+ return err;
+ /*
+ * Compilers can generate the code
+ * r1 = r2
+ * r1 += 0x1
+ * if r2 < 1000 goto ...
+ * use r1 in memory access
+	 * So remember the constant delta between r2 and r1 and update r1
+	 * after the 'if' condition.
+ */
+ if (env->bpf_capable && BPF_OP(insn->code) == BPF_ADD &&
+ dst_reg->id && is_reg_const(src_reg, alu32)) {
+ u64 val = reg_const_value(src_reg, alu32);
+
+ if ((dst_reg->id & BPF_ADD_CONST) ||
+ /* prevent overflow in find_equal_scalars() later */
+ val > (u32)S32_MAX) {
+ /*
+			 * If the register already went through rX += val,
+			 * we cannot accumulate another val into rX->off.
+ */
+ dst_reg->off = 0;
+ dst_reg->id = 0;
+ } else {
+ dst_reg->id |= BPF_ADD_CONST;
+ dst_reg->off = val;
+ }
+ } else {
+ /*
+ * Make sure ID is cleared otherwise dst_reg min/max could be
+ * incorrectly propagated into other registers by find_equal_scalars()
+ */
+ dst_reg->id = 0;
+ }
+ return 0;
}
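
For reference, the source-level pattern behind the new delta tracking looks roughly like the following BPF C fragment (struct ctx_val, read_next_slot() and the bound are hypothetical; the __u64/__u32 types are assumed to come from vmlinux.h): the compiler may compute idx + 1 before the bounds check on idx, so the bound proven on the original register has to be carried over to the copy plus its constant offset.

struct ctx_val {			/* hypothetical map value layout */
	__u64 slots[64];
};

static __u64 read_next_slot(struct ctx_val *v, __u32 idx)
{
	__u32 next = idx + 1;		/* compiled as rX = rY; rX += 1 */

	if (idx > 62)			/* the range is learned on rY ...   */
		return 0;
	return v->slots[next];		/* ... while rX = rY + 1 is used here */
}
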
/* check validity of 32-bit and 64-bit arithmetic operations */
@@ -15109,12 +15090,36 @@ static bool try_match_pkt_pointers(const struct bpf_insn *insn,
static void find_equal_scalars(struct bpf_verifier_state *vstate,
struct bpf_reg_state *known_reg)
{
+ struct bpf_reg_state fake_reg;
struct bpf_func_state *state;
struct bpf_reg_state *reg;
bpf_for_each_reg_in_vstate(vstate, state, reg, ({
- if (reg->type == SCALAR_VALUE && reg->id == known_reg->id)
+ if (reg->type != SCALAR_VALUE || reg == known_reg)
+ continue;
+ if ((reg->id & ~BPF_ADD_CONST) != (known_reg->id & ~BPF_ADD_CONST))
+ continue;
+ if ((!(reg->id & BPF_ADD_CONST) && !(known_reg->id & BPF_ADD_CONST)) ||
+ reg->off == known_reg->off) {
copy_register_state(reg, known_reg);
+ } else {
+ s32 saved_off = reg->off;
+
+ fake_reg.type = SCALAR_VALUE;
+ __mark_reg_known(&fake_reg, (s32)reg->off - (s32)known_reg->off);
+
+ /* reg = known_reg; reg += delta */
+ copy_register_state(reg, known_reg);
+ /*
+ * Must preserve off, id and add_const flag,
+ * otherwise another find_equal_scalars() will be incorrect.
+ */
+ reg->off = saved_off;
+
+ scalar32_min_max_add(reg, &fake_reg);
+ scalar_min_max_add(reg, &fake_reg);
+ reg->var_off = tnum_add(reg->var_off, fake_reg.var_off);
+ }
}));
}
@@ -16749,6 +16754,10 @@ static bool regsafe(struct bpf_verifier_env *env, struct bpf_reg_state *rold,
}
if (!rold->precise && exact == NOT_EXACT)
return true;
+ if ((rold->id & BPF_ADD_CONST) != (rcur->id & BPF_ADD_CONST))
+ return false;
+ if ((rold->id & BPF_ADD_CONST) && (rold->off != rcur->off))
+ return false;
/* Why check_ids() for scalar registers?
*
* Consider the following BPF code:
@@ -18630,8 +18639,7 @@ static void release_maps(struct bpf_verifier_env *env)
/* drop refcnt of maps used by the rejected program */
static void release_btfs(struct bpf_verifier_env *env)
{
- __bpf_free_used_btfs(env->prog->aux, env->used_btfs,
- env->used_btf_cnt);
+ __bpf_free_used_btfs(env->used_btfs, env->used_btf_cnt);
}
/* convert pseudo BPF_LD_IMM64 into generic BPF_LD_IMM64 */
@@ -18750,6 +18758,8 @@ static int adjust_jmp_off(struct bpf_prog *prog, u32 tgt_idx, u32 delta)
{
struct bpf_insn *insn = prog->insnsi;
u32 insn_cnt = prog->len, i;
+ s32 imm;
+ s16 off;
for (i = 0; i < insn_cnt; i++, insn++) {
u8 code = insn->code;
@@ -18761,15 +18771,15 @@ static int adjust_jmp_off(struct bpf_prog *prog, u32 tgt_idx, u32 delta)
if (insn->code == (BPF_JMP32 | BPF_JA)) {
if (i + 1 + insn->imm != tgt_idx)
continue;
- if (signed_add32_overflows(insn->imm, delta))
+ if (check_add_overflow(insn->imm, delta, &imm))
return -ERANGE;
- insn->imm += delta;
+ insn->imm = imm;
} else {
if (i + 1 + insn->off != tgt_idx)
continue;
- if (signed_add16_overflows(insn->imm, delta))
+ if (check_add_overflow(insn->off, delta, &off))
return -ERANGE;
- insn->off += delta;
+ insn->off = off;
}
}
return 0;
@@ -21122,8 +21132,12 @@ BTF_SET_START(btf_non_sleepable_error_inject)
* Assume non-sleepable from bpf safety point of view.
*/
BTF_ID(func, __filemap_add_folio)
+#ifdef CONFIG_FAIL_PAGE_ALLOC
BTF_ID(func, should_fail_alloc_page)
+#endif
+#ifdef CONFIG_FAILSLAB
BTF_ID(func, should_failslab)
+#endif
BTF_SET_END(btf_non_sleepable_error_inject)
static int check_non_sleepable_error_inject(u32 btf_id)
diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c
index e32b6972c478..c8e4b62b436a 100644
--- a/kernel/cgroup/cgroup.c
+++ b/kernel/cgroup/cgroup.c
@@ -1744,8 +1744,11 @@ static int css_populate_dir(struct cgroup_subsys_state *css)
if (cgroup_psi_enabled()) {
ret = cgroup_addrm_files(css, cgrp,
cgroup_psi_files, true);
- if (ret < 0)
+ if (ret < 0) {
+ cgroup_addrm_files(css, cgrp,
+ cgroup_base_files, false);
return ret;
+ }
}
} else {
ret = cgroup_addrm_files(css, cgrp,
@@ -1839,9 +1842,9 @@ int rebind_subsystems(struct cgroup_root *dst_root, u16 ss_mask)
RCU_INIT_POINTER(scgrp->subsys[ssid], NULL);
rcu_assign_pointer(dcgrp->subsys[ssid], css);
ss->root = dst_root;
- css->cgroup = dcgrp;
spin_lock_irq(&css_set_lock);
+ css->cgroup = dcgrp;
WARN_ON(!list_empty(&dcgrp->e_csets[ss->id]));
list_for_each_entry_safe(cset, cset_pos, &scgrp->e_csets[ss->id],
e_cset_node[ss->id]) {
@@ -1922,6 +1925,7 @@ enum cgroup2_param {
Opt_memory_localevents,
Opt_memory_recursiveprot,
Opt_memory_hugetlb_accounting,
+ Opt_pids_localevents,
nr__cgroup2_params
};
@@ -1931,6 +1935,7 @@ static const struct fs_parameter_spec cgroup2_fs_parameters[] = {
fsparam_flag("memory_localevents", Opt_memory_localevents),
fsparam_flag("memory_recursiveprot", Opt_memory_recursiveprot),
fsparam_flag("memory_hugetlb_accounting", Opt_memory_hugetlb_accounting),
+ fsparam_flag("pids_localevents", Opt_pids_localevents),
{}
};
@@ -1960,6 +1965,9 @@ static int cgroup2_parse_param(struct fs_context *fc, struct fs_parameter *param
case Opt_memory_hugetlb_accounting:
ctx->flags |= CGRP_ROOT_MEMORY_HUGETLB_ACCOUNTING;
return 0;
+ case Opt_pids_localevents:
+ ctx->flags |= CGRP_ROOT_PIDS_LOCAL_EVENTS;
+ return 0;
}
return -EINVAL;
}
@@ -1989,6 +1997,11 @@ static void apply_cgroup_root_flags(unsigned int root_flags)
cgrp_dfl_root.flags |= CGRP_ROOT_MEMORY_HUGETLB_ACCOUNTING;
else
cgrp_dfl_root.flags &= ~CGRP_ROOT_MEMORY_HUGETLB_ACCOUNTING;
+
+ if (root_flags & CGRP_ROOT_PIDS_LOCAL_EVENTS)
+ cgrp_dfl_root.flags |= CGRP_ROOT_PIDS_LOCAL_EVENTS;
+ else
+ cgrp_dfl_root.flags &= ~CGRP_ROOT_PIDS_LOCAL_EVENTS;
}
}
@@ -2004,6 +2017,8 @@ static int cgroup_show_options(struct seq_file *seq, struct kernfs_root *kf_root
seq_puts(seq, ",memory_recursiveprot");
if (cgrp_dfl_root.flags & CGRP_ROOT_MEMORY_HUGETLB_ACCOUNTING)
seq_puts(seq, ",memory_hugetlb_accounting");
+ if (cgrp_dfl_root.flags & CGRP_ROOT_PIDS_LOCAL_EVENTS)
+ seq_puts(seq, ",pids_localevents");
return 0;
}
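
The new pids_localevents flag follows the same pattern as the other cgroup2 mount options: it can be supplied at mount time or toggled later by remounting the v2 hierarchy. A rough userspace sketch (the mount point and the remount approach are illustrative, not a definitive recipe):

#include <sys/mount.h>

int main(void)
{
	/* Toggle the flag on the already-mounted cgroup2 hierarchy. */
	return mount(NULL, "/sys/fs/cgroup", NULL, MS_REMOUNT,
		     "pids_localevents");
}
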
@@ -6686,8 +6701,10 @@ void cgroup_exit(struct task_struct *tsk)
WARN_ON_ONCE(list_empty(&tsk->cg_list));
cset = task_css_set(tsk);
css_set_move_task(tsk, cset, NULL, false);
- list_add_tail(&tsk->cg_list, &cset->dying_tasks);
cset->nr_tasks--;
+ /* matches the signal->live check in css_task_iter_advance() */
+ if (thread_group_leader(tsk) && atomic_read(&tsk->signal->live))
+ list_add_tail(&tsk->cg_list, &cset->dying_tasks);
if (dl_task(tsk))
dec_dl_tasks_cs(tsk);
@@ -6714,10 +6731,12 @@ void cgroup_release(struct task_struct *task)
ss->release(task);
} while_each_subsys_mask();
- spin_lock_irq(&css_set_lock);
- css_set_skip_task_iters(task_css_set(task), task);
- list_del_init(&task->cg_list);
- spin_unlock_irq(&css_set_lock);
+ if (!list_empty(&task->cg_list)) {
+ spin_lock_irq(&css_set_lock);
+ css_set_skip_task_iters(task_css_set(task), task);
+ list_del_init(&task->cg_list);
+ spin_unlock_irq(&css_set_lock);
+ }
}
void cgroup_free(struct task_struct *task)
@@ -7062,7 +7081,8 @@ static ssize_t features_show(struct kobject *kobj, struct kobj_attribute *attr,
"favordynmods\n"
"memory_localevents\n"
"memory_recursiveprot\n"
- "memory_hugetlb_accounting\n");
+ "memory_hugetlb_accounting\n"
+ "pids_localevents\n");
}
static struct kobj_attribute cgroup_features_attr = __ATTR_RO(features);
diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c
index c12b9fdb22a4..40ec4abaf440 100644
--- a/kernel/cgroup/cpuset.c
+++ b/kernel/cgroup/cpuset.c
@@ -21,6 +21,7 @@
* License. See the file COPYING in the main directory of the Linux
* distribution for more details.
*/
+#include "cgroup-internal.h"
#include <linux/cpu.h>
#include <linux/cpumask.h>
@@ -87,7 +88,7 @@ static const char * const perr_strings[] = {
[PERR_NOTEXCL] = "Cpu list in cpuset.cpus not exclusive",
[PERR_NOCPUS] = "Parent unable to distribute cpu downstream",
[PERR_HOTPLUG] = "No cpu available due to hotplug",
- [PERR_CPUSEMPTY] = "cpuset.cpus is empty",
+ [PERR_CPUSEMPTY] = "cpuset.cpus and cpuset.cpus.exclusive are empty",
[PERR_HKEEPING] = "partition config conflicts with housekeeping setup",
};
@@ -127,19 +128,28 @@ struct cpuset {
/*
* Exclusive CPUs dedicated to current cgroup (default hierarchy only)
*
- * This exclusive CPUs must be a subset of cpus_allowed. A parent
- * cgroup can only grant exclusive CPUs to one of its children.
+	 * The effective_cpus of a valid partition root comes solely from its
+	 * effective_xcpus, and some of the effective_xcpus may be distributed
+	 * to sub-partitions below & hence excluded from its effective_cpus.
+	 * For a valid partition root, its effective_cpus have no relationship
+	 * with cpus_allowed unless its exclusive_cpus is unset.
*
- * When the cgroup becomes a valid partition root, effective_xcpus
- * defaults to cpus_allowed if not set. The effective_cpus of a valid
- * partition root comes solely from its effective_xcpus and some of the
- * effective_xcpus may be distributed to sub-partitions below & hence
- * excluded from its effective_cpus.
+ * This value will only be set if either exclusive_cpus is set or
+ * when this cpuset becomes a local partition root.
*/
cpumask_var_t effective_xcpus;
/*
* Exclusive CPUs as requested by the user (default hierarchy only)
+ *
+ * Its value is independent of cpus_allowed and designates the set of
+ * CPUs that can be granted to the current cpuset or its children when
+ * it becomes a valid partition root. The effective set of exclusive
+ * CPUs granted (effective_xcpus) depends on whether those exclusive
+ * CPUs are passed down by its ancestors and not yet taken up by
+ * another sibling partition root along the way.
+ *
+ * If its value isn't set, it defaults to cpus_allowed.
*/
cpumask_var_t exclusive_cpus;
@@ -169,7 +179,7 @@ struct cpuset {
/* for custom sched domain */
int relax_domain_level;
- /* number of valid sub-partitions */
+ /* number of valid local child partitions */
int nr_subparts;
/* partition root state */
@@ -230,6 +240,17 @@ static struct list_head remote_children;
* 2 - partition root without load balancing (isolated)
* -1 - invalid partition root
* -2 - invalid isolated partition root
+ *
+ * There are 2 types of partitions - local or remote. Local partitions are
+ * those whose parents are partition roots themselves. Setting
+ * cpuset.cpus.exclusive is optional when setting up a local partition.
+ * Remote partitions are those whose parents are not partition roots. Passing
+ * down exclusive CPUs by setting cpuset.cpus.exclusive along its ancestor
+ * nodes is mandatory when creating a remote partition.
+ *
+ * For simplicity, a local partition can be created under a local or remote
+ * partition but a remote partition cannot have any partition root in its
+ * ancestor chain except the cgroup root.
*/
#define PRS_MEMBER 0
#define PRS_ROOT 1
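
As a concrete illustration of the remote-partition rule described in the comment above (the cgroup paths and CPU list are hypothetical), the exclusive CPUs have to be handed down through cpuset.cpus.exclusive on every ancestor before the leaf itself can be switched to a partition root:

#include <stdio.h>

/* Hypothetical helper: write a value into a cgroup interface file. */
static void cg_write(const char *path, const char *val)
{
	FILE *f = fopen(path, "w");

	if (f) {
		fputs(val, f);
		fclose(f);
	}
}

int main(void)
{
	/* "a" is not a partition root itself, so "a/b" becomes remote. */
	cg_write("/sys/fs/cgroup/a/cpuset.cpus.exclusive", "2-3");
	cg_write("/sys/fs/cgroup/a/b/cpuset.cpus.exclusive", "2-3");
	cg_write("/sys/fs/cgroup/a/b/cpuset.cpus.partition", "root");
	return 0;
}
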
@@ -434,7 +455,7 @@ static struct cpuset top_cpuset = {
* by other task, we use alloc_lock in the task_struct fields to protect
* them.
*
- * The cpuset_common_file_read() handlers only hold callback_lock across
+ * The cpuset_common_seq_show() handlers only hold callback_lock across
* small pieces of code, such as when reading out possibly multi-word
* cpumasks and nodemasks.
*
@@ -709,6 +730,19 @@ static inline void free_cpuset(struct cpuset *cs)
kfree(cs);
}
+/* Return user specified exclusive CPUs */
+static inline struct cpumask *user_xcpus(struct cpuset *cs)
+{
+ return cpumask_empty(cs->exclusive_cpus) ? cs->cpus_allowed
+ : cs->exclusive_cpus;
+}
+
+static inline bool xcpus_empty(struct cpuset *cs)
+{
+ return cpumask_empty(cs->cpus_allowed) &&
+ cpumask_empty(cs->exclusive_cpus);
+}
+
static inline struct cpumask *fetch_xcpus(struct cpuset *cs)
{
return !cpumask_empty(cs->exclusive_cpus) ? cs->exclusive_cpus :
@@ -825,17 +859,41 @@ static int validate_change(struct cpuset *cur, struct cpuset *trial)
/*
* If either I or some sibling (!= me) is exclusive, we can't
- * overlap
+ * overlap. exclusive_cpus cannot overlap with each other if set.
*/
ret = -EINVAL;
cpuset_for_each_child(c, css, par) {
- if ((is_cpu_exclusive(trial) || is_cpu_exclusive(c)) &&
- c != cur) {
+ bool txset, cxset; /* Are exclusive_cpus set? */
+
+ if (c == cur)
+ continue;
+
+ txset = !cpumask_empty(trial->exclusive_cpus);
+ cxset = !cpumask_empty(c->exclusive_cpus);
+ if (is_cpu_exclusive(trial) || is_cpu_exclusive(c) ||
+ (txset && cxset)) {
if (!cpusets_are_exclusive(trial, c))
goto out;
+ } else if (txset || cxset) {
+ struct cpumask *xcpus, *acpus;
+
+ /*
+			 * When just one of the two exclusive_cpus is set,
+			 * the cpus_allowed of the other cpuset, if set, cannot
+			 * be a subset of it, or none of those CPUs will be
+			 * available once these exclusive CPUs are activated.
+ */
+ if (txset) {
+ xcpus = trial->exclusive_cpus;
+ acpus = c->cpus_allowed;
+ } else {
+ xcpus = c->exclusive_cpus;
+ acpus = trial->cpus_allowed;
+ }
+ if (!cpumask_empty(acpus) && cpumask_subset(acpus, xcpus))
+ goto out;
}
if ((is_mem_exclusive(trial) || is_mem_exclusive(c)) &&
- c != cur &&
nodes_intersects(trial->mems_allowed, c->mems_allowed))
goto out;
}
@@ -957,13 +1015,15 @@ static int generate_sched_domains(cpumask_var_t **domains,
int nslot; /* next empty doms[] struct cpumask slot */
struct cgroup_subsys_state *pos_css;
bool root_load_balance = is_sched_load_balance(&top_cpuset);
+ bool cgrpv2 = cgroup_subsys_on_dfl(cpuset_cgrp_subsys);
doms = NULL;
dattr = NULL;
csa = NULL;
/* Special case for the 99% of systems with one, full, sched domain */
- if (root_load_balance && !top_cpuset.nr_subparts) {
+ if (root_load_balance && cpumask_empty(subpartitions_cpus)) {
+single_root_domain:
ndoms = 1;
doms = alloc_sched_domains(ndoms);
if (!doms)
@@ -991,16 +1051,18 @@ static int generate_sched_domains(cpumask_var_t **domains,
cpuset_for_each_descendant_pre(cp, pos_css, &top_cpuset) {
if (cp == &top_cpuset)
continue;
+
+ if (cgrpv2)
+ goto v2;
+
/*
+ * v1:
* Continue traversing beyond @cp iff @cp has some CPUs and
* isn't load balancing. The former is obvious. The
* latter: All child cpusets contain a subset of the
* parent's cpus, so just skip them, and then we call
* update_domain_attr_tree() to calc relax_domain_level of
* the corresponding sched domain.
- *
- * If root is load-balancing, we can skip @cp if it
- * is a subset of the root's effective_cpus.
*/
if (!cpumask_empty(cp->cpus_allowed) &&
!(is_sched_load_balance(cp) &&
@@ -1008,20 +1070,39 @@ static int generate_sched_domains(cpumask_var_t **domains,
housekeeping_cpumask(HK_TYPE_DOMAIN))))
continue;
- if (root_load_balance &&
- cpumask_subset(cp->cpus_allowed, top_cpuset.effective_cpus))
- continue;
-
if (is_sched_load_balance(cp) &&
!cpumask_empty(cp->effective_cpus))
csa[csn++] = cp;
- /* skip @cp's subtree if not a partition root */
- if (!is_partition_valid(cp))
+ /* skip @cp's subtree */
+ pos_css = css_rightmost_descendant(pos_css);
+ continue;
+
+v2:
+ /*
+	 * Only valid partition roots that are not isolated and have a
+	 * non-empty effective_cpus will be saved into csa[].
+ */
+ if ((cp->partition_root_state == PRS_ROOT) &&
+ !cpumask_empty(cp->effective_cpus))
+ csa[csn++] = cp;
+
+ /*
+ * Skip @cp's subtree if not a partition root and has no
+ * exclusive CPUs to be granted to child cpusets.
+ */
+ if (!is_partition_valid(cp) && cpumask_empty(cp->exclusive_cpus))
pos_css = css_rightmost_descendant(pos_css);
}
rcu_read_unlock();
+ /*
+ * If there are only isolated partitions underneath the cgroup root,
+ * we can optimize out unneeded sched domains scanning.
+ */
+ if (root_load_balance && (csn == 1))
+ goto single_root_domain;
+
for (i = 0; i < csn; i++)
csa[i]->pn = i;
ndoms = csn;
@@ -1064,6 +1145,20 @@ restart:
dattr = kmalloc_array(ndoms, sizeof(struct sched_domain_attr),
GFP_KERNEL);
+ /*
+	 * Cgroup v2 doesn't support domain attributes, so just set all of them
+	 * to SD_ATTR_INIT. Also, non-isolating partition root CPUs are a
+ * subset of HK_TYPE_DOMAIN housekeeping CPUs.
+ */
+ if (cgrpv2) {
+ for (i = 0; i < ndoms; i++) {
+ cpumask_copy(doms[i], csa[i]->effective_cpus);
+ if (dattr)
+ dattr[i] = SD_ATTR_INIT;
+ }
+ goto done;
+ }
+
for (nslot = 0, i = 0; i < csn; i++) {
struct cpuset *a = csa[i];
struct cpumask *dp;
@@ -1223,7 +1318,7 @@ static void rebuild_sched_domains_locked(void)
* root should be only a subset of the active CPUs. Since a CPU in any
* partition root could be offlined, all must be checked.
*/
- if (top_cpuset.nr_subparts) {
+ if (!cpumask_empty(subpartitions_cpus)) {
rcu_read_lock();
cpuset_for_each_descendant_pre(cs, pos_css, &top_cpuset) {
if (!is_partition_valid(cs)) {
@@ -1338,7 +1433,7 @@ static void update_sibling_cpumasks(struct cpuset *parent, struct cpuset *cs,
*/
static int update_partition_exclusive(struct cpuset *cs, int new_prs)
{
- bool exclusive = (new_prs > 0);
+ bool exclusive = (new_prs > PRS_MEMBER);
if (exclusive && !is_cpu_exclusive(cs)) {
if (update_flag(CS_CPU_EXCLUSIVE, cs, 1))
@@ -1532,7 +1627,7 @@ EXPORT_SYMBOL_GPL(cpuset_cpu_is_isolated);
* Return: true if xcpus is not empty, false otherwise.
*
* Starting with exclusive_cpus (cpus_allowed if exclusive_cpus is not set),
- * it must be a subset of cpus_allowed and parent's effective_xcpus.
+ * it must be a subset of parent's effective_xcpus.
*/
static bool compute_effective_exclusive_cpumask(struct cpuset *cs,
struct cpumask *xcpus)
@@ -1542,12 +1637,7 @@ static bool compute_effective_exclusive_cpumask(struct cpuset *cs,
if (!xcpus)
xcpus = cs->effective_xcpus;
- if (!cpumask_empty(cs->exclusive_cpus))
- cpumask_and(xcpus, cs->exclusive_cpus, cs->cpus_allowed);
- else
- cpumask_copy(xcpus, cs->cpus_allowed);
-
- return cpumask_and(xcpus, xcpus, parent->effective_xcpus);
+ return cpumask_and(xcpus, user_xcpus(cs), parent->effective_xcpus);
}
static inline bool is_remote_partition(struct cpuset *cs)
@@ -1826,8 +1916,7 @@ static int update_parent_effective_cpumask(struct cpuset *cs, int cmd,
*/
adding = deleting = false;
old_prs = new_prs = cs->partition_root_state;
- xcpus = !cpumask_empty(cs->exclusive_cpus)
- ? cs->effective_xcpus : cs->cpus_allowed;
+ xcpus = user_xcpus(cs);
if (cmd == partcmd_invalidate) {
if (is_prs_invalid(old_prs))
@@ -1855,7 +1944,7 @@ static int update_parent_effective_cpumask(struct cpuset *cs, int cmd,
return is_partition_invalid(parent)
? PERR_INVPARENT : PERR_NOTPART;
}
- if (!newmask && cpumask_empty(cs->cpus_allowed))
+ if (!newmask && xcpus_empty(cs))
return PERR_CPUSEMPTY;
nocpu = tasks_nocpu_error(parent, cs, xcpus);
@@ -2583,8 +2672,6 @@ static int update_exclusive_cpumask(struct cpuset *cs, struct cpuset *trialcs,
retval = cpulist_parse(buf, trialcs->exclusive_cpus);
if (retval < 0)
return retval;
- if (!is_cpu_exclusive(cs))
- set_bit(CS_CPU_EXCLUSIVE, &trialcs->flags);
}
/* Nothing to do if the CPUs didn't change */
@@ -3071,9 +3158,9 @@ static int update_prstate(struct cpuset *cs, int new_prs)
? partcmd_enable : partcmd_enablei;
/*
- * cpus_allowed cannot be empty.
+ * cpus_allowed and exclusive_cpus cannot be both empty.
*/
- if (cpumask_empty(cs->cpus_allowed)) {
+ if (xcpus_empty(cs)) {
err = PERR_CPUSEMPTY;
goto out;
}
@@ -4009,8 +4096,6 @@ cpuset_css_alloc(struct cgroup_subsys_state *parent_css)
}
__set_bit(CS_SCHED_LOAD_BALANCE, &cs->flags);
- nodes_clear(cs->mems_allowed);
- nodes_clear(cs->effective_mems);
fmeter_init(&cs->fmeter);
cs->relax_domain_level = -1;
INIT_LIST_HEAD(&cs->remote_sibling);
@@ -4040,6 +4125,12 @@ static int cpuset_css_online(struct cgroup_subsys_state *css)
set_bit(CS_SPREAD_PAGE, &cs->flags);
if (is_spread_slab(parent))
set_bit(CS_SPREAD_SLAB, &cs->flags);
+ /*
+ * For v2, clear CS_SCHED_LOAD_BALANCE if parent is isolated
+ */
+ if (cgroup_subsys_on_dfl(cpuset_cgrp_subsys) &&
+ !is_sched_load_balance(parent))
+ clear_bit(CS_SCHED_LOAD_BALANCE, &cs->flags);
cpuset_inc();
@@ -4050,14 +4141,6 @@ static int cpuset_css_online(struct cgroup_subsys_state *css)
cs->use_parent_ecpus = true;
parent->child_ecpus_count++;
}
-
- /*
- * For v2, clear CS_SCHED_LOAD_BALANCE if parent is isolated
- */
- if (cgroup_subsys_on_dfl(cpuset_cgrp_subsys) &&
- !is_sched_load_balance(parent))
- clear_bit(CS_SCHED_LOAD_BALANCE, &cs->flags);
-
spin_unlock_irq(&callback_lock);
if (!test_bit(CGRP_CPUSET_CLONE_CHILDREN, &css->cgroup->flags))
@@ -4571,7 +4654,7 @@ static void cpuset_handle_hotplug(void)
* In the rare case that hotplug removes all the cpus in
* subpartitions_cpus, we assumed that cpus are updated.
*/
- if (!cpus_updated && top_cpuset.nr_subparts)
+ if (!cpus_updated && !cpumask_empty(subpartitions_cpus))
cpus_updated = true;
/* For v1, synchronize cpus_allowed to cpu_active_mask */
@@ -5051,10 +5134,14 @@ int proc_cpuset_show(struct seq_file *m, struct pid_namespace *ns,
if (!buf)
goto out;
- css = task_get_css(tsk, cpuset_cgrp_id);
- retval = cgroup_path_ns(css->cgroup, buf, PATH_MAX,
- current->nsproxy->cgroup_ns);
- css_put(css);
+ rcu_read_lock();
+ spin_lock_irq(&css_set_lock);
+ css = task_css(tsk, cpuset_cgrp_id);
+ retval = cgroup_path_ns_locked(css->cgroup, buf, PATH_MAX,
+ current->nsproxy->cgroup_ns);
+ spin_unlock_irq(&css_set_lock);
+ rcu_read_unlock();
+
if (retval == -E2BIG)
retval = -ENAMETOOLONG;
if (retval < 0)
diff --git a/kernel/cgroup/misc.c b/kernel/cgroup/misc.c
index 79a3717a5803..0e26068995a6 100644
--- a/kernel/cgroup/misc.c
+++ b/kernel/cgroup/misc.c
@@ -121,6 +121,30 @@ static void misc_cg_cancel_charge(enum misc_res_type type, struct misc_cg *cg,
misc_res_name[type]);
}
+static void misc_cg_update_watermark(struct misc_res *res, u64 new_usage)
+{
+ u64 old;
+
+ while (true) {
+ old = atomic64_read(&res->watermark);
+ if (new_usage <= old)
+ break;
+ if (atomic64_cmpxchg(&res->watermark, old, new_usage) == old)
+ break;
+ }
+}
+
+static void misc_cg_event(enum misc_res_type type, struct misc_cg *cg)
+{
+ atomic64_inc(&cg->res[type].events_local);
+ cgroup_file_notify(&cg->events_local_file);
+
+ for (; parent_misc(cg); cg = parent_misc(cg)) {
+ atomic64_inc(&cg->res[type].events);
+ cgroup_file_notify(&cg->events_file);
+ }
+}
+
/**
* misc_cg_try_charge() - Try charging the misc cgroup.
* @type: Misc res type to charge.
@@ -159,14 +183,12 @@ int misc_cg_try_charge(enum misc_res_type type, struct misc_cg *cg, u64 amount)
ret = -EBUSY;
goto err_charge;
}
+ misc_cg_update_watermark(res, new_usage);
}
return 0;
err_charge:
- for (j = i; j; j = parent_misc(j)) {
- atomic64_inc(&j->res[type].events);
- cgroup_file_notify(&j->events_file);
- }
+ misc_cg_event(type, i);
for (j = cg; j != i; j = parent_misc(j))
misc_cg_cancel_charge(type, j, amount);
@@ -308,6 +330,29 @@ static int misc_cg_current_show(struct seq_file *sf, void *v)
}
/**
+ * misc_cg_peak_show() - Show the peak usage of the misc cgroup.
+ * @sf: Interface file
+ * @v: Arguments passed
+ *
+ * Context: Any context.
+ * Return: 0 to denote successful print.
+ */
+static int misc_cg_peak_show(struct seq_file *sf, void *v)
+{
+ int i;
+ u64 watermark;
+ struct misc_cg *cg = css_misc(seq_css(sf));
+
+ for (i = 0; i < MISC_CG_RES_TYPES; i++) {
+ watermark = atomic64_read(&cg->res[i].watermark);
+ if (READ_ONCE(misc_res_capacity[i]) || watermark)
+ seq_printf(sf, "%s %llu\n", misc_res_name[i], watermark);
+ }
+
+ return 0;
+}
+
+/**
* misc_cg_capacity_show() - Show the total capacity of misc res on the host.
* @sf: Interface file
* @v: Arguments passed
@@ -331,20 +376,33 @@ static int misc_cg_capacity_show(struct seq_file *sf, void *v)
return 0;
}
-static int misc_events_show(struct seq_file *sf, void *v)
+static int __misc_events_show(struct seq_file *sf, bool local)
{
struct misc_cg *cg = css_misc(seq_css(sf));
u64 events;
int i;
for (i = 0; i < MISC_CG_RES_TYPES; i++) {
- events = atomic64_read(&cg->res[i].events);
+ if (local)
+ events = atomic64_read(&cg->res[i].events_local);
+ else
+ events = atomic64_read(&cg->res[i].events);
if (READ_ONCE(misc_res_capacity[i]) || events)
seq_printf(sf, "%s.max %llu\n", misc_res_name[i], events);
}
return 0;
}
+static int misc_events_show(struct seq_file *sf, void *v)
+{
+ return __misc_events_show(sf, false);
+}
+
+static int misc_events_local_show(struct seq_file *sf, void *v)
+{
+ return __misc_events_show(sf, true);
+}
+
/* Misc cgroup interface files */
static struct cftype misc_cg_files[] = {
{
@@ -358,6 +416,10 @@ static struct cftype misc_cg_files[] = {
.seq_show = misc_cg_current_show,
},
{
+ .name = "peak",
+ .seq_show = misc_cg_peak_show,
+ },
+ {
.name = "capacity",
.seq_show = misc_cg_capacity_show,
.flags = CFTYPE_ONLY_ON_ROOT,
@@ -368,6 +430,12 @@ static struct cftype misc_cg_files[] = {
.file_offset = offsetof(struct misc_cg, events_file),
.seq_show = misc_events_show,
},
+ {
+ .name = "events.local",
+ .flags = CFTYPE_NOT_ON_ROOT,
+ .file_offset = offsetof(struct misc_cg, events_local_file),
+ .seq_show = misc_events_local_show,
+ },
{}
};
diff --git a/kernel/cgroup/pids.c b/kernel/cgroup/pids.c
index 0e5ec7d59b4d..f5cb0ec45b9d 100644
--- a/kernel/cgroup/pids.c
+++ b/kernel/cgroup/pids.c
@@ -38,6 +38,14 @@
#define PIDS_MAX (PID_MAX_LIMIT + 1ULL)
#define PIDS_MAX_STR "max"
+enum pidcg_event {
+ /* Fork failed in subtree because this pids_cgroup limit was hit. */
+ PIDCG_MAX,
+ /* Fork failed in this pids_cgroup because ancestor limit was hit. */
+ PIDCG_FORKFAIL,
+ NR_PIDCG_EVENTS,
+};
+
struct pids_cgroup {
struct cgroup_subsys_state css;
@@ -49,11 +57,12 @@ struct pids_cgroup {
atomic64_t limit;
int64_t watermark;
- /* Handle for "pids.events" */
+ /* Handles for pids.events[.local] */
struct cgroup_file events_file;
+ struct cgroup_file events_local_file;
- /* Number of times fork failed because limit was hit. */
- atomic64_t events_limit;
+ atomic64_t events[NR_PIDCG_EVENTS];
+ atomic64_t events_local[NR_PIDCG_EVENTS];
};
static struct pids_cgroup *css_pids(struct cgroup_subsys_state *css)
@@ -148,12 +157,13 @@ static void pids_charge(struct pids_cgroup *pids, int num)
* pids_try_charge - hierarchically try to charge the pid count
* @pids: the pid cgroup state
* @num: the number of pids to charge
+ * @fail: storage for the pids cgroup whose limit caused the failure
*
* This function follows the set limit. It will fail if the charge would cause
* the new value to exceed the hierarchical limit. Returns 0 if the charge
* succeeded, otherwise -EAGAIN.
*/
-static int pids_try_charge(struct pids_cgroup *pids, int num)
+static int pids_try_charge(struct pids_cgroup *pids, int num, struct pids_cgroup **fail)
{
struct pids_cgroup *p, *q;
@@ -166,9 +176,10 @@ static int pids_try_charge(struct pids_cgroup *pids, int num)
* p->limit is %PIDS_MAX then we know that this test will never
* fail.
*/
- if (new > limit)
+ if (new > limit) {
+ *fail = p;
goto revert;
-
+ }
/*
* Not technically accurate if we go over limit somewhere up
* the hierarchy, but that's tolerable for the watermark.
@@ -229,6 +240,36 @@ static void pids_cancel_attach(struct cgroup_taskset *tset)
}
}
+static void pids_event(struct pids_cgroup *pids_forking,
+ struct pids_cgroup *pids_over_limit)
+{
+ struct pids_cgroup *p = pids_forking;
+ bool limit = false;
+
+	/* Only log the first time the limit is hit. */
+ if (atomic64_inc_return(&p->events_local[PIDCG_FORKFAIL]) == 1) {
+ pr_info("cgroup: fork rejected by pids controller in ");
+ pr_cont_cgroup_path(p->css.cgroup);
+ pr_cont("\n");
+ }
+ cgroup_file_notify(&p->events_local_file);
+ if (!cgroup_subsys_on_dfl(pids_cgrp_subsys) ||
+ cgrp_dfl_root.flags & CGRP_ROOT_PIDS_LOCAL_EVENTS)
+ return;
+
+ for (; parent_pids(p); p = parent_pids(p)) {
+ if (p == pids_over_limit) {
+ limit = true;
+ atomic64_inc(&p->events_local[PIDCG_MAX]);
+ cgroup_file_notify(&p->events_local_file);
+ }
+ if (limit)
+ atomic64_inc(&p->events[PIDCG_MAX]);
+
+ cgroup_file_notify(&p->events_file);
+ }
+}
+
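
Since pids_event() ends with cgroup_file_notify(), both pids.events and the new pids.events.local can be watched from userspace with poll() on the interface file (notifications surface as POLLPRI/POLLERR). A rough sketch; the cgroup path is hypothetical:

#include <fcntl.h>
#include <poll.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	struct pollfd pfd = {
		.fd	= open("/sys/fs/cgroup/test/pids.events.local", O_RDONLY),
		.events	= POLLPRI,
	};
	char buf[64];
	ssize_t n;

	while (pfd.fd >= 0 && poll(&pfd, 1, -1) > 0) {
		lseek(pfd.fd, 0, SEEK_SET);
		n = read(pfd.fd, buf, sizeof(buf) - 1);
		if (n > 0) {
			buf[n] = '\0';
			printf("%s", buf);	/* e.g. "max 1" */
		}
	}
	return 0;
}
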
/*
* task_css_check(true) in pids_can_fork() and pids_cancel_fork() relies
* on cgroup_threadgroup_change_begin() held by the copy_process().
@@ -236,7 +277,7 @@ static void pids_cancel_attach(struct cgroup_taskset *tset)
static int pids_can_fork(struct task_struct *task, struct css_set *cset)
{
struct cgroup_subsys_state *css;
- struct pids_cgroup *pids;
+ struct pids_cgroup *pids, *pids_over_limit;
int err;
if (cset)
@@ -244,16 +285,10 @@ static int pids_can_fork(struct task_struct *task, struct css_set *cset)
else
css = task_css_check(current, pids_cgrp_id, true);
pids = css_pids(css);
- err = pids_try_charge(pids, 1);
- if (err) {
- /* Only log the first time events_limit is incremented. */
- if (atomic64_inc_return(&pids->events_limit) == 1) {
- pr_info("cgroup: fork rejected by pids controller in ");
- pr_cont_cgroup_path(css->cgroup);
- pr_cont("\n");
- }
- cgroup_file_notify(&pids->events_file);
- }
+ err = pids_try_charge(pids, 1, &pids_over_limit);
+ if (err)
+ pids_event(pids, pids_over_limit);
+
return err;
}
@@ -337,11 +372,32 @@ static s64 pids_peak_read(struct cgroup_subsys_state *css,
return READ_ONCE(pids->watermark);
}
-static int pids_events_show(struct seq_file *sf, void *v)
+static int __pids_events_show(struct seq_file *sf, bool local)
{
struct pids_cgroup *pids = css_pids(seq_css(sf));
+ enum pidcg_event pe = PIDCG_MAX;
+ atomic64_t *events;
+
+ if (!cgroup_subsys_on_dfl(pids_cgrp_subsys) ||
+ cgrp_dfl_root.flags & CGRP_ROOT_PIDS_LOCAL_EVENTS) {
+ pe = PIDCG_FORKFAIL;
+ local = true;
+ }
+ events = local ? pids->events_local : pids->events;
+
+ seq_printf(sf, "max %lld\n", (s64)atomic64_read(&events[pe]));
+ return 0;
+}
- seq_printf(sf, "max %lld\n", (s64)atomic64_read(&pids->events_limit));
+static int pids_events_show(struct seq_file *sf, void *v)
+{
+ __pids_events_show(sf, false);
+ return 0;
+}
+
+static int pids_events_local_show(struct seq_file *sf, void *v)
+{
+ __pids_events_show(sf, true);
return 0;
}
@@ -368,9 +424,42 @@ static struct cftype pids_files[] = {
.file_offset = offsetof(struct pids_cgroup, events_file),
.flags = CFTYPE_NOT_ON_ROOT,
},
+ {
+ .name = "events.local",
+ .seq_show = pids_events_local_show,
+ .file_offset = offsetof(struct pids_cgroup, events_local_file),
+ .flags = CFTYPE_NOT_ON_ROOT,
+ },
+ { } /* terminate */
+};
+
+static struct cftype pids_files_legacy[] = {
+ {
+ .name = "max",
+ .write = pids_max_write,
+ .seq_show = pids_max_show,
+ .flags = CFTYPE_NOT_ON_ROOT,
+ },
+ {
+ .name = "current",
+ .read_s64 = pids_current_read,
+ .flags = CFTYPE_NOT_ON_ROOT,
+ },
+ {
+ .name = "peak",
+ .flags = CFTYPE_NOT_ON_ROOT,
+ .read_s64 = pids_peak_read,
+ },
+ {
+ .name = "events",
+ .seq_show = pids_events_show,
+ .file_offset = offsetof(struct pids_cgroup, events_file),
+ .flags = CFTYPE_NOT_ON_ROOT,
+ },
{ } /* terminate */
};
+
struct cgroup_subsys pids_cgrp_subsys = {
.css_alloc = pids_css_alloc,
.css_free = pids_css_free,
@@ -379,7 +468,7 @@ struct cgroup_subsys pids_cgrp_subsys = {
.can_fork = pids_can_fork,
.cancel_fork = pids_cancel_fork,
.release = pids_release,
- .legacy_cftypes = pids_files,
+ .legacy_cftypes = pids_files_legacy,
.dfl_cftypes = pids_files,
.threaded = true,
};
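
To see the split above from userspace, a small reader can print the "max" counters of pids.events next to the new pids.events.local. This is a rough sketch in plain C; the cgroup path is a made-up example for a cgroup v2 mount at /sys/fs/cgroup.

#include <stdio.h>

/* Dump the event counters of one pids cgroup. Per the code above, the
 * pids.events "max" counter propagates up the hierarchy, while
 * pids.events.local only counts hits against this cgroup's own limit. */
static void dump_pids_events(const char *cgrp)
{
	const char *files[] = { "pids.events", "pids.events.local" };
	char path[256], line[128];

	for (int i = 0; i < 2; i++) {
		snprintf(path, sizeof(path), "%s/%s", cgrp, files[i]);
		FILE *f = fopen(path, "r");

		if (!f)
			continue;
		while (fgets(line, sizeof(line), f))
			printf("%s: %s", files[i], line);
		fclose(f);
	}
}

int main(void)
{
	dump_pids_events("/sys/fs/cgroup/example");	/* hypothetical path */
	return 0;
}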
diff --git a/kernel/cgroup/rstat.c b/kernel/cgroup/rstat.c
index fb8b49437573..a06b45272411 100644
--- a/kernel/cgroup/rstat.c
+++ b/kernel/cgroup/rstat.c
@@ -594,49 +594,46 @@ static void root_cgroup_cputime(struct cgroup_base_stat *bstat)
}
}
+
+static void cgroup_force_idle_show(struct seq_file *seq, struct cgroup_base_stat *bstat)
+{
+#ifdef CONFIG_SCHED_CORE
+ u64 forceidle_time = bstat->forceidle_sum;
+
+ do_div(forceidle_time, NSEC_PER_USEC);
+ seq_printf(seq, "core_sched.force_idle_usec %llu\n", forceidle_time);
+#endif
+}
+
void cgroup_base_stat_cputime_show(struct seq_file *seq)
{
struct cgroup *cgrp = seq_css(seq)->cgroup;
u64 usage, utime, stime;
- struct cgroup_base_stat bstat;
-#ifdef CONFIG_SCHED_CORE
- u64 forceidle_time;
-#endif
if (cgroup_parent(cgrp)) {
cgroup_rstat_flush_hold(cgrp);
usage = cgrp->bstat.cputime.sum_exec_runtime;
cputime_adjust(&cgrp->bstat.cputime, &cgrp->prev_cputime,
&utime, &stime);
-#ifdef CONFIG_SCHED_CORE
- forceidle_time = cgrp->bstat.forceidle_sum;
-#endif
cgroup_rstat_flush_release(cgrp);
} else {
- root_cgroup_cputime(&bstat);
- usage = bstat.cputime.sum_exec_runtime;
- utime = bstat.cputime.utime;
- stime = bstat.cputime.stime;
-#ifdef CONFIG_SCHED_CORE
- forceidle_time = bstat.forceidle_sum;
-#endif
+ /* The root's cgrp->bstat is not otherwise used; reuse it here */
+ root_cgroup_cputime(&cgrp->bstat);
+ usage = cgrp->bstat.cputime.sum_exec_runtime;
+ utime = cgrp->bstat.cputime.utime;
+ stime = cgrp->bstat.cputime.stime;
}
do_div(usage, NSEC_PER_USEC);
do_div(utime, NSEC_PER_USEC);
do_div(stime, NSEC_PER_USEC);
-#ifdef CONFIG_SCHED_CORE
- do_div(forceidle_time, NSEC_PER_USEC);
-#endif
seq_printf(seq, "usage_usec %llu\n"
"user_usec %llu\n"
"system_usec %llu\n",
usage, utime, stime);
-#ifdef CONFIG_SCHED_CORE
- seq_printf(seq, "core_sched.force_idle_usec %llu\n", forceidle_time);
-#endif
+ cgroup_force_idle_show(seq, &cgrp->bstat);
}
/* Add bpf kfuncs for cgroup_rstat_updated() and cgroup_rstat_flush() */
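
Aside on the conversion used above: do_div() divides a u64 in place and returns the remainder, so each *_usec value comes from a pattern like this small kernel-style sketch (illustrative only).

#include <linux/math64.h>
#include <linux/time64.h>

static u64 example_ns_to_usec(u64 ns)
{
	/* do_div() writes the quotient back into 'ns' and returns the
	 * remainder, which is simply dropped here. */
	do_div(ns, NSEC_PER_USEC);
	return ns;
}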
diff --git a/kernel/configs/hardening.config b/kernel/configs/hardening.config
index 8a7ce7a6b3ab..3fabb8f55ef6 100644
--- a/kernel/configs/hardening.config
+++ b/kernel/configs/hardening.config
@@ -20,6 +20,7 @@ CONFIG_RANDOMIZE_MEMORY=y
# Randomize allocator freelists, harden metadata.
CONFIG_SLAB_FREELIST_RANDOM=y
CONFIG_SLAB_FREELIST_HARDENED=y
+CONFIG_SLAB_BUCKETS=y
CONFIG_SHUFFLE_PAGE_ALLOCATOR=y
CONFIG_RANDOM_KMALLOC_CACHES=y
diff --git a/kernel/cpu.c b/kernel/cpu.c
index 3d2bf1d50a0c..1209ddaec026 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -483,6 +483,8 @@ static int cpu_hotplug_disabled;
DEFINE_STATIC_PERCPU_RWSEM(cpu_hotplug_lock);
+static bool cpu_hotplug_offline_disabled __ro_after_init;
+
void cpus_read_lock(void)
{
percpu_down_read(&cpu_hotplug_lock);
@@ -542,6 +544,14 @@ static void lockdep_release_cpus_lock(void)
rwsem_release(&cpu_hotplug_lock.dep_map, _THIS_IP_);
}
+/* Declare that CPU offlining is not supported */
+void cpu_hotplug_disable_offlining(void)
+{
+ cpu_maps_update_begin();
+ cpu_hotplug_offline_disabled = true;
+ cpu_maps_update_done();
+}
+
/*
* Wait for currently running CPU hotplug operations to complete (if any) and
* disable future CPU hotplug (from sysfs). The 'cpu_add_remove_lock' protects
@@ -1471,7 +1481,7 @@ static int cpu_down_maps_locked(unsigned int cpu, enum cpuhp_state target)
* If the platform does not support hotplug, report it explicitly to
* differentiate it from a transient offlining failure.
*/
- if (cc_platform_has(CC_ATTR_HOTPLUG_DISABLED))
+ if (cpu_hotplug_offline_disabled)
return -EOPNOTSUPP;
if (cpu_hotplug_disabled)
return -EBUSY;
@@ -1894,8 +1904,8 @@ int freeze_secondary_cpus(int primary)
cpumask_clear(frozen_cpus);
pr_info("Disabling non-boot CPUs ...\n");
- for_each_online_cpu(cpu) {
- if (cpu == primary)
+ for (cpu = nr_cpu_ids - 1; cpu >= 0; cpu--) {
+ if (!cpu_online(cpu) || cpu == primary)
continue;
if (pm_wakeup_pending()) {
@@ -3072,6 +3082,9 @@ EXPORT_SYMBOL(__cpu_possible_mask);
struct cpumask __cpu_online_mask __read_mostly;
EXPORT_SYMBOL(__cpu_online_mask);
+struct cpumask __cpu_enabled_mask __read_mostly;
+EXPORT_SYMBOL(__cpu_enabled_mask);
+
struct cpumask __cpu_present_mask __read_mostly;
EXPORT_SYMBOL(__cpu_present_mask);
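
A sketch of how platform code might consume the new hook; the initcall below is hypothetical and assumes the declaration of cpu_hotplug_disable_offlining() is exposed via <linux/cpu.h>.

#include <linux/cpu.h>
#include <linux/init.h>

/* Hypothetical platform init: once CPUs are up they may never be
 * taken offline again, so cpu_down_maps_locked() returns -EOPNOTSUPP
 * instead of reporting a transient failure. */
static int __init example_forbid_cpu_offlining(void)
{
	cpu_hotplug_disable_offlining();
	return 0;
}
early_initcall(example_forbid_cpu_offlining);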
diff --git a/kernel/crash_reserve.c b/kernel/crash_reserve.c
index 5b2722a93a48..d3b4cd12bdd1 100644
--- a/kernel/crash_reserve.c
+++ b/kernel/crash_reserve.c
@@ -13,7 +13,6 @@
#include <linux/memory.h>
#include <linux/cpuhotplug.h>
#include <linux/memblock.h>
-#include <linux/kexec.h>
#include <linux/kmemleak.h>
#include <asm/page.h>
diff --git a/kernel/debug/kdb/kdb_bt.c b/kernel/debug/kdb/kdb_bt.c
index 10b454554ab0..137ba73f56fc 100644
--- a/kernel/debug/kdb/kdb_bt.c
+++ b/kernel/debug/kdb/kdb_bt.c
@@ -144,7 +144,7 @@ kdb_bt(int argc, const char **argv)
kdb_ps_suppressed();
/* Run the active tasks first */
for_each_online_cpu(cpu) {
- p = kdb_curr_task(cpu);
+ p = curr_task(cpu);
if (kdb_bt1(p, mask, btaprompt))
return 0;
}
diff --git a/kernel/debug/kdb/kdb_io.c b/kernel/debug/kdb/kdb_io.c
index 3131334d7a81..6a77f1c779c4 100644
--- a/kernel/debug/kdb/kdb_io.c
+++ b/kernel/debug/kdb/kdb_io.c
@@ -206,7 +206,7 @@ char kdb_getchar(void)
*/
static void kdb_position_cursor(char *prompt, char *buffer, char *cp)
{
- kdb_printf("\r%s", kdb_prompt_str);
+ kdb_printf("\r%s", prompt);
if (cp > buffer)
kdb_printf("%.*s", (int)(cp - buffer), buffer);
}
@@ -362,7 +362,7 @@ poll_again:
if (i >= dtab_count)
kdb_printf("...");
kdb_printf("\n");
- kdb_printf(kdb_prompt_str);
+ kdb_printf("%s", kdb_prompt_str);
kdb_printf("%s", buffer);
if (cp != lastchar)
kdb_position_cursor(kdb_prompt_str, buffer, cp);
@@ -453,7 +453,7 @@ char *kdb_getstr(char *buffer, size_t bufsize, const char *prompt)
{
if (prompt && kdb_prompt_str != prompt)
strscpy(kdb_prompt_str, prompt, CMD_BUFLEN);
- kdb_printf(kdb_prompt_str);
+ kdb_printf("%s", kdb_prompt_str);
kdb_nextline = 1; /* Prompt and input resets line number */
return kdb_read(buffer, bufsize);
}
diff --git a/kernel/debug/kdb/kdb_main.c b/kernel/debug/kdb/kdb_main.c
index 664bae55f2c9..f5f7d7fb5936 100644
--- a/kernel/debug/kdb/kdb_main.c
+++ b/kernel/debug/kdb/kdb_main.c
@@ -155,16 +155,6 @@ static char *__env[31] = {
static const int __nenv = ARRAY_SIZE(__env);
-struct task_struct *kdb_curr_task(int cpu)
-{
- struct task_struct *p = curr_task(cpu);
-#ifdef _TIF_MCA_INIT
- if ((task_thread_info(p)->flags & _TIF_MCA_INIT) && KDB_TSK(cpu))
- p = krp->p;
-#endif
- return p;
-}
-
/*
* Update the permissions flags (kdb_cmd_enabled) to match the
* current lockdown state.
@@ -1228,7 +1218,7 @@ static int kdb_local(kdb_reason_t reason, int error, struct pt_regs *regs,
char *cmdbuf;
int diag;
struct task_struct *kdb_current =
- kdb_curr_task(raw_smp_processor_id());
+ curr_task(raw_smp_processor_id());
KDB_DEBUG_STATE("kdb_local 1", reason);
@@ -2278,7 +2268,7 @@ void kdb_ps_suppressed(void)
unsigned long cpu;
const struct task_struct *p, *g;
for_each_online_cpu(cpu) {
- p = kdb_curr_task(cpu);
+ p = curr_task(cpu);
if (kdb_task_state(p, "-"))
++idle;
}
@@ -2314,7 +2304,7 @@ void kdb_ps1(const struct task_struct *p)
kdb_task_has_cpu(p), kdb_process_cpu(p),
kdb_task_state_char(p),
(void *)(&p->thread),
- p == kdb_curr_task(raw_smp_processor_id()) ? '*' : ' ',
+ p == curr_task(raw_smp_processor_id()) ? '*' : ' ',
p->comm);
if (kdb_task_has_cpu(p)) {
if (!KDB_TSK(cpu)) {
@@ -2350,7 +2340,7 @@ static int kdb_ps(int argc, const char **argv)
for_each_online_cpu(cpu) {
if (KDB_FLAG(CMD_INTERRUPT))
return 0;
- p = kdb_curr_task(cpu);
+ p = curr_task(cpu);
if (kdb_task_state(p, mask))
kdb_ps1(p);
}
diff --git a/kernel/debug/kdb/kdb_private.h b/kernel/debug/kdb/kdb_private.h
index 548fd4059bf9..d2520d72b1f5 100644
--- a/kernel/debug/kdb/kdb_private.h
+++ b/kernel/debug/kdb/kdb_private.h
@@ -210,8 +210,6 @@ extern void kdb_gdb_state_pass(char *buf);
#define KDB_TSK(cpu) kgdb_info[cpu].task
#define KDB_TSKREGS(cpu) kgdb_info[cpu].debuggerinfo
-extern struct task_struct *kdb_curr_task(int);
-
#define kdb_task_has_cpu(p) (task_curr(p))
#define GFP_KDB (in_dbg_master() ? GFP_ATOMIC : GFP_KERNEL)
diff --git a/kernel/delayacct.c b/kernel/delayacct.c
index e039b0f99a0b..dead51de8eb5 100644
--- a/kernel/delayacct.c
+++ b/kernel/delayacct.c
@@ -44,7 +44,7 @@ void delayacct_init(void)
}
#ifdef CONFIG_PROC_SYSCTL
-static int sysctl_delayacct(struct ctl_table *table, int write, void *buffer,
+static int sysctl_delayacct(const struct ctl_table *table, int write, void *buffer,
size_t *lenp, loff_t *ppos)
{
int state = delayacct_on;
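
The signature change here is part of a tree-wide constification of sysctl proc handlers; under the new convention a custom handler looks like the sketch below (names are invented, and it assumes proc_dointvec() is constified by the same series).

#include <linux/sysctl.h>

/* The table itself is read-only for the handler; only the data the
 * table points to may be written. */
static int example_sysctl_handler(const struct ctl_table *table, int write,
				  void *buffer, size_t *lenp, loff_t *ppos)
{
	return proc_dointvec(table, write, buffer, lenp, ppos);
}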
diff --git a/kernel/dma/direct.c b/kernel/dma/direct.c
index 4d543b1e9d57..4480a3cd92e0 100644
--- a/kernel/dma/direct.c
+++ b/kernel/dma/direct.c
@@ -404,9 +404,7 @@ void dma_direct_sync_sg_for_device(struct device *dev,
for_each_sg(sgl, sg, nents, i) {
phys_addr_t paddr = dma_to_phys(dev, sg_dma_address(sg));
- if (unlikely(is_swiotlb_buffer(dev, paddr)))
- swiotlb_sync_single_for_device(dev, paddr, sg->length,
- dir);
+ swiotlb_sync_single_for_device(dev, paddr, sg->length, dir);
if (!dev_is_dma_coherent(dev))
arch_sync_dma_for_device(paddr, sg->length,
@@ -430,9 +428,7 @@ void dma_direct_sync_sg_for_cpu(struct device *dev,
if (!dev_is_dma_coherent(dev))
arch_sync_dma_for_cpu(paddr, sg->length, dir);
- if (unlikely(is_swiotlb_buffer(dev, paddr)))
- swiotlb_sync_single_for_cpu(dev, paddr, sg->length,
- dir);
+ swiotlb_sync_single_for_cpu(dev, paddr, sg->length, dir);
if (dir == DMA_FROM_DEVICE)
arch_dma_mark_clean(paddr, sg->length);
@@ -640,7 +636,7 @@ size_t dma_direct_max_mapping_size(struct device *dev)
bool dma_direct_need_sync(struct device *dev, dma_addr_t dma_addr)
{
return !dev_is_dma_coherent(dev) ||
- is_swiotlb_buffer(dev, dma_to_phys(dev, dma_addr));
+ swiotlb_find_pool(dev, dma_to_phys(dev, dma_addr));
}
/**
diff --git a/kernel/dma/direct.h b/kernel/dma/direct.h
index 18d346118fe8..d2c0b7e632fc 100644
--- a/kernel/dma/direct.h
+++ b/kernel/dma/direct.h
@@ -58,8 +58,7 @@ static inline void dma_direct_sync_single_for_device(struct device *dev,
{
phys_addr_t paddr = dma_to_phys(dev, addr);
- if (unlikely(is_swiotlb_buffer(dev, paddr)))
- swiotlb_sync_single_for_device(dev, paddr, size, dir);
+ swiotlb_sync_single_for_device(dev, paddr, size, dir);
if (!dev_is_dma_coherent(dev))
arch_sync_dma_for_device(paddr, size, dir);
@@ -75,8 +74,7 @@ static inline void dma_direct_sync_single_for_cpu(struct device *dev,
arch_sync_dma_for_cpu_all();
}
- if (unlikely(is_swiotlb_buffer(dev, paddr)))
- swiotlb_sync_single_for_cpu(dev, paddr, size, dir);
+ swiotlb_sync_single_for_cpu(dev, paddr, size, dir);
if (dir == DMA_FROM_DEVICE)
arch_dma_mark_clean(paddr, size);
@@ -121,8 +119,7 @@ static inline void dma_direct_unmap_page(struct device *dev, dma_addr_t addr,
if (!(attrs & DMA_ATTR_SKIP_CPU_SYNC))
dma_direct_sync_single_for_cpu(dev, addr, size, dir);
- if (unlikely(is_swiotlb_buffer(dev, phys)))
- swiotlb_tbl_unmap_single(dev, phys, size, dir,
+ swiotlb_tbl_unmap_single(dev, phys, size, dir,
attrs | DMA_ATTR_SKIP_CPU_SYNC);
}
#endif /* _KERNEL_DMA_DIRECT_H */
diff --git a/kernel/dma/map_benchmark.c b/kernel/dma/map_benchmark.c
index 4950e0b622b1..cc19a3efea89 100644
--- a/kernel/dma/map_benchmark.c
+++ b/kernel/dma/map_benchmark.c
@@ -89,6 +89,22 @@ static int map_benchmark_thread(void *data)
atomic64_add(map_sq, &map->sum_sq_map);
atomic64_add(unmap_sq, &map->sum_sq_unmap);
atomic64_inc(&map->loops);
+
+ /*
+ * The test may run for a long time, so periodically check whether
+ * we need to schedule in order to avoid starving other tasks.
+ * Otherwise we may hang the kernel on a non-preemptible kernel when
+ * the number of test kthreads is >= the number of CPUs: the test
+ * kthreads would run endlessly on every CPU, because the thread
+ * responsible for notifying the kthreads to stop (in
+ * do_map_benchmark()) could never be scheduled.
+ *
+ * Note this may degrade the test concurrency since the test
+ * threads may need to share the CPU time with other load
+ * in the system. So it's recommended to run this benchmark
+ * on an idle system.
+ */
+ cond_resched();
}
out:
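
The same pattern applies to any long-running measurement kthread: yield periodically so the thread that will eventually issue kthread_stop() can run. A generic sketch, not the benchmark's actual loop:

#include <linux/kthread.h>
#include <linux/sched.h>

static int example_bench_thread(void *data)
{
	while (!kthread_should_stop()) {
		/* ... one iteration of the measurement ... */

		/* On non-preemptible kernels this is the only point at
		 * which other runnable tasks, including the stopper,
		 * get a chance to run. */
		cond_resched();
	}
	return 0;
}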
diff --git a/kernel/dma/mapping.c b/kernel/dma/mapping.c
index 81de84318ccc..b1c18058d55f 100644
--- a/kernel/dma/mapping.c
+++ b/kernel/dma/mapping.c
@@ -67,8 +67,8 @@ void dmam_free_coherent(struct device *dev, size_t size, void *vaddr,
{
struct dma_devres match_data = { size, vaddr, dma_handle };
- dma_free_coherent(dev, size, vaddr, dma_handle);
WARN_ON(devres_destroy(dev, dmam_release, dmam_match, &match_data));
+ dma_free_coherent(dev, size, vaddr, dma_handle);
}
EXPORT_SYMBOL(dmam_free_coherent);
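
For reference, the managed pair touched above is normally used as in this rough probe-path sketch; the reordering only matters when another thread can concurrently grab a devres-managed buffer at the same address.

#include <linux/dma-mapping.h>
#include <linux/device.h>

/* Hypothetical helper: allocate a device-lifetime DMA buffer and
 * release it by hand on an early-exit path. */
static int example_setup_dma(struct device *dev)
{
	dma_addr_t handle;
	void *cpu = dmam_alloc_coherent(dev, 4096, &handle, GFP_KERNEL);

	if (!cpu)
		return -ENOMEM;

	/* ... early-exit path decides the buffer is not needed ... */
	dmam_free_coherent(dev, 4096, cpu, handle);
	return 0;
}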
diff --git a/kernel/dma/swiotlb.c b/kernel/dma/swiotlb.c
index fe1ccb53596f..df68d29740a0 100644
--- a/kernel/dma/swiotlb.c
+++ b/kernel/dma/swiotlb.c
@@ -763,16 +763,18 @@ static void swiotlb_dyn_free(struct rcu_head *rcu)
}
/**
- * swiotlb_find_pool() - find the IO TLB pool for a physical address
+ * __swiotlb_find_pool() - find the IO TLB pool for a physical address
* @dev: Device which has mapped the DMA buffer.
* @paddr: Physical address within the DMA buffer.
*
* Find the IO TLB memory pool descriptor which contains the given physical
- * address, if any.
+ * address, if any. This function is for use only when the dev is known to
+ * be using swiotlb. Use swiotlb_find_pool() for the more general case
+ * when this condition is not met.
*
* Return: Memory pool which contains @paddr, or %NULL if none.
*/
-struct io_tlb_pool *swiotlb_find_pool(struct device *dev, phys_addr_t paddr)
+struct io_tlb_pool *__swiotlb_find_pool(struct device *dev, phys_addr_t paddr)
{
struct io_tlb_mem *mem = dev->dma_io_tlb_mem;
struct io_tlb_pool *pool;
@@ -855,9 +857,8 @@ static unsigned int swiotlb_align_offset(struct device *dev,
* Bounce: copy the swiotlb buffer from or back to the original dma location
*/
static void swiotlb_bounce(struct device *dev, phys_addr_t tlb_addr, size_t size,
- enum dma_data_direction dir)
+ enum dma_data_direction dir, struct io_tlb_pool *mem)
{
- struct io_tlb_pool *mem = swiotlb_find_pool(dev, tlb_addr);
int index = (tlb_addr - mem->start) >> IO_TLB_SHIFT;
phys_addr_t orig_addr = mem->slots[index].orig_addr;
size_t alloc_size = mem->slots[index].alloc_size;
@@ -1243,7 +1244,7 @@ found:
* that was made by swiotlb_dyn_alloc() on a third CPU (cf. multicopy
* atomicity).
*
- * See also the comment in is_swiotlb_buffer().
+ * See also the comment in swiotlb_find_pool().
*/
smp_mb();
@@ -1435,13 +1436,13 @@ phys_addr_t swiotlb_tbl_map_single(struct device *dev, phys_addr_t orig_addr,
* hardware behavior. Use of swiotlb is supposed to be transparent,
* i.e. swiotlb must not corrupt memory by clobbering unwritten bytes.
*/
- swiotlb_bounce(dev, tlb_addr, mapping_size, DMA_TO_DEVICE);
+ swiotlb_bounce(dev, tlb_addr, mapping_size, DMA_TO_DEVICE, pool);
return tlb_addr;
}
-static void swiotlb_release_slots(struct device *dev, phys_addr_t tlb_addr)
+static void swiotlb_release_slots(struct device *dev, phys_addr_t tlb_addr,
+ struct io_tlb_pool *mem)
{
- struct io_tlb_pool *mem = swiotlb_find_pool(dev, tlb_addr);
unsigned long flags;
unsigned int offset = swiotlb_align_offset(dev, 0, tlb_addr);
int index, nslots, aindex;
@@ -1499,17 +1500,16 @@ static void swiotlb_release_slots(struct device *dev, phys_addr_t tlb_addr)
* swiotlb_del_transient() - delete a transient memory pool
* @dev: Device which mapped the buffer.
* @tlb_addr: Physical address within a bounce buffer.
+ * @pool: Pointer to the transient memory pool to be checked and deleted.
*
* Check whether the address belongs to a transient SWIOTLB memory pool.
* If yes, then delete the pool.
*
* Return: %true if @tlb_addr belonged to a transient pool that was released.
*/
-static bool swiotlb_del_transient(struct device *dev, phys_addr_t tlb_addr)
+static bool swiotlb_del_transient(struct device *dev, phys_addr_t tlb_addr,
+ struct io_tlb_pool *pool)
{
- struct io_tlb_pool *pool;
-
- pool = swiotlb_find_pool(dev, tlb_addr);
if (!pool->transient)
return false;
@@ -1522,7 +1522,7 @@ static bool swiotlb_del_transient(struct device *dev, phys_addr_t tlb_addr)
#else /* !CONFIG_SWIOTLB_DYNAMIC */
static inline bool swiotlb_del_transient(struct device *dev,
- phys_addr_t tlb_addr)
+ phys_addr_t tlb_addr, struct io_tlb_pool *pool)
{
return false;
}
@@ -1532,36 +1532,39 @@ static inline bool swiotlb_del_transient(struct device *dev,
/*
* tlb_addr is the physical address of the bounce buffer to unmap.
*/
-void swiotlb_tbl_unmap_single(struct device *dev, phys_addr_t tlb_addr,
- size_t mapping_size, enum dma_data_direction dir,
- unsigned long attrs)
+void __swiotlb_tbl_unmap_single(struct device *dev, phys_addr_t tlb_addr,
+ size_t mapping_size, enum dma_data_direction dir,
+ unsigned long attrs, struct io_tlb_pool *pool)
{
/*
* First, sync the memory before unmapping the entry
*/
if (!(attrs & DMA_ATTR_SKIP_CPU_SYNC) &&
(dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL))
- swiotlb_bounce(dev, tlb_addr, mapping_size, DMA_FROM_DEVICE);
+ swiotlb_bounce(dev, tlb_addr, mapping_size,
+ DMA_FROM_DEVICE, pool);
- if (swiotlb_del_transient(dev, tlb_addr))
+ if (swiotlb_del_transient(dev, tlb_addr, pool))
return;
- swiotlb_release_slots(dev, tlb_addr);
+ swiotlb_release_slots(dev, tlb_addr, pool);
}
-void swiotlb_sync_single_for_device(struct device *dev, phys_addr_t tlb_addr,
- size_t size, enum dma_data_direction dir)
+void __swiotlb_sync_single_for_device(struct device *dev, phys_addr_t tlb_addr,
+ size_t size, enum dma_data_direction dir,
+ struct io_tlb_pool *pool)
{
if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL)
- swiotlb_bounce(dev, tlb_addr, size, DMA_TO_DEVICE);
+ swiotlb_bounce(dev, tlb_addr, size, DMA_TO_DEVICE, pool);
else
BUG_ON(dir != DMA_FROM_DEVICE);
}
-void swiotlb_sync_single_for_cpu(struct device *dev, phys_addr_t tlb_addr,
- size_t size, enum dma_data_direction dir)
+void __swiotlb_sync_single_for_cpu(struct device *dev, phys_addr_t tlb_addr,
+ size_t size, enum dma_data_direction dir,
+ struct io_tlb_pool *pool)
{
if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
- swiotlb_bounce(dev, tlb_addr, size, DMA_FROM_DEVICE);
+ swiotlb_bounce(dev, tlb_addr, size, DMA_FROM_DEVICE, pool);
else
BUG_ON(dir != DMA_TO_DEVICE);
}
@@ -1585,8 +1588,9 @@ dma_addr_t swiotlb_map(struct device *dev, phys_addr_t paddr, size_t size,
/* Ensure that the address returned is DMA'ble */
dma_addr = phys_to_dma_unencrypted(dev, swiotlb_addr);
if (unlikely(!dma_capable(dev, dma_addr, size, true))) {
- swiotlb_tbl_unmap_single(dev, swiotlb_addr, size, dir,
- attrs | DMA_ATTR_SKIP_CPU_SYNC);
+ __swiotlb_tbl_unmap_single(dev, swiotlb_addr, size, dir,
+ attrs | DMA_ATTR_SKIP_CPU_SYNC,
+ swiotlb_find_pool(dev, swiotlb_addr));
dev_WARN_ONCE(dev, 1,
"swiotlb addr %pad+%zu overflow (mask %llx, bus limit %llx).\n",
&dma_addr, size, *dev->dma_mask, dev->bus_dma_limit);
@@ -1764,7 +1768,7 @@ struct page *swiotlb_alloc(struct device *dev, size_t size)
if (unlikely(!PAGE_ALIGNED(tlb_addr))) {
dev_WARN_ONCE(dev, 1, "Cannot allocate pages from non page-aligned swiotlb addr 0x%pa.\n",
&tlb_addr);
- swiotlb_release_slots(dev, tlb_addr);
+ swiotlb_release_slots(dev, tlb_addr, pool);
return NULL;
}
@@ -1774,11 +1778,13 @@ struct page *swiotlb_alloc(struct device *dev, size_t size)
bool swiotlb_free(struct device *dev, struct page *page, size_t size)
{
phys_addr_t tlb_addr = page_to_phys(page);
+ struct io_tlb_pool *pool;
- if (!is_swiotlb_buffer(dev, tlb_addr))
+ pool = swiotlb_find_pool(dev, tlb_addr);
+ if (!pool)
return false;
- swiotlb_release_slots(dev, tlb_addr);
+ swiotlb_release_slots(dev, tlb_addr, pool);
return true;
}
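
Putting the API change together: callers now look the pool up once and hand it to the double-underscore variants, with swiotlb_find_pool() doubling as the "is this a bounce buffer?" test (it returns NULL for non-swiotlb addresses, as the swiotlb_free() and dma/direct.c hunks above rely on). A rough sketch of the calling convention:

#include <linux/swiotlb.h>
#include <linux/dma-direction.h>

static void example_sync_for_cpu(struct device *dev, phys_addr_t paddr,
				 size_t size, enum dma_data_direction dir)
{
	struct io_tlb_pool *pool = swiotlb_find_pool(dev, paddr);

	/* Only bounce-buffered addresses have a pool; everything else
	 * is left to the regular cache maintenance paths. */
	if (pool)
		__swiotlb_sync_single_for_cpu(dev, paddr, size, dir, pool);
}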
diff --git a/kernel/events/callchain.c b/kernel/events/callchain.c
index 1273be84392c..8a47e52a454f 100644
--- a/kernel/events/callchain.c
+++ b/kernel/events/callchain.c
@@ -11,6 +11,7 @@
#include <linux/perf_event.h>
#include <linux/slab.h>
#include <linux/sched/task_stack.h>
+#include <linux/uprobes.h>
#include "internal.h"
@@ -29,7 +30,7 @@ static inline size_t perf_callchain_entry__sizeof(void)
sysctl_perf_event_max_contexts_per_stack));
}
-static DEFINE_PER_CPU(int, callchain_recursion[PERF_NR_CONTEXTS]);
+static DEFINE_PER_CPU(u8, callchain_recursion[PERF_NR_CONTEXTS]);
static atomic_t nr_callchain_events;
static DEFINE_MUTEX(callchain_mutex);
static struct callchain_cpus_entries *callchain_cpus_entries;
@@ -176,13 +177,51 @@ put_callchain_entry(int rctx)
put_recursion_context(this_cpu_ptr(callchain_recursion), rctx);
}
+static void fixup_uretprobe_trampoline_entries(struct perf_callchain_entry *entry,
+ int start_entry_idx)
+{
+#ifdef CONFIG_UPROBES
+ struct uprobe_task *utask = current->utask;
+ struct return_instance *ri;
+ __u64 *cur_ip, *last_ip, tramp_addr;
+
+ if (likely(!utask || !utask->return_instances))
+ return;
+
+ cur_ip = &entry->ip[start_entry_idx];
+ last_ip = &entry->ip[entry->nr - 1];
+ ri = utask->return_instances;
+ tramp_addr = uprobe_get_trampoline_vaddr();
+
+ /*
+ * If there are pending uretprobes for the current thread, they are
+ * recorded in a list inside utask->return_instances; each such
+ * pending uretprobe replaces the traced user function's return
+ * address on the stack, so when a stack trace is captured, instead
+ * of the actual function's return address we'll see one or more
+ * uretprobe trampoline addresses, which are unhelpful and
+ * misleading to users.
+ * So here we walk the pending list of uretprobes and replace each
+ * encountered trampoline address with the actual return address.
+ */
+ while (ri && cur_ip <= last_ip) {
+ if (*cur_ip == tramp_addr) {
+ *cur_ip = ri->orig_ret_vaddr;
+ ri = ri->next;
+ }
+ cur_ip++;
+ }
+#endif
+}
+
struct perf_callchain_entry *
get_perf_callchain(struct pt_regs *regs, u32 init_nr, bool kernel, bool user,
u32 max_stack, bool crosstask, bool add_mark)
{
struct perf_callchain_entry *entry;
struct perf_callchain_entry_ctx ctx;
- int rctx;
+ int rctx, start_entry_idx;
entry = get_callchain_entry(&rctx);
if (!entry)
@@ -215,7 +254,9 @@ get_perf_callchain(struct pt_regs *regs, u32 init_nr, bool kernel, bool user,
if (add_mark)
perf_callchain_store_context(&ctx, PERF_CONTEXT_USER);
+ start_entry_idx = entry->nr;
perf_callchain_user(&ctx, regs);
+ fixup_uretprobe_trampoline_entries(entry, start_entry_idx);
}
}
@@ -229,7 +270,7 @@ exit_put:
* Used for sysctl_perf_event_max_stack and
* sysctl_perf_event_max_contexts_per_stack.
*/
-int perf_event_max_stack_handler(struct ctl_table *table, int write,
+int perf_event_max_stack_handler(const struct ctl_table *table, int write,
void *buffer, size_t *lenp, loff_t *ppos)
{
int *value = table->data;
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 8f908f077935..aa3450bdc227 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -450,7 +450,7 @@ static void update_perf_cpu_limits(void)
static bool perf_rotate_context(struct perf_cpu_pmu_context *cpc);
-int perf_event_max_sample_rate_handler(struct ctl_table *table, int write,
+int perf_event_max_sample_rate_handler(const struct ctl_table *table, int write,
void *buffer, size_t *lenp, loff_t *ppos)
{
int ret;
@@ -474,7 +474,7 @@ int perf_event_max_sample_rate_handler(struct ctl_table *table, int write,
int sysctl_perf_cpu_time_max_percent __read_mostly = DEFAULT_CPU_TIME_MAX_PERCENT;
-int perf_cpu_time_max_percent_handler(struct ctl_table *table, int write,
+int perf_cpu_time_max_percent_handler(const struct ctl_table *table, int write,
void *buffer, size_t *lenp, loff_t *ppos)
{
int ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
@@ -534,7 +534,7 @@ void perf_sample_event_took(u64 sample_len_ns)
__this_cpu_write(running_sample_length, running_len);
/*
- * Note: this will be biased artifically low until we have
+ * Note: this will be biased artificially low until we have
* seen NR_ACCUMULATED_SAMPLES. Doing it this way keeps us
* from having to maintain a count.
*/
@@ -596,10 +596,10 @@ static inline u64 perf_event_clock(struct perf_event *event)
*
* Event groups make things a little more complicated, but not terribly so. The
* rules for a group are that if the group leader is OFF the entire group is
- * OFF, irrespecive of what the group member states are. This results in
+ * OFF, irrespective of what the group member states are. This results in
* __perf_effective_state().
*
- * A futher ramification is that when a group leader flips between OFF and
+ * A further ramification is that when a group leader flips between OFF and
* !OFF, we need to update all group member times.
*
*
@@ -891,7 +891,7 @@ static int perf_cgroup_ensure_storage(struct perf_event *event,
int cpu, heap_size, ret = 0;
/*
- * Allow storage to have sufficent space for an iterator for each
+ * Allow storage to have sufficient space for an iterator for each
* possibly nested cgroup plus an iterator for events with no cgroup.
*/
for (heap_size = 1; css; css = css->parent)
@@ -2283,21 +2283,6 @@ event_sched_out(struct perf_event *event, struct perf_event_context *ctx)
state = PERF_EVENT_STATE_OFF;
}
- if (event->pending_sigtrap) {
- bool dec = true;
-
- event->pending_sigtrap = 0;
- if (state != PERF_EVENT_STATE_OFF &&
- !event->pending_work) {
- event->pending_work = 1;
- dec = false;
- WARN_ON_ONCE(!atomic_long_inc_not_zero(&event->refcount));
- task_work_add(current, &event->pending_task, TWA_RESUME);
- }
- if (dec)
- local_dec(&event->ctx->nr_pending);
- }
-
perf_event_set_state(event, state);
if (!is_software_event(event))
@@ -2466,7 +2451,7 @@ static void __perf_event_disable(struct perf_event *event,
* hold the top-level event's child_mutex, so any descendant that
* goes to exit will block in perf_event_exit_event().
*
- * When called from perf_pending_irq it's OK because event->ctx
+ * When called from perf_pending_disable it's OK because event->ctx
* is the current context on this CPU and preemption is disabled,
* hence we can't get into perf_event_task_sched_out for this context.
*/
@@ -2506,7 +2491,7 @@ EXPORT_SYMBOL_GPL(perf_event_disable);
void perf_event_disable_inatomic(struct perf_event *event)
{
event->pending_disable = 1;
- irq_work_queue(&event->pending_irq);
+ irq_work_queue(&event->pending_disable_irq);
}
#define MAX_INTERRUPTS (~0ULL)
@@ -3686,7 +3671,7 @@ void __perf_event_task_sched_out(struct task_struct *task,
perf_cgroup_switch(next);
}
-static bool perf_less_group_idx(const void *l, const void *r)
+static bool perf_less_group_idx(const void *l, const void *r, void __always_unused *args)
{
const struct perf_event *le = *(const struct perf_event **)l;
const struct perf_event *re = *(const struct perf_event **)r;
@@ -3694,20 +3679,21 @@ static bool perf_less_group_idx(const void *l, const void *r)
return le->group_index < re->group_index;
}
-static void swap_ptr(void *l, void *r)
+static void swap_ptr(void *l, void *r, void __always_unused *args)
{
void **lp = l, **rp = r;
swap(*lp, *rp);
}
+DEFINE_MIN_HEAP(struct perf_event *, perf_event_min_heap);
+
static const struct min_heap_callbacks perf_min_heap = {
- .elem_size = sizeof(struct perf_event *),
.less = perf_less_group_idx,
.swp = swap_ptr,
};
-static void __heap_add(struct min_heap *heap, struct perf_event *event)
+static void __heap_add(struct perf_event_min_heap *heap, struct perf_event *event)
{
struct perf_event **itrs = heap->data;
@@ -3741,7 +3727,7 @@ static noinline int visit_groups_merge(struct perf_event_context *ctx,
struct perf_cpu_context *cpuctx = NULL;
/* Space for per CPU and/or any CPU event iterators. */
struct perf_event *itrs[2];
- struct min_heap event_heap;
+ struct perf_event_min_heap event_heap;
struct perf_event **evt;
int ret;
@@ -3750,7 +3736,7 @@ static noinline int visit_groups_merge(struct perf_event_context *ctx,
if (!ctx->task) {
cpuctx = this_cpu_ptr(&perf_cpu_context);
- event_heap = (struct min_heap){
+ event_heap = (struct perf_event_min_heap){
.data = cpuctx->heap,
.nr = 0,
.size = cpuctx->heap_size,
@@ -3763,7 +3749,7 @@ static noinline int visit_groups_merge(struct perf_event_context *ctx,
css = &cpuctx->cgrp->css;
#endif
} else {
- event_heap = (struct min_heap){
+ event_heap = (struct perf_event_min_heap){
.data = itrs,
.nr = 0,
.size = ARRAY_SIZE(itrs),
@@ -3785,7 +3771,7 @@ static noinline int visit_groups_merge(struct perf_event_context *ctx,
perf_assert_pmu_disabled((*evt)->pmu_ctx->pmu);
}
- min_heapify_all(&event_heap, &perf_min_heap);
+ min_heapify_all(&event_heap, &perf_min_heap, NULL);
while (event_heap.nr) {
ret = func(*evt, data);
@@ -3794,9 +3780,9 @@ static noinline int visit_groups_merge(struct perf_event_context *ctx,
*evt = perf_event_groups_next(*evt, pmu);
if (*evt)
- min_heapify(&event_heap, 0, &perf_min_heap);
+ min_heap_sift_down(&event_heap, 0, &perf_min_heap, NULL);
else
- min_heap_pop(&event_heap, &perf_min_heap);
+ min_heap_pop(&event_heap, &perf_min_heap, NULL);
}
return 0;
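
The churn above follows the reworked min_heap API: heaps are typed with DEFINE_MIN_HEAP(), the element size is taken from the type rather than a callback field, and the comparison/swap callbacks gain an opaque args pointer. A minimal sketch outside perf, assuming <linux/min_heap.h> provides these helpers with the signatures used above:

#include <linux/min_heap.h>
#include <linux/minmax.h>

DEFINE_MIN_HEAP(int, example_int_heap);

static bool example_less(const void *l, const void *r, void __always_unused *args)
{
	return *(const int *)l < *(const int *)r;
}

static void example_swp(void *l, void *r, void __always_unused *args)
{
	swap(*(int *)l, *(int *)r);
}

static const struct min_heap_callbacks example_cb = {
	.less = example_less,
	.swp  = example_swp,
};

static int example_heap_min(int *vals, int nr)
{
	struct example_int_heap heap = {
		.data = vals,
		.nr   = nr,
		.size = nr,
	};

	min_heapify_all(&heap, &example_cb, NULL);
	return vals[0];	/* the smallest element now sits at the root */
}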
@@ -5206,9 +5192,35 @@ static bool exclusive_event_installable(struct perf_event *event,
static void perf_addr_filters_splice(struct perf_event *event,
struct list_head *head);
+static void perf_pending_task_sync(struct perf_event *event)
+{
+ struct callback_head *head = &event->pending_task;
+
+ if (!event->pending_work)
+ return;
+ /*
+ * If the task is queued to the current task's queue, we
+ * obviously can't wait for it to complete. Simply cancel it.
+ */
+ if (task_work_cancel(current, head)) {
+ event->pending_work = 0;
+ local_dec(&event->ctx->nr_pending);
+ return;
+ }
+
+ /*
+ * All accesses related to the event are within the same RCU section in
+ * perf_pending_task(). The RCU grace period before the event is freed
+ * will make sure all those accesses are complete by then.
+ */
+ rcuwait_wait_event(&event->pending_work_wait, !event->pending_work, TASK_UNINTERRUPTIBLE);
+}
+
static void _free_event(struct perf_event *event)
{
irq_work_sync(&event->pending_irq);
+ irq_work_sync(&event->pending_disable_irq);
+ perf_pending_task_sync(event);
unaccount_event(event);
@@ -6509,6 +6521,8 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma)
return -EINVAL;
nr_pages = vma_size / PAGE_SIZE;
+ if (nr_pages > INT_MAX)
+ return -ENOMEM;
mutex_lock(&event->mmap_mutex);
ret = -EINVAL;
@@ -6750,7 +6764,7 @@ static void perf_sigtrap(struct perf_event *event)
/*
* Deliver the pending work in-event-context or follow the context.
*/
-static void __perf_pending_irq(struct perf_event *event)
+static void __perf_pending_disable(struct perf_event *event)
{
int cpu = READ_ONCE(event->oncpu);
@@ -6765,11 +6779,6 @@ static void __perf_pending_irq(struct perf_event *event)
* Yay, we hit home and are in the context of the event.
*/
if (cpu == smp_processor_id()) {
- if (event->pending_sigtrap) {
- event->pending_sigtrap = 0;
- perf_sigtrap(event);
- local_dec(&event->ctx->nr_pending);
- }
if (event->pending_disable) {
event->pending_disable = 0;
perf_event_disable_local(event);
@@ -6793,11 +6802,26 @@ static void __perf_pending_irq(struct perf_event *event)
* irq_work_queue(); // FAILS
*
* irq_work_run()
- * perf_pending_irq()
+ * perf_pending_disable()
*
* But the event runs on CPU-B and wants disabling there.
*/
- irq_work_queue_on(&event->pending_irq, cpu);
+ irq_work_queue_on(&event->pending_disable_irq, cpu);
+}
+
+static void perf_pending_disable(struct irq_work *entry)
+{
+ struct perf_event *event = container_of(entry, struct perf_event, pending_disable_irq);
+ int rctx;
+
+ /*
+ * If we 'fail' here, that's OK, it means recursion is already disabled
+ * and we won't recurse 'further'.
+ */
+ rctx = perf_swevent_get_recursion_context();
+ __perf_pending_disable(event);
+ if (rctx >= 0)
+ perf_swevent_put_recursion_context(rctx);
}
static void perf_pending_irq(struct irq_work *entry)
@@ -6820,8 +6844,6 @@ static void perf_pending_irq(struct irq_work *entry)
perf_event_wakeup(event);
}
- __perf_pending_irq(event);
-
if (rctx >= 0)
perf_swevent_put_recursion_context(rctx);
}
@@ -6832,23 +6854,27 @@ static void perf_pending_task(struct callback_head *head)
int rctx;
/*
+ * All accesses to the event must belong to the same implicit RCU read-side
+ * critical section as the ->pending_work reset. See comment in
+ * perf_pending_task_sync().
+ */
+ rcu_read_lock();
+ /*
* If we 'fail' here, that's OK, it means recursion is already disabled
* and we won't recurse 'further'.
*/
- preempt_disable_notrace();
rctx = perf_swevent_get_recursion_context();
if (event->pending_work) {
event->pending_work = 0;
perf_sigtrap(event);
local_dec(&event->ctx->nr_pending);
+ rcuwait_wake_up(&event->pending_work_wait);
}
+ rcu_read_unlock();
if (rctx >= 0)
perf_swevent_put_recursion_context(rctx);
- preempt_enable_notrace();
-
- put_event(event);
}
#ifdef CONFIG_GUEST_PERF_EVENTS
@@ -7609,7 +7635,7 @@ again:
pte = ptep_get_lockless(ptep);
if (pte_present(pte))
- size = pte_leaf_size(pte);
+ size = __pte_leaf_size(pmd, pte);
pte_unmap(ptep);
#endif /* CONFIG_HAVE_GUP_FAST */
@@ -9302,21 +9328,19 @@ static void perf_event_bpf_emit_ksymbols(struct bpf_prog *prog,
bool unregister = type == PERF_BPF_EVENT_PROG_UNLOAD;
int i;
- if (prog->aux->func_cnt == 0) {
- perf_event_ksymbol(PERF_RECORD_KSYMBOL_TYPE_BPF,
- (u64)(unsigned long)prog->bpf_func,
- prog->jited_len, unregister,
- prog->aux->ksym.name);
- } else {
- for (i = 0; i < prog->aux->func_cnt; i++) {
- struct bpf_prog *subprog = prog->aux->func[i];
-
- perf_event_ksymbol(
- PERF_RECORD_KSYMBOL_TYPE_BPF,
- (u64)(unsigned long)subprog->bpf_func,
- subprog->jited_len, unregister,
- subprog->aux->ksym.name);
- }
+ perf_event_ksymbol(PERF_RECORD_KSYMBOL_TYPE_BPF,
+ (u64)(unsigned long)prog->bpf_func,
+ prog->jited_len, unregister,
+ prog->aux->ksym.name);
+
+ for (i = 1; i < prog->aux->func_cnt; i++) {
+ struct bpf_prog *subprog = prog->aux->func[i];
+
+ perf_event_ksymbol(
+ PERF_RECORD_KSYMBOL_TYPE_BPF,
+ (u64)(unsigned long)subprog->bpf_func,
+ subprog->jited_len, unregister,
+ subprog->aux->ksym.name);
}
}
@@ -9706,16 +9730,26 @@ static int __perf_event_overflow(struct perf_event *event,
*/
bool valid_sample = sample_is_allowed(event, regs);
unsigned int pending_id = 1;
+ enum task_work_notify_mode notify_mode;
if (regs)
pending_id = hash32_ptr((void *)instruction_pointer(regs)) ?: 1;
- if (!event->pending_sigtrap) {
- event->pending_sigtrap = pending_id;
+
+ notify_mode = in_nmi() ? TWA_NMI_CURRENT : TWA_RESUME;
+
+ if (!event->pending_work &&
+ !task_work_add(current, &event->pending_task, notify_mode)) {
+ event->pending_work = pending_id;
local_inc(&event->ctx->nr_pending);
+
+ event->pending_addr = 0;
+ if (valid_sample && (data->sample_flags & PERF_SAMPLE_ADDR))
+ event->pending_addr = data->addr;
+
} else if (event->attr.exclude_kernel && valid_sample) {
/*
* Should not be able to return to user space without
- * consuming pending_sigtrap; with exceptions:
+ * consuming pending_work; with exceptions:
*
* 1. Where !exclude_kernel, events can overflow again
* in the kernel without returning to user space.
@@ -9725,13 +9759,8 @@ static int __perf_event_overflow(struct perf_event *event,
* To approximate progress (with false negatives),
* check 32-bit hash of the current IP.
*/
- WARN_ON_ONCE(event->pending_sigtrap != pending_id);
+ WARN_ON_ONCE(event->pending_work != pending_id);
}
-
- event->pending_addr = 0;
- if (valid_sample && (data->sample_flags & PERF_SAMPLE_ADDR))
- event->pending_addr = data->addr;
- irq_work_queue(&event->pending_irq);
}
READ_ONCE(event->overflow_handler)(event, data, regs);
@@ -9759,11 +9788,7 @@ struct swevent_htable {
struct swevent_hlist *swevent_hlist;
struct mutex hlist_mutex;
int hlist_refcount;
-
- /* Recursion avoidance in each contexts */
- int recursion[PERF_NR_CONTEXTS];
};
-
static DEFINE_PER_CPU(struct swevent_htable, swevent_htable);
/*
@@ -9961,17 +9986,13 @@ DEFINE_PER_CPU(struct pt_regs, __perf_regs[4]);
int perf_swevent_get_recursion_context(void)
{
- struct swevent_htable *swhash = this_cpu_ptr(&swevent_htable);
-
- return get_recursion_context(swhash->recursion);
+ return get_recursion_context(current->perf_recursion);
}
EXPORT_SYMBOL_GPL(perf_swevent_get_recursion_context);
void perf_swevent_put_recursion_context(int rctx)
{
- struct swevent_htable *swhash = this_cpu_ptr(&swevent_htable);
-
- put_recursion_context(swhash->recursion, rctx);
+ put_recursion_context(current->perf_recursion, rctx);
}
void ___perf_sw_event(u32 event_id, u64 nr, struct pt_regs *regs, u64 addr)
@@ -11961,7 +11982,9 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu,
init_waitqueue_head(&event->waitq);
init_irq_work(&event->pending_irq, perf_pending_irq);
+ event->pending_disable_irq = IRQ_WORK_INIT_HARD(perf_pending_disable);
init_task_work(&event->pending_task, perf_pending_task);
+ rcuwait_init(&event->pending_work_wait);
mutex_init(&event->mmap_mutex);
raw_spin_lock_init(&event->addr_filters.lock);
@@ -13637,6 +13660,7 @@ int perf_event_init_task(struct task_struct *child, u64 clone_flags)
{
int ret;
+ memset(child->perf_recursion, 0, sizeof(child->perf_recursion));
child->perf_event_ctxp = NULL;
mutex_init(&child->perf_event_mutex);
INIT_LIST_HEAD(&child->perf_event_list);
diff --git a/kernel/events/internal.h b/kernel/events/internal.h
index 5150d5f84c03..451514442a1b 100644
--- a/kernel/events/internal.h
+++ b/kernel/events/internal.h
@@ -128,7 +128,7 @@ static inline unsigned long perf_data_size(struct perf_buffer *rb)
static inline unsigned long perf_aux_size(struct perf_buffer *rb)
{
- return rb->aux_nr_pages << PAGE_SHIFT;
+ return (unsigned long)rb->aux_nr_pages << PAGE_SHIFT;
}
#define __DEFINE_OUTPUT_COPY_BODY(advance_buf, memcpy_func, ...) \
@@ -208,7 +208,7 @@ arch_perf_out_copy_user(void *dst, const void *src, unsigned long n)
DEFINE_OUTPUT_COPY(__output_copy_user, arch_perf_out_copy_user)
-static inline int get_recursion_context(int *recursion)
+static inline int get_recursion_context(u8 *recursion)
{
unsigned char rctx = interrupt_context_level();
@@ -221,7 +221,7 @@ static inline int get_recursion_context(int *recursion)
return rctx;
}
-static inline void put_recursion_context(int *recursion, int rctx)
+static inline void put_recursion_context(u8 *recursion, unsigned char rctx)
{
barrier();
recursion[rctx]--;
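
With the recursion counters now living in task_struct as one u8 per context level, the callers above all follow the same bracket pattern; a condensed sketch of that pattern (the event-processing body is elided):

#include <linux/perf_event.h>

static void example_process_sw_event(void)
{
	int rctx = perf_swevent_get_recursion_context();

	/* A negative return means we are already nested inside event
	 * processing for this context level; bail out rather than
	 * recurse. */
	if (rctx < 0)
		return;

	/* ... emit or process the software event ... */

	perf_swevent_put_recursion_context(rctx);
}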
diff --git a/kernel/events/ring_buffer.c b/kernel/events/ring_buffer.c
index 4013408ce012..8cadf97bc290 100644
--- a/kernel/events/ring_buffer.c
+++ b/kernel/events/ring_buffer.c
@@ -682,13 +682,18 @@ int rb_alloc_aux(struct perf_buffer *rb, struct perf_event *event,
if (!has_aux(event))
return -EOPNOTSUPP;
+ if (nr_pages <= 0)
+ return -EINVAL;
+
if (!overwrite) {
/*
* Watermark defaults to half the buffer, and so does the
* max_order, to aid PMU drivers in double buffering.
*/
if (!watermark)
- watermark = nr_pages << (PAGE_SHIFT - 1);
+ watermark = min_t(unsigned long,
+ U32_MAX,
+ (unsigned long)nr_pages << (PAGE_SHIFT - 1));
/*
* Use aux_watermark as the basis for chunking to
diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c
index 2c83ba776fc7..73cc47708679 100644
--- a/kernel/events/uprobes.c
+++ b/kernel/events/uprobes.c
@@ -181,7 +181,7 @@ static int __replace_page(struct vm_area_struct *vma, unsigned long addr,
if (new_page) {
folio_get(new_folio);
- folio_add_new_anon_rmap(new_folio, vma, addr);
+ folio_add_new_anon_rmap(new_folio, vma, addr, RMAP_EXCLUSIVE);
folio_add_lru_vma(new_folio, vma);
} else
/* no new page, just dec_mm_counter for old_page */
@@ -1474,11 +1474,20 @@ static int xol_add_vma(struct mm_struct *mm, struct xol_area *area)
return ret;
}
+void * __weak arch_uprobe_trampoline(unsigned long *psize)
+{
+ static uprobe_opcode_t insn = UPROBE_SWBP_INSN;
+
+ *psize = UPROBE_SWBP_INSN_SIZE;
+ return &insn;
+}
+
static struct xol_area *__create_xol_area(unsigned long vaddr)
{
struct mm_struct *mm = current->mm;
- uprobe_opcode_t insn = UPROBE_SWBP_INSN;
+ unsigned long insns_size;
struct xol_area *area;
+ void *insns;
area = kmalloc(sizeof(*area), GFP_KERNEL);
if (unlikely(!area))
@@ -1502,7 +1511,8 @@ static struct xol_area *__create_xol_area(unsigned long vaddr)
/* Reserve the 1st slot for get_trampoline_vaddr() */
set_bit(0, area->bitmap);
atomic_set(&area->slot_count, 1);
- arch_uprobe_copy_ixol(area->pages[0], 0, &insn, UPROBE_SWBP_INSN_SIZE);
+ insns = arch_uprobe_trampoline(&insns_size);
+ arch_uprobe_copy_ixol(area->pages[0], 0, insns, insns_size);
if (!xol_add_vma(mm, area))
return area;
@@ -1827,7 +1837,7 @@ void uprobe_copy_process(struct task_struct *t, unsigned long flags)
*
* Returns -1 in case the xol_area is not allocated.
*/
-static unsigned long get_trampoline_vaddr(void)
+unsigned long uprobe_get_trampoline_vaddr(void)
{
struct xol_area *area;
unsigned long trampoline_vaddr = -1;
@@ -1878,7 +1888,7 @@ static void prepare_uretprobe(struct uprobe *uprobe, struct pt_regs *regs)
if (!ri)
return;
- trampoline_vaddr = get_trampoline_vaddr();
+ trampoline_vaddr = uprobe_get_trampoline_vaddr();
orig_ret_vaddr = arch_uretprobe_hijack_return_addr(trampoline_vaddr, regs);
if (orig_ret_vaddr == -1)
goto fail;
@@ -2123,7 +2133,7 @@ static struct return_instance *find_next_ret_chain(struct return_instance *ri)
return ri;
}
-static void handle_trampoline(struct pt_regs *regs)
+void uprobe_handle_trampoline(struct pt_regs *regs)
{
struct uprobe_task *utask;
struct return_instance *ri, *next;
@@ -2149,6 +2159,15 @@ static void handle_trampoline(struct pt_regs *regs)
instruction_pointer_set(regs, ri->orig_ret_vaddr);
do {
+ /* Pop the current instance from the stack of pending return
+ * instances, as it is no longer pending: we just fixed up the
+ * original instruction pointer in regs and are about to call the
+ * handlers. This allows fixup_uretprobe_trampoline_entries() to
+ * properly fix up stack traces captured from uretprobe handlers,
+ * in which pending trampoline addresses on the stack are replaced
+ * with the correct original return addresses.
+ */
+ utask->return_instances = ri->next;
if (valid)
handle_uretprobe_chain(ri, regs);
ri = free_ret_instance(ri);
@@ -2187,8 +2206,8 @@ static void handle_swbp(struct pt_regs *regs)
int is_swbp;
bp_vaddr = uprobe_get_swbp_addr(regs);
- if (bp_vaddr == get_trampoline_vaddr())
- return handle_trampoline(regs);
+ if (bp_vaddr == uprobe_get_trampoline_vaddr())
+ return uprobe_handle_trampoline(regs);
uprobe = find_active_uprobe(bp_vaddr, &is_swbp);
if (!uprobe) {
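
The __weak default added above hands back a single UPROBE_SWBP_INSN; an architecture wanting a larger return trampoline could override it roughly as below. This is a hypothetical override (the instruction bytes are placeholders) and assumes the prototype is exposed through <linux/uprobes.h>.

#include <linux/uprobes.h>

void *arch_uprobe_trampoline(unsigned long *psize)
{
	/* Placeholder byte only; a real architecture would emit its
	 * actual trampoline instruction sequence here. */
	static const u8 example_tramp[] = { 0xcc };

	*psize = sizeof(example_tramp);
	return (void *)example_tramp;
}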
diff --git a/kernel/exit.c b/kernel/exit.c
index f95a2c1338a8..7430852a8571 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -277,7 +277,6 @@ repeat:
}
write_unlock_irq(&tasklist_lock);
- seccomp_filter_release(p);
proc_flush_pid(thread_pid);
put_pid(thread_pid);
release_thread(p);
@@ -439,14 +438,46 @@ static void coredump_task_exit(struct task_struct *tsk)
}
#ifdef CONFIG_MEMCG
+/* Drops tasklist_lock on success */
+static bool __try_to_set_owner(struct task_struct *tsk, struct mm_struct *mm)
+{
+ bool ret = false;
+
+ task_lock(tsk);
+ if (likely(tsk->mm == mm)) {
+ /* tsk can't pass exit_mm/exec_mmap and exit */
+ read_unlock(&tasklist_lock);
+ WRITE_ONCE(mm->owner, tsk);
+ lru_gen_migrate_mm(mm);
+ ret = true;
+ }
+ task_unlock(tsk);
+ return ret;
+}
+
+static bool try_to_set_owner(struct task_struct *g, struct mm_struct *mm)
+{
+ struct task_struct *t;
+
+ for_each_thread(g, t) {
+ struct mm_struct *t_mm = READ_ONCE(t->mm);
+ if (t_mm == mm) {
+ if (__try_to_set_owner(t, mm))
+ return true;
+ } else if (t_mm)
+ break;
+ }
+
+ return false;
+}
+
/*
* A task is exiting. If it owned this mm, find a new owner for the mm.
*/
void mm_update_next_owner(struct mm_struct *mm)
{
- struct task_struct *c, *g, *p = current;
+ struct task_struct *g, *p = current;
-retry:
/*
* If the exiting or execing task is not the owner, it's
* someone else's problem.
@@ -467,31 +498,27 @@ retry:
/*
* Search in the children
*/
- list_for_each_entry(c, &p->children, sibling) {
- if (c->mm == mm)
- goto assign_new_owner;
+ list_for_each_entry(g, &p->children, sibling) {
+ if (try_to_set_owner(g, mm))
+ goto ret;
}
-
/*
* Search in the siblings
*/
- list_for_each_entry(c, &p->real_parent->children, sibling) {
- if (c->mm == mm)
- goto assign_new_owner;
+ list_for_each_entry(g, &p->real_parent->children, sibling) {
+ if (try_to_set_owner(g, mm))
+ goto ret;
}
-
/*
* Search through everything else, we should not get here often.
*/
for_each_process(g) {
+ if (atomic_read(&mm->mm_users) <= 1)
+ break;
if (g->flags & PF_KTHREAD)
continue;
- for_each_thread(g, c) {
- if (c->mm == mm)
- goto assign_new_owner;
- if (c->mm)
- break;
- }
+ if (try_to_set_owner(g, mm))
+ goto ret;
}
read_unlock(&tasklist_lock);
/*
@@ -500,30 +527,9 @@ retry:
* ptrace or page migration (get_task_mm()). Mark owner as NULL.
*/
WRITE_ONCE(mm->owner, NULL);
+ ret:
return;
-assign_new_owner:
- BUG_ON(c == p);
- get_task_struct(c);
- /*
- * The task_lock protects c->mm from changing.
- * We always want mm->owner->mm == mm
- */
- task_lock(c);
- /*
- * Delay read_unlock() till we have the task_lock()
- * to ensure that c does not slip away underneath us
- */
- read_unlock(&tasklist_lock);
- if (c->mm != mm) {
- task_unlock(c);
- put_task_struct(c);
- goto retry;
- }
- WRITE_ONCE(mm->owner, c);
- lru_gen_migrate_mm(mm);
- task_unlock(c);
- put_task_struct(c);
}
#endif /* CONFIG_MEMCG */
@@ -832,6 +838,8 @@ void __noreturn do_exit(long code)
io_uring_files_cancel();
exit_signals(tsk); /* sets PF_EXITING */
+ seccomp_filter_release(tsk);
+
acct_update_integrals(tsk);
group_dead = atomic_dec_and_test(&tsk->signal->live);
if (group_dead) {
diff --git a/kernel/fork.c b/kernel/fork.c
index 99076dbe27d8..cc760491f201 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -44,6 +44,7 @@
#include <linux/fs.h>
#include <linux/mm.h>
#include <linux/mm_inline.h>
+#include <linux/memblock.h>
#include <linux/nsproxy.h>
#include <linux/capability.h>
#include <linux/cpu.h>
@@ -115,6 +116,8 @@
#define CREATE_TRACE_POINTS
#include <trace/events/task.h>
+#include <kunit/visibility.h>
+
/*
* Minimum number of threads to boot the kernel
*/
@@ -205,9 +208,10 @@ static bool try_release_thread_stack_to_cache(struct vm_struct *vm)
unsigned int i;
for (i = 0; i < NR_CACHED_STACKS; i++) {
- if (this_cpu_cmpxchg(cached_stacks[i], NULL, vm) != NULL)
- continue;
- return true;
+ struct vm_struct *tmp = NULL;
+
+ if (this_cpu_try_cmpxchg(cached_stacks[i], &tmp, vm))
+ return true;
}
return false;
}
@@ -616,12 +620,6 @@ static void dup_mm_exe_file(struct mm_struct *mm, struct mm_struct *oldmm)
exe_file = get_mm_exe_file(oldmm);
RCU_INIT_POINTER(mm->exe_file, exe_file);
- /*
- * We depend on the oldmm having properly denied write access to the
- * exe_file already.
- */
- if (exe_file && deny_write_access(exe_file))
- pr_warn_once("deny_write_access() failed in %s\n", __func__);
}
#ifdef CONFIG_MMU
@@ -996,10 +994,10 @@ void __init __weak arch_task_cache_init(void) { }
/*
* set_max_threads
*/
-static void set_max_threads(unsigned int max_threads_suggested)
+static void __init set_max_threads(unsigned int max_threads_suggested)
{
u64 threads;
- unsigned long nr_pages = totalram_pages();
+ unsigned long nr_pages = PHYS_PFN(memblock_phys_mem_size() - memblock_reserved_size());
/*
* The number of threads shall be limited such that the thread
@@ -1022,7 +1020,7 @@ static void set_max_threads(unsigned int max_threads_suggested)
int arch_task_struct_size __read_mostly;
#endif
-static void task_struct_whitelist(unsigned long *offset, unsigned long *size)
+static void __init task_struct_whitelist(unsigned long *offset, unsigned long *size)
{
/* Fetch thread_struct whitelist for the architecture. */
arch_thread_struct_whitelist(offset, size);
@@ -1334,6 +1332,7 @@ struct mm_struct *mm_alloc(void)
memset(mm, 0, sizeof(*mm));
return mm_init(mm, current, current_user_ns());
}
+EXPORT_SYMBOL_IF_KUNIT(mm_alloc);
static inline void __mmput(struct mm_struct *mm)
{
@@ -1412,20 +1411,11 @@ int set_mm_exe_file(struct mm_struct *mm, struct file *new_exe_file)
*/
old_exe_file = rcu_dereference_raw(mm->exe_file);
- if (new_exe_file) {
- /*
- * We expect the caller (i.e., sys_execve) to already denied
- * write access, so this is unlikely to fail.
- */
- if (unlikely(deny_write_access(new_exe_file)))
- return -EACCES;
+ if (new_exe_file)
get_file(new_exe_file);
- }
rcu_assign_pointer(mm->exe_file, new_exe_file);
- if (old_exe_file) {
- allow_write_access(old_exe_file);
+ if (old_exe_file)
fput(old_exe_file);
- }
return 0;
}
@@ -1464,9 +1454,6 @@ int replace_mm_exe_file(struct mm_struct *mm, struct file *new_exe_file)
return ret;
}
- ret = deny_write_access(new_exe_file);
- if (ret)
- return -EACCES;
get_file(new_exe_file);
/* set the new file */
@@ -1475,10 +1462,8 @@ int replace_mm_exe_file(struct mm_struct *mm, struct file *new_exe_file)
rcu_assign_pointer(mm->exe_file, new_exe_file);
mmap_write_unlock(mm);
- if (old_exe_file) {
- allow_write_access(old_exe_file);
+ if (old_exe_file)
fput(old_exe_file);
- }
return 0;
}
@@ -1536,14 +1521,13 @@ struct mm_struct *get_task_mm(struct task_struct *task)
{
struct mm_struct *mm;
+ if (task->flags & PF_KTHREAD)
+ return NULL;
+
task_lock(task);
mm = task->mm;
- if (mm) {
- if (task->flags & PF_KTHREAD)
- mm = NULL;
- else
- mmget(mm);
- }
+ if (mm)
+ mmget(mm);
task_unlock(task);
return mm;
}
@@ -2941,8 +2925,6 @@ SYSCALL_DEFINE5(clone, unsigned long, clone_flags, unsigned long, newsp,
}
#endif
-#ifdef __ARCH_WANT_SYS_CLONE3
-
noinline static int copy_clone_args_from_user(struct kernel_clone_args *kargs,
struct clone_args __user *uargs,
size_t usize)
@@ -3086,6 +3068,11 @@ SYSCALL_DEFINE2(clone3, struct clone_args __user *, uargs, size_t, size)
struct kernel_clone_args kargs;
pid_t set_tid[MAX_PID_NS_LEVEL];
+#ifdef __ARCH_BROKEN_SYS_CLONE3
+#warning clone3() entry point is missing, please fix
+ return -ENOSYS;
+#endif
+
kargs.set_tid = set_tid;
err = copy_clone_args_from_user(&kargs, uargs, size);
@@ -3097,7 +3084,6 @@ SYSCALL_DEFINE2(clone3, struct clone_args __user *, uargs, size_t, size)
return kernel_clone(&kargs);
}
-#endif
void walk_process_tree(struct task_struct *top, proc_visitor visitor, void *data)
{
@@ -3418,7 +3404,7 @@ int unshare_files(void)
return 0;
}
-int sysctl_max_threads(struct ctl_table *table, int write,
+int sysctl_max_threads(const struct ctl_table *table, int write,
void *buffer, size_t *lenp, loff_t *ppos)
{
struct ctl_table t;
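
On the cached-stack change near the top of this file's hunks: this_cpu_try_cmpxchg() returns a bool and refreshes the expected value through its pointer on failure, which reads more directly than comparing the old value by hand. A tiny standalone sketch of the same idiom with a generic per-CPU slot (names invented):

#include <linux/percpu.h>

static DEFINE_PER_CPU(void *, example_slot);

/* Stash @p in this CPU's slot only if the slot is currently empty. */
static bool example_stash(void *p)
{
	void *expected = NULL;

	return this_cpu_try_cmpxchg(example_slot, &expected, p);
}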
diff --git a/kernel/hung_task.c b/kernel/hung_task.c
index 1d92016b0b3c..959d99583d1c 100644
--- a/kernel/hung_task.c
+++ b/kernel/hung_task.c
@@ -127,7 +127,7 @@ static void check_hung_task(struct task_struct *t, unsigned long timeout)
* Ok, the task did not get scheduled for more than 2 minutes,
* complain:
*/
- if (sysctl_hung_task_warnings) {
+ if (sysctl_hung_task_warnings || hung_task_call_panic) {
if (sysctl_hung_task_warnings > 0)
sysctl_hung_task_warnings--;
pr_err("INFO: task %s:%d blocked for more than %ld seconds.\n",
@@ -239,7 +239,7 @@ static long hung_timeout_jiffies(unsigned long last_checked,
/*
* Process updating of timeout sysctl
*/
-static int proc_dohung_task_timeout_secs(struct ctl_table *table, int write,
+static int proc_dohung_task_timeout_secs(const struct ctl_table *table, int write,
void *buffer,
size_t *lenp, loff_t *ppos)
{
diff --git a/kernel/irq/debugfs.c b/kernel/irq/debugfs.c
index aae0402507ed..c6ffb97966be 100644
--- a/kernel/irq/debugfs.c
+++ b/kernel/irq/debugfs.c
@@ -9,14 +9,8 @@
static struct dentry *irq_dir;
-struct irq_bit_descr {
- unsigned int mask;
- char *name;
-};
-#define BIT_MASK_DESCR(m) { .mask = m, .name = #m }
-
-static void irq_debug_show_bits(struct seq_file *m, int ind, unsigned int state,
- const struct irq_bit_descr *sd, int size)
+void irq_debug_show_bits(struct seq_file *m, int ind, unsigned int state,
+ const struct irq_bit_descr *sd, int size)
{
int i;
diff --git a/kernel/irq/devres.c b/kernel/irq/devres.c
index f6e5515ee077..b3e98668f4dd 100644
--- a/kernel/irq/devres.c
+++ b/kernel/irq/devres.c
@@ -1,6 +1,7 @@
// SPDX-License-Identifier: GPL-2.0
#include <linux/module.h>
#include <linux/interrupt.h>
+#include <linux/irqdomain.h>
#include <linux/device.h>
#include <linux/gfp.h>
#include <linux/irq.h>
@@ -282,3 +283,43 @@ int devm_irq_setup_generic_chip(struct device *dev, struct irq_chip_generic *gc,
}
EXPORT_SYMBOL_GPL(devm_irq_setup_generic_chip);
#endif /* CONFIG_GENERIC_IRQ_CHIP */
+
+#ifdef CONFIG_IRQ_DOMAIN
+static void devm_irq_domain_remove(struct device *dev, void *res)
+{
+ struct irq_domain **domain = res;
+
+ irq_domain_remove(*domain);
+}
+
+/**
+ * devm_irq_domain_instantiate() - Instantiate a new irq domain for a
+ * managed device.
+ * @dev: Device to instantiate the domain for
+ * @info: Pointer to the information describing the domain to create
+ *
+ * Return: A pointer to the instantiated irq domain or an ERR_PTR value.
+ */
+struct irq_domain *devm_irq_domain_instantiate(struct device *dev,
+ const struct irq_domain_info *info)
+{
+ struct irq_domain *domain;
+ struct irq_domain **dr;
+
+ dr = devres_alloc(devm_irq_domain_remove, sizeof(*dr), GFP_KERNEL);
+ if (!dr)
+ return ERR_PTR(-ENOMEM);
+
+ domain = irq_domain_instantiate(info);
+ if (!IS_ERR(domain)) {
+ *dr = domain;
+ devres_add(dev, dr);
+ } else {
+ devres_free(dr);
+ }
+
+ return domain;
+}
+EXPORT_SYMBOL_GPL(devm_irq_domain_instantiate);
+#endif /* CONFIG_IRQ_DOMAIN */
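
Driver-side use of the new managed helper would look roughly like the probe fragment below. The irq_domain_info field names are assumptions carried over from the non-devres irq_domain_instantiate() interface and are not verified against the header.

#include <linux/irqdomain.h>
#include <linux/device.h>
#include <linux/err.h>

static int example_probe_domain(struct device *dev,
				const struct irq_domain_ops *ops)
{
	struct irq_domain_info info = {
		.fwnode    = dev_fwnode(dev),	/* assumed field name */
		.size      = 32,		/* assumed field name */
		.hwirq_max = 32,		/* assumed field name */
		.ops       = ops,
	};
	struct irq_domain *domain;

	/* Tear-down is handled by devres when the device goes away. */
	domain = devm_irq_domain_instantiate(dev, &info);
	return PTR_ERR_OR_ZERO(domain);
}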
diff --git a/kernel/irq/generic-chip.c b/kernel/irq/generic-chip.c
index d39a40bc542b..32ffcbb87fa1 100644
--- a/kernel/irq/generic-chip.c
+++ b/kernel/irq/generic-chip.c
@@ -276,21 +276,14 @@ irq_gc_init_mask_cache(struct irq_chip_generic *gc, enum irq_gc_flags flags)
}
/**
- * __irq_alloc_domain_generic_chips - Allocate generic chips for an irq domain
- * @d: irq domain for which to allocate chips
- * @irqs_per_chip: Number of interrupts each chip handles (max 32)
- * @num_ct: Number of irq_chip_type instances associated with this
- * @name: Name of the irq chip
- * @handler: Default flow handler associated with these chips
- * @clr: IRQ_* bits to clear in the mapping function
- * @set: IRQ_* bits to set in the mapping function
- * @gcflags: Generic chip specific setup flags
+ * irq_domain_alloc_generic_chips - Allocate generic chips for an irq domain
+ * @d: irq domain for which to allocate chips
+ * @info: Generic chip information
+ *
+ * Return: 0 on success, negative error code on failure
*/
-int __irq_alloc_domain_generic_chips(struct irq_domain *d, int irqs_per_chip,
- int num_ct, const char *name,
- irq_flow_handler_t handler,
- unsigned int clr, unsigned int set,
- enum irq_gc_flags gcflags)
+int irq_domain_alloc_generic_chips(struct irq_domain *d,
+ const struct irq_domain_chip_generic_info *info)
{
struct irq_domain_chip_generic *dgc;
struct irq_chip_generic *gc;
@@ -300,27 +293,29 @@ int __irq_alloc_domain_generic_chips(struct irq_domain *d, int irqs_per_chip,
size_t gc_sz;
size_t sz;
void *tmp;
+ int ret;
if (d->gc)
return -EBUSY;
- numchips = DIV_ROUND_UP(d->revmap_size, irqs_per_chip);
+ numchips = DIV_ROUND_UP(d->revmap_size, info->irqs_per_chip);
if (!numchips)
return -EINVAL;
/* Allocate a pointer, generic chip and chiptypes for each chip */
- gc_sz = struct_size(gc, chip_types, num_ct);
+ gc_sz = struct_size(gc, chip_types, info->num_ct);
dgc_sz = struct_size(dgc, gc, numchips);
sz = dgc_sz + numchips * gc_sz;
tmp = dgc = kzalloc(sz, GFP_KERNEL);
if (!dgc)
return -ENOMEM;
- dgc->irqs_per_chip = irqs_per_chip;
+ dgc->irqs_per_chip = info->irqs_per_chip;
dgc->num_chips = numchips;
- dgc->irq_flags_to_set = set;
- dgc->irq_flags_to_clear = clr;
- dgc->gc_flags = gcflags;
+ dgc->irq_flags_to_set = info->irq_flags_to_set;
+ dgc->irq_flags_to_clear = info->irq_flags_to_clear;
+ dgc->gc_flags = info->gc_flags;
+ dgc->exit = info->exit;
d->gc = dgc;
/* Calc pointer to the first generic chip */
@@ -328,15 +323,22 @@ int __irq_alloc_domain_generic_chips(struct irq_domain *d, int irqs_per_chip,
for (i = 0; i < numchips; i++) {
/* Store the pointer to the generic chip */
dgc->gc[i] = gc = tmp;
- irq_init_generic_chip(gc, name, num_ct, i * irqs_per_chip,
- NULL, handler);
+ irq_init_generic_chip(gc, info->name, info->num_ct,
+ i * dgc->irqs_per_chip, NULL,
+ info->handler);
gc->domain = d;
- if (gcflags & IRQ_GC_BE_IO) {
+ if (dgc->gc_flags & IRQ_GC_BE_IO) {
gc->reg_readl = &irq_readl_be;
gc->reg_writel = &irq_writel_be;
}
+ if (info->init) {
+ ret = info->init(gc);
+ if (ret)
+ goto err;
+ }
+
raw_spin_lock_irqsave(&gc_lock, flags);
list_add_tail(&gc->list, &gc_list);
raw_spin_unlock_irqrestore(&gc_lock, flags);
@@ -344,6 +346,69 @@ int __irq_alloc_domain_generic_chips(struct irq_domain *d, int irqs_per_chip,
tmp += gc_sz;
}
return 0;
+
+err:
+ while (i--) {
+ if (dgc->exit)
+ dgc->exit(dgc->gc[i]);
+ irq_remove_generic_chip(dgc->gc[i], ~0U, 0, 0);
+ }
+ d->gc = NULL;
+ kfree(dgc);
+ return ret;
+}
+EXPORT_SYMBOL_GPL(irq_domain_alloc_generic_chips);
+
+/**
+ * irq_domain_remove_generic_chips - Remove generic chips from an irq domain
+ * @d: irq domain for which generic chips are to be removed
+ */
+void irq_domain_remove_generic_chips(struct irq_domain *d)
+{
+ struct irq_domain_chip_generic *dgc = d->gc;
+ unsigned int i;
+
+ if (!dgc)
+ return;
+
+ for (i = 0; i < dgc->num_chips; i++) {
+ if (dgc->exit)
+ dgc->exit(dgc->gc[i]);
+ irq_remove_generic_chip(dgc->gc[i], ~0U, 0, 0);
+ }
+ d->gc = NULL;
+ kfree(dgc);
+}
+EXPORT_SYMBOL_GPL(irq_domain_remove_generic_chips);
+
+/**
+ * __irq_alloc_domain_generic_chips - Allocate generic chips for an irq domain
+ * @d: irq domain for which to allocate chips
+ * @irqs_per_chip: Number of interrupts each chip handles (max 32)
+ * @num_ct: Number of irq_chip_type instances associated with each chip
+ * @name: Name of the irq chip
+ * @handler: Default flow handler associated with these chips
+ * @clr: IRQ_* bits to clear in the mapping function
+ * @set: IRQ_* bits to set in the mapping function
+ * @gcflags: Generic chip specific setup flags
+ */
+int __irq_alloc_domain_generic_chips(struct irq_domain *d, int irqs_per_chip,
+ int num_ct, const char *name,
+ irq_flow_handler_t handler,
+ unsigned int clr, unsigned int set,
+ enum irq_gc_flags gcflags)
+{
+ struct irq_domain_chip_generic_info info = {
+ .irqs_per_chip = irqs_per_chip,
+ .num_ct = num_ct,
+ .name = name,
+ .handler = handler,
+ .irq_flags_to_clear = clr,
+ .irq_flags_to_set = set,
+ .gc_flags = gcflags,
+ };
+
+ return irq_domain_alloc_generic_chips(d, &info);
}
EXPORT_SYMBOL_GPL(__irq_alloc_domain_generic_chips);
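
As a sketch of the new info-based interface (names and flag choices below are illustrative, not from this patch), the former argument list maps onto a struct irq_domain_chip_generic_info initializer:

    static const struct irq_domain_chip_generic_info foo_dgc_info = {
            .name               = "foo-gc",           /* made-up chip name */
            .handler            = handle_level_irq,
            .irqs_per_chip      = 32,
            .num_ct             = 1,
            .irq_flags_to_clear = IRQ_NOREQUEST,
            .init               = foo_gc_init,        /* assumed per-chip hook */
    };

    ret = irq_domain_alloc_generic_chips(domain, &foo_dgc_info);
    if (ret)
            return ret;

The optional per-chip init()/exit() hooks are the functional addition over the old __irq_alloc_domain_generic_chips() call, which stays around as a thin wrapper.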
diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h
index ed28059e9849..fe0272cd84a5 100644
--- a/kernel/irq/internals.h
+++ b/kernel/irq/internals.h
@@ -501,6 +501,16 @@ static inline struct irq_data *irqd_get_parent_data(struct irq_data *irqd)
#ifdef CONFIG_GENERIC_IRQ_DEBUGFS
#include <linux/debugfs.h>
+struct irq_bit_descr {
+ unsigned int mask;
+ char *name;
+};
+
+#define BIT_MASK_DESCR(m) { .mask = m, .name = #m }
+
+void irq_debug_show_bits(struct seq_file *m, int ind, unsigned int state,
+ const struct irq_bit_descr *sd, int size);
+
void irq_add_debugfs_entry(unsigned int irq, struct irq_desc *desc);
static inline void irq_remove_debugfs_entry(struct irq_desc *desc)
{
diff --git a/kernel/irq/irq_sim.c b/kernel/irq/irq_sim.c
index 38d6ae651ac7..3d4036db15ac 100644
--- a/kernel/irq/irq_sim.c
+++ b/kernel/irq/irq_sim.c
@@ -17,6 +17,8 @@ struct irq_sim_work_ctx {
unsigned int irq_count;
unsigned long *pending;
struct irq_domain *domain;
+ struct irq_sim_ops ops;
+ void *user_data;
};
struct irq_sim_irq_ctx {
@@ -88,6 +90,31 @@ static int irq_sim_set_irqchip_state(struct irq_data *data,
return 0;
}
+static int irq_sim_request_resources(struct irq_data *data)
+{
+ struct irq_sim_irq_ctx *irq_ctx = irq_data_get_irq_chip_data(data);
+ struct irq_sim_work_ctx *work_ctx = irq_ctx->work_ctx;
+ irq_hw_number_t hwirq = irqd_to_hwirq(data);
+
+ if (work_ctx->ops.irq_sim_irq_requested)
+ return work_ctx->ops.irq_sim_irq_requested(work_ctx->domain,
+ hwirq,
+ work_ctx->user_data);
+
+ return 0;
+}
+
+static void irq_sim_release_resources(struct irq_data *data)
+{
+ struct irq_sim_irq_ctx *irq_ctx = irq_data_get_irq_chip_data(data);
+ struct irq_sim_work_ctx *work_ctx = irq_ctx->work_ctx;
+ irq_hw_number_t hwirq = irqd_to_hwirq(data);
+
+ if (work_ctx->ops.irq_sim_irq_released)
+ work_ctx->ops.irq_sim_irq_released(work_ctx->domain, hwirq,
+ work_ctx->user_data);
+}
+
static struct irq_chip irq_sim_irqchip = {
.name = "irq_sim",
.irq_mask = irq_sim_irqmask,
@@ -95,6 +122,8 @@ static struct irq_chip irq_sim_irqchip = {
.irq_set_type = irq_sim_set_type,
.irq_get_irqchip_state = irq_sim_get_irqchip_state,
.irq_set_irqchip_state = irq_sim_set_irqchip_state,
+ .irq_request_resources = irq_sim_request_resources,
+ .irq_release_resources = irq_sim_release_resources,
};
static void irq_sim_handle_irq(struct irq_work *work)
@@ -164,6 +193,15 @@ static const struct irq_domain_ops irq_sim_domain_ops = {
struct irq_domain *irq_domain_create_sim(struct fwnode_handle *fwnode,
unsigned int num_irqs)
{
+ return irq_domain_create_sim_full(fwnode, num_irqs, NULL, NULL);
+}
+EXPORT_SYMBOL_GPL(irq_domain_create_sim);
+
+struct irq_domain *irq_domain_create_sim_full(struct fwnode_handle *fwnode,
+ unsigned int num_irqs,
+ const struct irq_sim_ops *ops,
+ void *data)
+{
struct irq_sim_work_ctx *work_ctx __free(kfree) =
kmalloc(sizeof(*work_ctx), GFP_KERNEL);
@@ -183,10 +221,14 @@ struct irq_domain *irq_domain_create_sim(struct fwnode_handle *fwnode,
work_ctx->irq_count = num_irqs;
work_ctx->work = IRQ_WORK_INIT_HARD(irq_sim_handle_irq);
work_ctx->pending = no_free_ptr(pending);
+ work_ctx->user_data = data;
+
+ if (ops)
+ memcpy(&work_ctx->ops, ops, sizeof(*ops));
return no_free_ptr(work_ctx)->domain;
}
-EXPORT_SYMBOL_GPL(irq_domain_create_sim);
+EXPORT_SYMBOL_GPL(irq_domain_create_sim_full);
/**
* irq_domain_remove_sim - Deinitialize the interrupt simulator domain: free
@@ -228,10 +270,22 @@ struct irq_domain *devm_irq_domain_create_sim(struct device *dev,
struct fwnode_handle *fwnode,
unsigned int num_irqs)
{
+ return devm_irq_domain_create_sim_full(dev, fwnode, num_irqs,
+ NULL, NULL);
+}
+EXPORT_SYMBOL_GPL(devm_irq_domain_create_sim);
+
+struct irq_domain *
+devm_irq_domain_create_sim_full(struct device *dev,
+ struct fwnode_handle *fwnode,
+ unsigned int num_irqs,
+ const struct irq_sim_ops *ops,
+ void *data)
+{
struct irq_domain *domain;
int ret;
- domain = irq_domain_create_sim(fwnode, num_irqs);
+ domain = irq_domain_create_sim_full(fwnode, num_irqs, ops, data);
if (IS_ERR(domain))
return domain;
@@ -241,4 +295,4 @@ struct irq_domain *devm_irq_domain_create_sim(struct device *dev,
return domain;
}
-EXPORT_SYMBOL_GPL(devm_irq_domain_create_sim);
+EXPORT_SYMBOL_GPL(devm_irq_domain_create_sim_full);
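
A rough usage sketch of the extended simulator API; the callback body, device pointer, private data and line count are assumptions for illustration only:

    static int foo_sim_requested(struct irq_domain *domain,
                                 irq_hw_number_t hwirq, void *data)
    {
            /* E.g. note in driver state that this simulated line is in use. */
            return 0;
    }

    static const struct irq_sim_ops foo_sim_ops = {
            .irq_sim_irq_requested = foo_sim_requested,
            /* .irq_sim_irq_released may be left NULL if not needed. */
    };

    domain = devm_irq_domain_create_sim_full(dev, fwnode, 8,
                                             &foo_sim_ops, priv);
    if (IS_ERR(domain))
            return PTR_ERR(domain);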
diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c
index aadc8891cc16..cea8f6874b1f 100644
--- a/kernel/irq/irqdomain.c
+++ b/kernel/irq/irqdomain.c
@@ -111,6 +111,7 @@ EXPORT_SYMBOL_GPL(__irq_domain_alloc_fwnode);
/**
* irq_domain_free_fwnode - Free a non-OF-backed fwnode_handle
+ * @fwnode: fwnode_handle to free
*
* Free a fwnode_handle allocated with irq_domain_alloc_fwnode.
*/
@@ -127,27 +128,12 @@ void irq_domain_free_fwnode(struct fwnode_handle *fwnode)
}
EXPORT_SYMBOL_GPL(irq_domain_free_fwnode);
-static struct irq_domain *__irq_domain_create(struct fwnode_handle *fwnode,
- unsigned int size,
- irq_hw_number_t hwirq_max,
- int direct_max,
- const struct irq_domain_ops *ops,
- void *host_data)
+static int irq_domain_set_name(struct irq_domain *domain,
+ const struct fwnode_handle *fwnode,
+ enum irq_domain_bus_token bus_token)
{
- struct irqchip_fwid *fwid;
- struct irq_domain *domain;
-
static atomic_t unknown_domains;
-
- if (WARN_ON((size && direct_max) ||
- (!IS_ENABLED(CONFIG_IRQ_DOMAIN_NOMAP) && direct_max) ||
- (direct_max && (direct_max != hwirq_max))))
- return NULL;
-
- domain = kzalloc_node(struct_size(domain, revmap, size),
- GFP_KERNEL, of_node_to_nid(to_of_node(fwnode)));
- if (!domain)
- return NULL;
+ struct irqchip_fwid *fwid;
if (is_fwnode_irqchip(fwnode)) {
fwid = container_of(fwnode, struct irqchip_fwid, fwnode);
@@ -155,17 +141,23 @@ static struct irq_domain *__irq_domain_create(struct fwnode_handle *fwnode,
switch (fwid->type) {
case IRQCHIP_FWNODE_NAMED:
case IRQCHIP_FWNODE_NAMED_ID:
- domain->fwnode = fwnode;
- domain->name = kstrdup(fwid->name, GFP_KERNEL);
- if (!domain->name) {
- kfree(domain);
- return NULL;
- }
+ domain->name = bus_token ?
+ kasprintf(GFP_KERNEL, "%s-%d",
+ fwid->name, bus_token) :
+ kstrdup(fwid->name, GFP_KERNEL);
+ if (!domain->name)
+ return -ENOMEM;
domain->flags |= IRQ_DOMAIN_NAME_ALLOCATED;
break;
default:
- domain->fwnode = fwnode;
domain->name = fwid->name;
+ if (bus_token) {
+ domain->name = kasprintf(GFP_KERNEL, "%s-%d",
+ fwid->name, bus_token);
+ if (!domain->name)
+ return -ENOMEM;
+ domain->flags |= IRQ_DOMAIN_NAME_ALLOCATED;
+ }
break;
}
} else if (is_of_node(fwnode) || is_acpi_device_node(fwnode) ||
@@ -177,42 +169,68 @@ static struct irq_domain *__irq_domain_create(struct fwnode_handle *fwnode,
* unhappy about. Replace them with ':', which does
* the trick and is not as offensive as '\'...
*/
- name = kasprintf(GFP_KERNEL, "%pfw", fwnode);
- if (!name) {
- kfree(domain);
- return NULL;
- }
+ name = bus_token ?
+ kasprintf(GFP_KERNEL, "%pfw-%d", fwnode, bus_token) :
+ kasprintf(GFP_KERNEL, "%pfw", fwnode);
+ if (!name)
+ return -ENOMEM;
domain->name = strreplace(name, '/', ':');
- domain->fwnode = fwnode;
domain->flags |= IRQ_DOMAIN_NAME_ALLOCATED;
}
if (!domain->name) {
if (fwnode)
pr_err("Invalid fwnode type for irqdomain\n");
- domain->name = kasprintf(GFP_KERNEL, "unknown-%d",
- atomic_inc_return(&unknown_domains));
- if (!domain->name) {
- kfree(domain);
- return NULL;
- }
+ domain->name = bus_token ?
+ kasprintf(GFP_KERNEL, "unknown-%d-%d",
+ atomic_inc_return(&unknown_domains),
+ bus_token) :
+ kasprintf(GFP_KERNEL, "unknown-%d",
+ atomic_inc_return(&unknown_domains));
+ if (!domain->name)
+ return -ENOMEM;
domain->flags |= IRQ_DOMAIN_NAME_ALLOCATED;
}
- fwnode_handle_get(fwnode);
- fwnode_dev_initialized(fwnode, true);
+ return 0;
+}
+
+static struct irq_domain *__irq_domain_create(const struct irq_domain_info *info)
+{
+ struct irq_domain *domain;
+ int err;
+
+ if (WARN_ON((info->size && info->direct_max) ||
+ (!IS_ENABLED(CONFIG_IRQ_DOMAIN_NOMAP) && info->direct_max) ||
+ (info->direct_max && info->direct_max != info->hwirq_max)))
+ return ERR_PTR(-EINVAL);
+
+ domain = kzalloc_node(struct_size(domain, revmap, info->size),
+ GFP_KERNEL, of_node_to_nid(to_of_node(info->fwnode)));
+ if (!domain)
+ return ERR_PTR(-ENOMEM);
+
+ err = irq_domain_set_name(domain, info->fwnode, info->bus_token);
+ if (err) {
+ kfree(domain);
+ return ERR_PTR(err);
+ }
+
+ domain->fwnode = fwnode_handle_get(info->fwnode);
+ fwnode_dev_initialized(domain->fwnode, true);
/* Fill structure */
INIT_RADIX_TREE(&domain->revmap_tree, GFP_KERNEL);
- domain->ops = ops;
- domain->host_data = host_data;
- domain->hwirq_max = hwirq_max;
+ domain->ops = info->ops;
+ domain->host_data = info->host_data;
+ domain->bus_token = info->bus_token;
+ domain->hwirq_max = info->hwirq_max;
- if (direct_max)
+ if (info->direct_max)
domain->flags |= IRQ_DOMAIN_FLAG_NO_MAP;
- domain->revmap_size = size;
+ domain->revmap_size = info->size;
/*
* Hierarchical domains use the domain lock of the root domain
@@ -240,34 +258,64 @@ static void __irq_domain_publish(struct irq_domain *domain)
pr_debug("Added domain %s\n", domain->name);
}
+static void irq_domain_free(struct irq_domain *domain)
+{
+ fwnode_dev_initialized(domain->fwnode, false);
+ fwnode_handle_put(domain->fwnode);
+ if (domain->flags & IRQ_DOMAIN_NAME_ALLOCATED)
+ kfree(domain->name);
+ kfree(domain);
+}
+
/**
- * __irq_domain_add() - Allocate a new irq_domain data structure
- * @fwnode: firmware node for the interrupt controller
- * @size: Size of linear map; 0 for radix mapping only
- * @hwirq_max: Maximum number of interrupts supported by controller
- * @direct_max: Maximum value of direct maps; Use ~0 for no limit; 0 for no
- * direct mapping
- * @ops: domain callbacks
- * @host_data: Controller private data pointer
+ * irq_domain_instantiate() - Instantiate a new irq domain data structure
+ * @info: Domain information describing the domain to instantiate
*
- * Allocates and initializes an irq_domain structure.
- * Returns pointer to IRQ domain, or NULL on failure.
+ * Return: A pointer to the instantiated irq domain or an ERR_PTR value.
*/
-struct irq_domain *__irq_domain_add(struct fwnode_handle *fwnode, unsigned int size,
- irq_hw_number_t hwirq_max, int direct_max,
- const struct irq_domain_ops *ops,
- void *host_data)
+struct irq_domain *irq_domain_instantiate(const struct irq_domain_info *info)
{
struct irq_domain *domain;
+ int err;
+
+ domain = __irq_domain_create(info);
+ if (IS_ERR(domain))
+ return domain;
- domain = __irq_domain_create(fwnode, size, hwirq_max, direct_max,
- ops, host_data);
- if (domain)
- __irq_domain_publish(domain);
+ domain->flags |= info->domain_flags;
+ domain->exit = info->exit;
+
+#ifdef CONFIG_IRQ_DOMAIN_HIERARCHY
+ if (info->parent) {
+ domain->root = info->parent->root;
+ domain->parent = info->parent;
+ }
+#endif
+
+ if (info->dgc_info) {
+ err = irq_domain_alloc_generic_chips(domain, info->dgc_info);
+ if (err)
+ goto err_domain_free;
+ }
+
+ if (info->init) {
+ err = info->init(domain);
+ if (err)
+ goto err_domain_gc_remove;
+ }
+
+ __irq_domain_publish(domain);
return domain;
+
+err_domain_gc_remove:
+ if (info->dgc_info)
+ irq_domain_remove_generic_chips(domain);
+err_domain_free:
+ irq_domain_free(domain);
+ return ERR_PTR(err);
}
-EXPORT_SYMBOL_GPL(__irq_domain_add);
+EXPORT_SYMBOL_GPL(irq_domain_instantiate);
/**
* irq_domain_remove() - Remove an irq domain.
@@ -279,6 +327,9 @@ EXPORT_SYMBOL_GPL(__irq_domain_add);
*/
void irq_domain_remove(struct irq_domain *domain)
{
+ if (domain->exit)
+ domain->exit(domain);
+
mutex_lock(&irq_domain_mutex);
debugfs_remove_domain_dir(domain);
@@ -294,13 +345,11 @@ void irq_domain_remove(struct irq_domain *domain)
mutex_unlock(&irq_domain_mutex);
- pr_debug("Removed domain %s\n", domain->name);
+ if (domain->flags & IRQ_DOMAIN_FLAG_DESTROY_GC)
+ irq_domain_remove_generic_chips(domain);
- fwnode_dev_initialized(domain->fwnode, false);
- fwnode_handle_put(domain->fwnode);
- if (domain->flags & IRQ_DOMAIN_NAME_ALLOCATED)
- kfree(domain->name);
- kfree(domain);
+ pr_debug("Removed domain %s\n", domain->name);
+ irq_domain_free(domain);
}
EXPORT_SYMBOL_GPL(irq_domain_remove);
@@ -360,10 +409,17 @@ struct irq_domain *irq_domain_create_simple(struct fwnode_handle *fwnode,
const struct irq_domain_ops *ops,
void *host_data)
{
+ struct irq_domain_info info = {
+ .fwnode = fwnode,
+ .size = size,
+ .hwirq_max = size,
+ .ops = ops,
+ .host_data = host_data,
+ };
struct irq_domain *domain;
- domain = __irq_domain_add(fwnode, size, size, 0, ops, host_data);
- if (!domain)
+ domain = irq_domain_instantiate(&info);
+ if (IS_ERR(domain))
return NULL;
if (first_irq > 0) {
@@ -416,11 +472,20 @@ struct irq_domain *irq_domain_create_legacy(struct fwnode_handle *fwnode,
const struct irq_domain_ops *ops,
void *host_data)
{
+ struct irq_domain_info info = {
+ .fwnode = fwnode,
+ .size = first_hwirq + size,
+ .hwirq_max = first_hwirq + size,
+ .ops = ops,
+ .host_data = host_data,
+ };
struct irq_domain *domain;
- domain = __irq_domain_add(fwnode, first_hwirq + size, first_hwirq + size, 0, ops, host_data);
- if (domain)
- irq_domain_associate_many(domain, first_irq, first_hwirq, size);
+ domain = irq_domain_instantiate(&info);
+ if (IS_ERR(domain))
+ return NULL;
+
+ irq_domain_associate_many(domain, first_irq, first_hwirq, size);
return domain;
}
@@ -438,7 +503,8 @@ struct irq_domain *irq_find_matching_fwspec(struct irq_fwspec *fwspec,
struct fwnode_handle *fwnode = fwspec->fwnode;
int rc;
- /* We might want to match the legacy controller last since
+ /*
+ * We might want to match the legacy controller last since
* it might potentially be set to match all interrupts in
* the absence of a device node. This isn't a problem so far
* yet though...
@@ -982,6 +1048,12 @@ EXPORT_SYMBOL_GPL(__irq_resolve_mapping);
/**
* irq_domain_xlate_onecell() - Generic xlate for direct one cell bindings
+ * @d: Interrupt domain involved in the translation
+ * @ctrlr: The device tree node for the device whose interrupt is translated
+ * @intspec: The interrupt specifier data from the device tree
+ * @intsize: The number of entries in @intspec
+ * @out_hwirq: Pointer to storage for the hardware interrupt number
+ * @out_type: Pointer to storage for the interrupt type
*
* Device Tree IRQ specifier translation function which works with one cell
* bindings where the cell value maps directly to the hwirq number.
@@ -1000,6 +1072,12 @@ EXPORT_SYMBOL_GPL(irq_domain_xlate_onecell);
/**
* irq_domain_xlate_twocell() - Generic xlate for direct two cell bindings
+ * @d: Interrupt domain involved in the translation
+ * @ctrlr: The device tree node for the device whose interrupt is translated
+ * @intspec: The interrupt specifier data from the device tree
+ * @intsize: The number of entries in @intspec
+ * @out_hwirq: Pointer to storage for the hardware interrupt number
+ * @out_type: Pointer to storage for the interrupt type
*
* Device Tree IRQ specifier translation function which works with two cell
* bindings where the cell values map directly to the hwirq number
@@ -1018,6 +1096,12 @@ EXPORT_SYMBOL_GPL(irq_domain_xlate_twocell);
/**
* irq_domain_xlate_onetwocell() - Generic xlate for one or two cell bindings
+ * @d: Interrupt domain involved in the translation
+ * @ctrlr: The device tree node for the device whose interrupt is translated
+ * @intspec: The interrupt specifier data from the device tree
+ * @intsize: The number of entries in @intspec
+ * @out_hwirq: Pointer to storage for the hardware interrupt number
+ * @out_type: Pointer to storage for the interrupt type
*
* Device Tree IRQ specifier translation function which works with either one
* or two cell bindings where the cell values map directly to the hwirq number
@@ -1051,6 +1135,10 @@ EXPORT_SYMBOL_GPL(irq_domain_simple_ops);
/**
* irq_domain_translate_onecell() - Generic translate for direct one cell
* bindings
+ * @d: Interrupt domain involved in the translation
+ * @fwspec: The firmware interrupt specifier to translate
+ * @out_hwirq: Pointer to storage for the hardware interrupt number
+ * @out_type: Pointer to storage for the interrupt type
*/
int irq_domain_translate_onecell(struct irq_domain *d,
struct irq_fwspec *fwspec,
@@ -1068,6 +1156,10 @@ EXPORT_SYMBOL_GPL(irq_domain_translate_onecell);
/**
* irq_domain_translate_twocell() - Generic translate for direct two cell
* bindings
+ * @d: Interrupt domain involved in the translation
+ * @fwspec: The firmware interrupt specifier to translate
+ * @out_hwirq: Pointer to storage for the hardware interrupt number
+ * @out_type: Pointer to storage for the interrupt type
*
* Device Tree IRQ specifier translation function which works with two cell
* bindings where the cell values map directly to the hwirq number
@@ -1144,23 +1236,22 @@ struct irq_domain *irq_domain_create_hierarchy(struct irq_domain *parent,
const struct irq_domain_ops *ops,
void *host_data)
{
- struct irq_domain *domain;
-
- if (size)
- domain = __irq_domain_create(fwnode, size, size, 0, ops, host_data);
- else
- domain = __irq_domain_create(fwnode, 0, ~0, 0, ops, host_data);
-
- if (domain) {
- if (parent)
- domain->root = parent->root;
- domain->parent = parent;
- domain->flags |= flags;
+ struct irq_domain_info info = {
+ .fwnode = fwnode,
+ .size = size,
+ .hwirq_max = size,
+ .ops = ops,
+ .host_data = host_data,
+ .domain_flags = flags,
+ .parent = parent,
+ };
+ struct irq_domain *d;
- __irq_domain_publish(domain);
- }
+ if (!info.size)
+ info.hwirq_max = ~0U;
- return domain;
+ d = irq_domain_instantiate(&info);
+ return IS_ERR(d) ? NULL : d;
}
EXPORT_SYMBOL_GPL(irq_domain_create_hierarchy);
@@ -1932,13 +2023,26 @@ static void irq_domain_free_one_irq(struct irq_domain *domain, unsigned int virq
static struct dentry *domain_dir;
-static void
-irq_domain_debug_show_one(struct seq_file *m, struct irq_domain *d, int ind)
+static const struct irq_bit_descr irqdomain_flags[] = {
+ BIT_MASK_DESCR(IRQ_DOMAIN_FLAG_HIERARCHY),
+ BIT_MASK_DESCR(IRQ_DOMAIN_NAME_ALLOCATED),
+ BIT_MASK_DESCR(IRQ_DOMAIN_FLAG_IPI_PER_CPU),
+ BIT_MASK_DESCR(IRQ_DOMAIN_FLAG_IPI_SINGLE),
+ BIT_MASK_DESCR(IRQ_DOMAIN_FLAG_MSI),
+ BIT_MASK_DESCR(IRQ_DOMAIN_FLAG_ISOLATED_MSI),
+ BIT_MASK_DESCR(IRQ_DOMAIN_FLAG_NO_MAP),
+ BIT_MASK_DESCR(IRQ_DOMAIN_FLAG_MSI_PARENT),
+ BIT_MASK_DESCR(IRQ_DOMAIN_FLAG_MSI_DEVICE),
+ BIT_MASK_DESCR(IRQ_DOMAIN_FLAG_NONCORE),
+};
+
+static void irq_domain_debug_show_one(struct seq_file *m, struct irq_domain *d, int ind)
{
seq_printf(m, "%*sname: %s\n", ind, "", d->name);
seq_printf(m, "%*ssize: %u\n", ind + 1, "", d->revmap_size);
seq_printf(m, "%*smapped: %u\n", ind + 1, "", d->mapcount);
seq_printf(m, "%*sflags: 0x%08x\n", ind +1 , "", d->flags);
+ irq_debug_show_bits(m, ind, d->flags, irqdomain_flags, ARRAY_SIZE(irqdomain_flags));
if (d->ops && d->ops->debug_show)
d->ops->debug_show(m, d, NULL, ind + 1);
#ifdef CONFIG_IRQ_DOMAIN_HIERARCHY
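
Putting the pieces together, a hedged sketch of what a caller of the new irq_domain_instantiate() interface might look like; the ops, init hook, dgc_info and sizes are illustrative assumptions:

    struct irq_domain_info info = {
            .fwnode    = fwnode,
            .size      = 64,
            .hwirq_max = 64,
            .ops       = &foo_irq_domain_ops,   /* assumed to exist */
            .host_data = priv,
            .dgc_info  = &foo_dgc_info,         /* optional generic-chip setup */
            .init      = foo_domain_init,       /* optional, may fail */
    };
    struct irq_domain *domain;

    domain = irq_domain_instantiate(&info);
    if (IS_ERR(domain))
            return PTR_ERR(domain);

Unlike __irq_domain_add(), the new entry point reports failures as ERR_PTR() values and unwinds any generic chips allocated through dgc_info before returning.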
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index 71b0fc2d0aea..dd53298ef1a5 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -1337,7 +1337,7 @@ static int irq_thread(void *data)
* synchronize_hardirq(). So neither IRQTF_RUNTHREAD nor the
* oneshot mask bit can be set.
*/
- task_work_cancel(current, irq_thread_dtor);
+ task_work_cancel_func(current, irq_thread_dtor);
return 0;
}
diff --git a/kernel/irq/msi.c b/kernel/irq/msi.c
index 2024f89baea4..5fa0547ece0c 100644
--- a/kernel/irq/msi.c
+++ b/kernel/irq/msi.c
@@ -8,18 +8,34 @@
* This file contains common code to support Message Signaled Interrupts for
* PCI compatible and non PCI compatible devices.
*/
-#include <linux/types.h>
#include <linux/device.h>
#include <linux/irq.h>
#include <linux/irqdomain.h>
#include <linux/msi.h>
+#include <linux/mutex.h>
+#include <linux/pci.h>
#include <linux/slab.h>
#include <linux/sysfs.h>
-#include <linux/pci.h>
+#include <linux/types.h>
+#include <linux/xarray.h>
#include "internals.h"
/**
+ * struct msi_device_data - MSI per device data
+ * @properties: MSI properties which are interesting to drivers
+ * @mutex: Mutex protecting the MSI descriptor store
+ * @__domains: Internal data for per device MSI domains
+ * @__iter_idx: Index to search the next entry for iterators
+ */
+struct msi_device_data {
+ unsigned long properties;
+ struct mutex mutex;
+ struct msi_dev_domain __domains[MSI_MAX_DEVICE_IRQDOMAINS];
+ unsigned long __iter_idx;
+};
+
+/**
* struct msi_ctrl - MSI internal management control structure
* @domid: ID of the domain on which management operations should be done
* @first: First (hardware) slot index to operate on
@@ -1088,8 +1104,8 @@ bool msi_match_device_irq_domain(struct device *dev, unsigned int domid,
return ret;
}
-int msi_domain_prepare_irqs(struct irq_domain *domain, struct device *dev,
- int nvec, msi_alloc_info_t *arg)
+static int msi_domain_prepare_irqs(struct irq_domain *domain, struct device *dev,
+ int nvec, msi_alloc_info_t *arg)
{
struct msi_domain_info *info = domain->host_data;
struct msi_domain_ops *ops = info->ops;
@@ -1097,77 +1113,6 @@ int msi_domain_prepare_irqs(struct irq_domain *domain, struct device *dev,
return ops->msi_prepare(domain, dev, nvec, arg);
}
-int msi_domain_populate_irqs(struct irq_domain *domain, struct device *dev,
- int virq_base, int nvec, msi_alloc_info_t *arg)
-{
- struct msi_domain_info *info = domain->host_data;
- struct msi_domain_ops *ops = info->ops;
- struct msi_ctrl ctrl = {
- .domid = MSI_DEFAULT_DOMAIN,
- .first = virq_base,
- .last = virq_base + nvec - 1,
- };
- struct msi_desc *desc;
- struct xarray *xa;
- int ret, virq;
-
- msi_lock_descs(dev);
-
- if (!msi_ctrl_valid(dev, &ctrl)) {
- ret = -EINVAL;
- goto unlock;
- }
-
- ret = msi_domain_add_simple_msi_descs(dev, &ctrl);
- if (ret)
- goto unlock;
-
- xa = &dev->msi.data->__domains[ctrl.domid].store;
-
- for (virq = virq_base; virq < virq_base + nvec; virq++) {
- desc = xa_load(xa, virq);
- desc->irq = virq;
-
- ops->set_desc(arg, desc);
- ret = irq_domain_alloc_irqs_hierarchy(domain, virq, 1, arg);
- if (ret)
- goto fail;
-
- irq_set_msi_desc(virq, desc);
- }
- msi_unlock_descs(dev);
- return 0;
-
-fail:
- for (--virq; virq >= virq_base; virq--) {
- msi_domain_depopulate_descs(dev, virq, 1);
- irq_domain_free_irqs_common(domain, virq, 1);
- }
- msi_domain_free_descs(dev, &ctrl);
-unlock:
- msi_unlock_descs(dev);
- return ret;
-}
-
-void msi_domain_depopulate_descs(struct device *dev, int virq_base, int nvec)
-{
- struct msi_ctrl ctrl = {
- .domid = MSI_DEFAULT_DOMAIN,
- .first = virq_base,
- .last = virq_base + nvec - 1,
- };
- struct msi_desc *desc;
- struct xarray *xa;
- unsigned long idx;
-
- if (!msi_ctrl_valid(dev, &ctrl))
- return;
-
- xa = &dev->msi.data->__domains[ctrl.domid].store;
- xa_for_each_range(xa, idx, desc, ctrl.first, ctrl.last)
- desc->irq = 0;
-}
-
/*
* Carefully check whether the device can use reservation mode. If
* reservation mode is enabled then the early activation will assign a
diff --git a/kernel/irq/proc.c b/kernel/irq/proc.c
index 5c320c3f10a7..8cccdf40725a 100644
--- a/kernel/irq/proc.c
+++ b/kernel/irq/proc.c
@@ -461,10 +461,10 @@ int show_interrupts(struct seq_file *p, void *v)
{
static int prec;
- unsigned long flags, any_count = 0;
int i = *(loff_t *) v, j;
struct irqaction *action;
struct irq_desc *desc;
+ unsigned long flags;
if (i > ACTUAL_NR_IRQS)
return 0;
@@ -488,10 +488,7 @@ int show_interrupts(struct seq_file *p, void *v)
if (!desc || irq_settings_is_hidden(desc))
goto outsparse;
- if (desc->kstat_irqs)
- any_count = kstat_irqs_desc(desc, cpu_online_mask);
-
- if ((!desc->action || irq_desc_is_chained(desc)) && !any_count)
+ if (!desc->action || irq_desc_is_chained(desc) || !desc->kstat_irqs)
goto outsparse;
seq_printf(p, "%*d: ", prec, i);
diff --git a/kernel/jump_label.c b/kernel/jump_label.c
index 3218fa5688b9..4ad5ed8adf96 100644
--- a/kernel/jump_label.c
+++ b/kernel/jump_label.c
@@ -131,13 +131,16 @@ bool static_key_fast_inc_not_disabled(struct static_key *key)
STATIC_KEY_CHECK_USE(key);
/*
* Negative key->enabled has a special meaning: it sends
- * static_key_slow_inc() down the slow path, and it is non-zero
- * so it counts as "enabled" in jump_label_update(). Note that
- * atomic_inc_unless_negative() checks >= 0, so roll our own.
+ * static_key_slow_inc/dec() down the slow path, and it is non-zero
+ * so it counts as "enabled" in jump_label_update().
+ *
+ * The INT_MAX overflow condition is either used by the networking
+ * code to reset or detected in the slow path of
+ * static_key_slow_inc_cpuslocked().
*/
v = atomic_read(&key->enabled);
do {
- if (v <= 0 || (v + 1) < 0)
+ if (v <= 0 || v == INT_MAX)
return false;
} while (!likely(atomic_try_cmpxchg(&key->enabled, &v, v + 1)));
@@ -150,7 +153,7 @@ bool static_key_slow_inc_cpuslocked(struct static_key *key)
lockdep_assert_cpus_held();
/*
- * Careful if we get concurrent static_key_slow_inc() calls;
+ * Careful if we get concurrent static_key_slow_inc/dec() calls;
* later calls must wait for the first one to _finish_ the
* jump_label_update() process. At the same time, however,
* the jump_label_update() call below wants to see
@@ -159,22 +162,24 @@ bool static_key_slow_inc_cpuslocked(struct static_key *key)
if (static_key_fast_inc_not_disabled(key))
return true;
- jump_label_lock();
- if (atomic_read(&key->enabled) == 0) {
- atomic_set(&key->enabled, -1);
+ guard(mutex)(&jump_label_mutex);
+ /* Try to mark it as 'enabling in progress'. */
+ if (!atomic_cmpxchg(&key->enabled, 0, -1)) {
jump_label_update(key);
/*
- * Ensure that if the above cmpxchg loop observes our positive
- * value, it must also observe all the text changes.
+ * Ensure that when static_key_fast_inc_not_disabled() or
+ * static_key_slow_try_dec() observe the positive value,
+ * they must also observe all the text changes.
*/
atomic_set_release(&key->enabled, 1);
} else {
- if (WARN_ON_ONCE(!static_key_fast_inc_not_disabled(key))) {
- jump_label_unlock();
+ /*
+ * While holding the mutex this should never observe
+ * anything other than a value >= 1, so the increment must succeed.
+ */
+ if (WARN_ON_ONCE(!static_key_fast_inc_not_disabled(key)))
return false;
- }
}
- jump_label_unlock();
return true;
}
@@ -247,20 +252,32 @@ EXPORT_SYMBOL_GPL(static_key_disable);
static bool static_key_slow_try_dec(struct static_key *key)
{
- int val;
-
- val = atomic_fetch_add_unless(&key->enabled, -1, 1);
- if (val == 1)
- return false;
+ int v;
/*
- * The negative count check is valid even when a negative
- * key->enabled is in use by static_key_slow_inc(); a
- * __static_key_slow_dec() before the first static_key_slow_inc()
- * returns is unbalanced, because all other static_key_slow_inc()
- * instances block while the update is in progress.
+ * Go into the slow path if key::enabled is less than or equal to
+ * one. One is valid to shut down the key, anything less than one
+ * is an imbalance, which is handled at the call site.
+ *
+ * That includes the special case of '-1' which is set in
+ * static_key_slow_inc_cpuslocked(), but that's harmless as it is
+ * fully serialized in the slow path below. By the time this task
+ * acquires the jump label lock the value is back to one and the
+ * retry under the lock must succeed.
*/
- WARN(val < 0, "jump label: negative count!\n");
+ v = atomic_read(&key->enabled);
+ do {
+ /*
+ * Warn about the '-1' case though, since that means a
+ * decrement is concurrent with a first (0->1) increment. IOW
+ * people are trying to disable something that wasn't yet fully
+ * enabled. This suggests an ordering problem on the user side.
+ */
+ WARN_ON_ONCE(v < 0);
+ if (v <= 1)
+ return false;
+ } while (!likely(atomic_try_cmpxchg(&key->enabled, &v, v - 1)));
+
return true;
}
@@ -271,10 +288,11 @@ static void __static_key_slow_dec_cpuslocked(struct static_key *key)
if (static_key_slow_try_dec(key))
return;
- jump_label_lock();
- if (atomic_dec_and_test(&key->enabled))
+ guard(mutex)(&jump_label_mutex);
+ if (atomic_cmpxchg(&key->enabled, 1, 0))
jump_label_update(key);
- jump_label_unlock();
+ else
+ WARN_ON_ONCE(!static_key_slow_try_dec(key));
}
static void __static_key_slow_dec(struct static_key *key)
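
The change replaces the open-ended "(v + 1) < 0" overflow test with an explicit INT_MAX saturation check inside an atomic_try_cmpxchg() loop. A stand-alone sketch of that pattern (the helper name is hypothetical):

    static bool foo_inc_not_saturated(atomic_t *cnt)
    {
            int v = atomic_read(cnt);

            do {
                    /* Refuse while disabled (v <= 0) or already saturated. */
                    if (v <= 0 || v == INT_MAX)
                            return false;
            } while (!atomic_try_cmpxchg(cnt, &v, v + 1));

            return true;
    }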
diff --git a/kernel/kallsyms.c b/kernel/kallsyms.c
index 98b9622d372e..fb2c77368d18 100644
--- a/kernel/kallsyms.c
+++ b/kernel/kallsyms.c
@@ -148,9 +148,6 @@ static unsigned int get_symbol_offset(unsigned long pos)
unsigned long kallsyms_sym_address(int idx)
{
- if (!IS_ENABLED(CONFIG_KALLSYMS_BASE_RELATIVE))
- return kallsyms_addresses[idx];
-
/* values are unsigned offsets if --absolute-percpu is not in effect */
if (!IS_ENABLED(CONFIG_KALLSYMS_ABSOLUTE_PERCPU))
return kallsyms_relative_base + (u32)kallsyms_offsets[idx];
@@ -325,7 +322,7 @@ static unsigned long get_symbol_pos(unsigned long addr,
unsigned long symbol_start = 0, symbol_end = 0;
unsigned long i, low, high, mid;
- /* Do a binary search on the sorted kallsyms_addresses array. */
+ /* Do a binary search on the sorted kallsyms_offsets array. */
low = 0;
high = kallsyms_num_syms;
diff --git a/kernel/kallsyms_internal.h b/kernel/kallsyms_internal.h
index 85480274fc8f..9633782f8250 100644
--- a/kernel/kallsyms_internal.h
+++ b/kernel/kallsyms_internal.h
@@ -4,12 +4,6 @@
#include <linux/types.h>
-/*
- * These will be re-linked against their real values during the second link
- * stage. Preliminary values must be provided in the linker script using the
- * PROVIDE() directive so that the first link stage can complete successfully.
- */
-extern const unsigned long kallsyms_addresses[];
extern const int kallsyms_offsets[];
extern const u8 kallsyms_names[];
diff --git a/kernel/kcsan/kcsan_test.c b/kernel/kcsan/kcsan_test.c
index 0c17b4c83e1c..117d9d4d3c3b 100644
--- a/kernel/kcsan/kcsan_test.c
+++ b/kernel/kcsan/kcsan_test.c
@@ -1620,5 +1620,6 @@ static struct kunit_suite kcsan_test_suite = {
kunit_test_suites(&kcsan_test_suite);
+MODULE_DESCRIPTION("KCSAN test suite");
MODULE_LICENSE("GPL v2");
MODULE_AUTHOR("Marco Elver <elver@google.com>");
diff --git a/kernel/kexec_core.c b/kernel/kexec_core.c
index 9112d69d68b0..c0caa14880c3 100644
--- a/kernel/kexec_core.c
+++ b/kernel/kexec_core.c
@@ -888,7 +888,7 @@ struct kimage *kexec_crash_image;
static int kexec_load_disabled;
#ifdef CONFIG_SYSCTL
-static int kexec_limit_handler(struct ctl_table *table, int write,
+static int kexec_limit_handler(const struct ctl_table *table, int write,
void *buffer, size_t *lenp, loff_t *ppos)
{
struct kexec_load_limit *limit = table->data;
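
Several files in this series switch their proc handlers to take a const struct ctl_table *. A minimal sketch of the converted signature (the handler name and the delegation to proc_dointvec() are illustrative, and assume the generic helpers were constified in the same series):

    static int foo_sysctl_handler(const struct ctl_table *table, int write,
                                  void *buffer, size_t *lenp, loff_t *ppos)
    {
            /* The handler may read table->data but no longer modifies the table. */
            return proc_dointvec(table, write, buffer, lenp, ppos);
    }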
diff --git a/kernel/kprobes.c b/kernel/kprobes.c
index 6a76a8100073..e85de37d9e1e 100644
--- a/kernel/kprobes.c
+++ b/kernel/kprobes.c
@@ -939,7 +939,7 @@ static void unoptimize_all_kprobes(void)
static DEFINE_MUTEX(kprobe_sysctl_mutex);
static int sysctl_kprobes_optimization;
-static int proc_kprobes_optimization_handler(struct ctl_table *table,
+static int proc_kprobes_optimization_handler(const struct ctl_table *table,
int write, void *buffer,
size_t *length, loff_t *ppos)
{
diff --git a/kernel/latencytop.c b/kernel/latencytop.c
index 84c53285f499..7a75eab9c179 100644
--- a/kernel/latencytop.c
+++ b/kernel/latencytop.c
@@ -65,7 +65,7 @@ static struct latency_record latency_record[MAXLR];
int latencytop_enabled;
#ifdef CONFIG_SYSCTL
-static int sysctl_latencytop(struct ctl_table *table, int write, void *buffer,
+static int sysctl_latencytop(const struct ctl_table *table, int write, void *buffer,
size_t *lenp, loff_t *ppos)
{
int err;
diff --git a/kernel/livepatch/core.c b/kernel/livepatch/core.c
index 52426665eecc..3c21c31796db 100644
--- a/kernel/livepatch/core.c
+++ b/kernel/livepatch/core.c
@@ -346,6 +346,7 @@ int klp_apply_section_relocs(struct module *pmod, Elf_Shdr *sechdrs,
* /sys/kernel/livepatch/<patch>/enabled
* /sys/kernel/livepatch/<patch>/transition
* /sys/kernel/livepatch/<patch>/force
+ * /sys/kernel/livepatch/<patch>/replace
* /sys/kernel/livepatch/<patch>/<object>
* /sys/kernel/livepatch/<patch>/<object>/patched
* /sys/kernel/livepatch/<patch>/<object>/<function,sympos>
@@ -401,7 +402,7 @@ static ssize_t enabled_show(struct kobject *kobj,
struct klp_patch *patch;
patch = container_of(kobj, struct klp_patch, kobj);
- return snprintf(buf, PAGE_SIZE-1, "%d\n", patch->enabled);
+ return sysfs_emit(buf, "%d\n", patch->enabled);
}
static ssize_t transition_show(struct kobject *kobj,
@@ -410,8 +411,7 @@ static ssize_t transition_show(struct kobject *kobj,
struct klp_patch *patch;
patch = container_of(kobj, struct klp_patch, kobj);
- return snprintf(buf, PAGE_SIZE-1, "%d\n",
- patch == klp_transition_patch);
+ return sysfs_emit(buf, "%d\n", patch == klp_transition_patch);
}
static ssize_t force_store(struct kobject *kobj, struct kobj_attribute *attr,
@@ -443,13 +443,24 @@ static ssize_t force_store(struct kobject *kobj, struct kobj_attribute *attr,
return count;
}
+static ssize_t replace_show(struct kobject *kobj,
+ struct kobj_attribute *attr, char *buf)
+{
+ struct klp_patch *patch;
+
+ patch = container_of(kobj, struct klp_patch, kobj);
+ return sysfs_emit(buf, "%d\n", patch->replace);
+}
+
static struct kobj_attribute enabled_kobj_attr = __ATTR_RW(enabled);
static struct kobj_attribute transition_kobj_attr = __ATTR_RO(transition);
static struct kobj_attribute force_kobj_attr = __ATTR_WO(force);
+static struct kobj_attribute replace_kobj_attr = __ATTR_RO(replace);
static struct attribute *klp_patch_attrs[] = {
&enabled_kobj_attr.attr,
&transition_kobj_attr.attr,
&force_kobj_attr.attr,
+ &replace_kobj_attr.attr,
NULL
};
ATTRIBUTE_GROUPS(klp_patch);
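
The snprintf(buf, PAGE_SIZE-1, ...) pattern is replaced by sysfs_emit(), which enforces the PAGE_SIZE bound itself. A generic sketch of the idiom (attribute and value names are made up):

    static ssize_t foo_show(struct kobject *kobj, struct kobj_attribute *attr,
                            char *buf)
    {
            /* sysfs_emit() caps output at PAGE_SIZE and returns the length. */
            return sysfs_emit(buf, "%d\n", foo_value);
    }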
diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c
index 151bd3de5936..58c88220a478 100644
--- a/kernel/locking/lockdep.c
+++ b/kernel/locking/lockdep.c
@@ -97,7 +97,6 @@ static struct ctl_table kern_lockdep_table[] = {
.proc_handler = proc_dointvec,
},
#endif /* CONFIG_LOCK_STAT */
- { }
};
static __init int kernel_lockdep_sysctls_init(void)
@@ -4918,6 +4917,9 @@ EXPORT_SYMBOL_GPL(lockdep_init_map_type);
struct lock_class_key __lockdep_no_validate__;
EXPORT_SYMBOL_GPL(__lockdep_no_validate__);
+struct lock_class_key __lockdep_no_track__;
+EXPORT_SYMBOL_GPL(__lockdep_no_track__);
+
#ifdef CONFIG_PROVE_LOCKING
void lockdep_set_lock_cmp_fn(struct lockdep_map *lock, lock_cmp_fn cmp_fn,
lock_print_fn print_fn)
@@ -5002,6 +5004,9 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass,
if (unlikely(!debug_locks))
return 0;
+ if (unlikely(lock->key == &__lockdep_no_track__))
+ return 0;
+
if (!prove_locking || lock->key == &__lockdep_no_validate__)
check = 0;
@@ -5764,7 +5769,8 @@ void lock_release(struct lockdep_map *lock, unsigned long ip)
trace_lock_release(lock, ip);
- if (unlikely(!lockdep_enabled()))
+ if (unlikely(!lockdep_enabled() ||
+ lock->key == &__lockdep_no_track__))
return;
raw_local_irq_save(flags);
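
How a lock opts out of tracking is not shown in this hunk; assuming the exported key can be attached with the existing lockdep_set_class() helper, usage might look like the sketch below (purely illustrative):

    spinlock_t foo_lock;

    spin_lock_init(&foo_lock);
    /* Acquire/release events on this lock are then ignored by lockdep. */
    lockdep_set_class(&foo_lock, &__lockdep_no_track__);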
diff --git a/kernel/locking/locktorture.c b/kernel/locking/locktorture.c
index 415d81e6ce70..de95ec07e477 100644
--- a/kernel/locking/locktorture.c
+++ b/kernel/locking/locktorture.c
@@ -30,6 +30,7 @@
#include <linux/torture.h>
#include <linux/reboot.h>
+MODULE_DESCRIPTION("torture test facility for locking");
MODULE_LICENSE("GPL");
MODULE_AUTHOR("Paul E. McKenney <paulmck@linux.ibm.com>");
diff --git a/kernel/locking/qspinlock.c b/kernel/locking/qspinlock.c
index 1df5fef8a656..7d96bed718e4 100644
--- a/kernel/locking/qspinlock.c
+++ b/kernel/locking/qspinlock.c
@@ -583,7 +583,7 @@ EXPORT_SYMBOL(queued_spin_lock_slowpath);
#include "qspinlock_paravirt.h"
#include "qspinlock.c"
-bool nopvspin __initdata;
+bool nopvspin;
static __init int parse_nopvspin(char *arg)
{
nopvspin = true;
diff --git a/kernel/locking/rwsem.c b/kernel/locking/rwsem.c
index c6d17aee4209..33cac79e3994 100644
--- a/kernel/locking/rwsem.c
+++ b/kernel/locking/rwsem.c
@@ -1297,7 +1297,7 @@ static inline int __down_read_trylock(struct rw_semaphore *sem)
/*
* lock for writing
*/
-static inline int __down_write_common(struct rw_semaphore *sem, int state)
+static __always_inline int __down_write_common(struct rw_semaphore *sem, int state)
{
int ret = 0;
@@ -1310,12 +1310,12 @@ static inline int __down_write_common(struct rw_semaphore *sem, int state)
return ret;
}
-static inline void __down_write(struct rw_semaphore *sem)
+static __always_inline void __down_write(struct rw_semaphore *sem)
{
__down_write_common(sem, TASK_UNINTERRUPTIBLE);
}
-static inline int __down_write_killable(struct rw_semaphore *sem)
+static __always_inline int __down_write_killable(struct rw_semaphore *sem)
{
return __down_write_common(sem, TASK_KILLABLE);
}
diff --git a/kernel/locking/spinlock.c b/kernel/locking/spinlock.c
index 8475a0794f8c..438c6086d540 100644
--- a/kernel/locking/spinlock.c
+++ b/kernel/locking/spinlock.c
@@ -413,3 +413,11 @@ notrace int in_lock_functions(unsigned long addr)
&& addr < (unsigned long)__lock_text_end;
}
EXPORT_SYMBOL(in_lock_functions);
+
+#if defined(CONFIG_PROVE_LOCKING) && defined(CONFIG_PREEMPT_RT)
+void notrace lockdep_assert_in_softirq_func(void)
+{
+ lockdep_assert_in_softirq();
+}
+EXPORT_SYMBOL(lockdep_assert_in_softirq_func);
+#endif
diff --git a/kernel/module/main.c b/kernel/module/main.c
index d18a94b973e1..d9592195c5bb 100644
--- a/kernel/module/main.c
+++ b/kernel/module/main.c
@@ -2166,6 +2166,8 @@ static int find_module_sections(struct module *mod, struct load_info *info)
#endif
#ifdef CONFIG_DEBUG_INFO_BTF_MODULES
mod->btf_data = any_section_objs(info, ".BTF", 1, &mod->btf_data_size);
+ mod->btf_base_data = any_section_objs(info, ".BTF.base", 1,
+ &mod->btf_base_data_size);
#endif
#ifdef CONFIG_JUMP_LABEL
mod->jump_entries = section_objs(info, "__jump_table",
@@ -2590,8 +2592,9 @@ static noinline int do_init_module(struct module *mod)
}
#ifdef CONFIG_DEBUG_INFO_BTF_MODULES
- /* .BTF is not SHF_ALLOC and will get removed, so sanitize pointer */
+ /* .BTF is not SHF_ALLOC and will get removed, so sanitize pointers */
mod->btf_data = NULL;
+ mod->btf_base_data = NULL;
#endif
/*
* We want to free module_init, but be aware that kallsyms may be
diff --git a/kernel/panic.c b/kernel/panic.c
index 8bff183d6180..f861bedc1925 100644
--- a/kernel/panic.c
+++ b/kernel/panic.c
@@ -35,6 +35,7 @@
#include <linux/debugfs.h>
#include <linux/sysfs.h>
#include <linux/context_tracking.h>
+#include <linux/seq_buf.h>
#include <trace/events/error_report.h>
#include <asm/sections.h>
@@ -470,32 +471,83 @@ void panic(const char *fmt, ...)
EXPORT_SYMBOL(panic);
+#define TAINT_FLAG(taint, _c_true, _c_false, _module) \
+ [ TAINT_##taint ] = { \
+ .c_true = _c_true, .c_false = _c_false, \
+ .module = _module, \
+ .desc = #taint, \
+ }
+
/*
* TAINT_FORCED_RMMOD could be a per-module flag but the module
* is being removed anyway.
*/
const struct taint_flag taint_flags[TAINT_FLAGS_COUNT] = {
- [ TAINT_PROPRIETARY_MODULE ] = { 'P', 'G', true },
- [ TAINT_FORCED_MODULE ] = { 'F', ' ', true },
- [ TAINT_CPU_OUT_OF_SPEC ] = { 'S', ' ', false },
- [ TAINT_FORCED_RMMOD ] = { 'R', ' ', false },
- [ TAINT_MACHINE_CHECK ] = { 'M', ' ', false },
- [ TAINT_BAD_PAGE ] = { 'B', ' ', false },
- [ TAINT_USER ] = { 'U', ' ', false },
- [ TAINT_DIE ] = { 'D', ' ', false },
- [ TAINT_OVERRIDDEN_ACPI_TABLE ] = { 'A', ' ', false },
- [ TAINT_WARN ] = { 'W', ' ', false },
- [ TAINT_CRAP ] = { 'C', ' ', true },
- [ TAINT_FIRMWARE_WORKAROUND ] = { 'I', ' ', false },
- [ TAINT_OOT_MODULE ] = { 'O', ' ', true },
- [ TAINT_UNSIGNED_MODULE ] = { 'E', ' ', true },
- [ TAINT_SOFTLOCKUP ] = { 'L', ' ', false },
- [ TAINT_LIVEPATCH ] = { 'K', ' ', true },
- [ TAINT_AUX ] = { 'X', ' ', true },
- [ TAINT_RANDSTRUCT ] = { 'T', ' ', true },
- [ TAINT_TEST ] = { 'N', ' ', true },
+ TAINT_FLAG(PROPRIETARY_MODULE, 'P', 'G', true),
+ TAINT_FLAG(FORCED_MODULE, 'F', ' ', true),
+ TAINT_FLAG(CPU_OUT_OF_SPEC, 'S', ' ', false),
+ TAINT_FLAG(FORCED_RMMOD, 'R', ' ', false),
+ TAINT_FLAG(MACHINE_CHECK, 'M', ' ', false),
+ TAINT_FLAG(BAD_PAGE, 'B', ' ', false),
+ TAINT_FLAG(USER, 'U', ' ', false),
+ TAINT_FLAG(DIE, 'D', ' ', false),
+ TAINT_FLAG(OVERRIDDEN_ACPI_TABLE, 'A', ' ', false),
+ TAINT_FLAG(WARN, 'W', ' ', false),
+ TAINT_FLAG(CRAP, 'C', ' ', true),
+ TAINT_FLAG(FIRMWARE_WORKAROUND, 'I', ' ', false),
+ TAINT_FLAG(OOT_MODULE, 'O', ' ', true),
+ TAINT_FLAG(UNSIGNED_MODULE, 'E', ' ', true),
+ TAINT_FLAG(SOFTLOCKUP, 'L', ' ', false),
+ TAINT_FLAG(LIVEPATCH, 'K', ' ', true),
+ TAINT_FLAG(AUX, 'X', ' ', true),
+ TAINT_FLAG(RANDSTRUCT, 'T', ' ', true),
+ TAINT_FLAG(TEST, 'N', ' ', true),
};
+#undef TAINT_FLAG
+
+static void print_tainted_seq(struct seq_buf *s, bool verbose)
+{
+ const char *sep = "";
+ int i;
+
+ if (!tainted_mask) {
+ seq_buf_puts(s, "Not tainted");
+ return;
+ }
+
+ seq_buf_printf(s, "Tainted: ");
+ for (i = 0; i < TAINT_FLAGS_COUNT; i++) {
+ const struct taint_flag *t = &taint_flags[i];
+ bool is_set = test_bit(i, &tainted_mask);
+ char c = is_set ? t->c_true : t->c_false;
+
+ if (verbose) {
+ if (is_set) {
+ seq_buf_printf(s, "%s[%c]=%s", sep, c, t->desc);
+ sep = ", ";
+ }
+ } else {
+ seq_buf_putc(s, c);
+ }
+ }
+}
+
+static const char *_print_tainted(bool verbose)
+{
+ /* FIXME: what should the size be? */
+ static char buf[sizeof(taint_flags)];
+ struct seq_buf s;
+
+ BUILD_BUG_ON(ARRAY_SIZE(taint_flags) != TAINT_FLAGS_COUNT);
+
+ seq_buf_init(&s, buf, sizeof(buf));
+
+ print_tainted_seq(&s, verbose);
+
+ return seq_buf_str(&s);
+}
+
/**
* print_tainted - return a string to represent the kernel taint state.
*
@@ -506,25 +558,15 @@ const struct taint_flag taint_flags[TAINT_FLAGS_COUNT] = {
*/
const char *print_tainted(void)
{
- static char buf[TAINT_FLAGS_COUNT + sizeof("Tainted: ")];
-
- BUILD_BUG_ON(ARRAY_SIZE(taint_flags) != TAINT_FLAGS_COUNT);
-
- if (tainted_mask) {
- char *s;
- int i;
-
- s = buf + sprintf(buf, "Tainted: ");
- for (i = 0; i < TAINT_FLAGS_COUNT; i++) {
- const struct taint_flag *t = &taint_flags[i];
- *s++ = test_bit(i, &tainted_mask) ?
- t->c_true : t->c_false;
- }
- *s = 0;
- } else
- snprintf(buf, sizeof(buf), "Not tainted");
+ return _print_tainted(false);
+}
- return buf;
+/**
+ * print_tainted_verbose - A more verbose version of print_tainted()
+ */
+const char *print_tainted_verbose(void)
+{
+ return _print_tainted(true);
}
int test_taint(unsigned flag)
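
The rewritten helpers build the taint string through the seq_buf API instead of hand-rolled sprintf() pointer arithmetic. A tiny standalone sketch of that pattern (buffer size and content are arbitrary):

    char buf[64];
    struct seq_buf s;

    seq_buf_init(&s, buf, sizeof(buf));
    seq_buf_printf(&s, "Tainted: ");
    seq_buf_putc(&s, 'P');                 /* one flag character */
    pr_info("%s\n", seq_buf_str(&s));      /* seq_buf_str() NUL-terminates */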
diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c
index 25f3cf679b35..d70ab49d5b4a 100644
--- a/kernel/pid_namespace.c
+++ b/kernel/pid_namespace.c
@@ -249,24 +249,7 @@ void zap_pid_ns_processes(struct pid_namespace *pid_ns)
set_current_state(TASK_INTERRUPTIBLE);
if (pid_ns->pid_allocated == init_pids)
break;
- /*
- * Release tasks_rcu_exit_srcu to avoid following deadlock:
- *
- * 1) TASK A unshare(CLONE_NEWPID)
- * 2) TASK A fork() twice -> TASK B (child reaper for new ns)
- * and TASK C
- * 3) TASK B exits, kills TASK C, waits for TASK A to reap it
- * 4) TASK A calls synchronize_rcu_tasks()
- * -> synchronize_srcu(tasks_rcu_exit_srcu)
- * 5) *DEADLOCK*
- *
- * It is considered safe to release tasks_rcu_exit_srcu here
- * because we assume the current task can not be concurrently
- * reaped at this point.
- */
- exit_tasks_rcu_stop();
schedule();
- exit_tasks_rcu_start();
}
__set_current_state(TASK_RUNNING);
@@ -278,7 +261,7 @@ void zap_pid_ns_processes(struct pid_namespace *pid_ns)
}
#ifdef CONFIG_CHECKPOINT_RESTORE
-static int pid_ns_ctl_handler(struct ctl_table *table, int write,
+static int pid_ns_ctl_handler(const struct ctl_table *table, int write,
void *buffer, size_t *lenp, loff_t *ppos)
{
struct pid_namespace *pid_ns = task_active_pid_ns(current);
diff --git a/kernel/pid_sysctl.h b/kernel/pid_sysctl.h
index fe9fb991dc42..18ecaef6be41 100644
--- a/kernel/pid_sysctl.h
+++ b/kernel/pid_sysctl.h
@@ -5,7 +5,7 @@
#include <linux/pid_namespace.h>
#if defined(CONFIG_SYSCTL) && defined(CONFIG_MEMFD_CREATE)
-static int pid_mfd_noexec_dointvec_minmax(struct ctl_table *table,
+static int pid_mfd_noexec_dointvec_minmax(const struct ctl_table *table,
int write, void *buf, size_t *lenp, loff_t *ppos)
{
struct pid_namespace *ns = task_active_pid_ns(current);
diff --git a/kernel/power/swap.c b/kernel/power/swap.c
index 753b8dd42a59..82b884b67152 100644
--- a/kernel/power/swap.c
+++ b/kernel/power/swap.c
@@ -200,12 +200,11 @@ void free_all_swap_pages(int swap)
while ((node = swsusp_extents.rb_node)) {
struct swsusp_extent *ext;
- unsigned long offset;
ext = rb_entry(node, struct swsusp_extent, node);
rb_erase(node, &swsusp_extents);
- for (offset = ext->start; offset <= ext->end; offset++)
- swap_free(swp_entry(swap, offset));
+ swap_free_nr(swp_entry(swap, ext->start),
+ ext->end - ext->start + 1);
kfree(ext);
}
diff --git a/kernel/printk/console_cmdline.h b/kernel/printk/console_cmdline.h
index 3ca74ad391d6..0ab573b6d4dc 100644
--- a/kernel/printk/console_cmdline.h
+++ b/kernel/printk/console_cmdline.h
@@ -6,6 +6,7 @@ struct console_cmdline
{
char name[16]; /* Name of the driver */
int index; /* Minor dev. to use */
+ char devname[32]; /* DEVNAME:0.0 style device name */
bool user_specified; /* Specified by command line vs. platform */
char *options; /* Options for the driver */
#ifdef CONFIG_A11Y_BRAILLE_CONSOLE
diff --git a/kernel/printk/internal.h b/kernel/printk/internal.h
index 6c2afee5ef62..19dcc5832651 100644
--- a/kernel/printk/internal.h
+++ b/kernel/printk/internal.h
@@ -8,7 +8,7 @@
#if defined(CONFIG_PRINTK) && defined(CONFIG_SYSCTL)
void __init printk_sysctl_init(void);
-int devkmsg_sysctl_set_loglvl(struct ctl_table *table, int write,
+int devkmsg_sysctl_set_loglvl(const struct ctl_table *table, int write,
void *buffer, size_t *lenp, loff_t *ppos);
#else
#define printk_sysctl_init() do { } while (0)
diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c
index dddb15f48d59..054c0e7784fd 100644
--- a/kernel/printk/printk.c
+++ b/kernel/printk/printk.c
@@ -197,7 +197,7 @@ __setup("printk.devkmsg=", control_devkmsg);
char devkmsg_log_str[DEVKMSG_STR_MAX_SIZE] = "ratelimit";
#if defined(CONFIG_PRINTK) && defined(CONFIG_SYSCTL)
-int devkmsg_sysctl_set_loglvl(struct ctl_table *table, int write,
+int devkmsg_sysctl_set_loglvl(const struct ctl_table *table, int write,
void *buffer, size_t *lenp, loff_t *ppos)
{
char old_str[DEVKMSG_STR_MAX_SIZE];
@@ -2429,18 +2429,23 @@ static void set_user_specified(struct console_cmdline *c, bool user_specified)
console_set_on_cmdline = 1;
}
-static int __add_preferred_console(const char *name, const short idx, char *options,
+static int __add_preferred_console(const char *name, const short idx,
+ const char *devname, char *options,
char *brl_options, bool user_specified)
{
struct console_cmdline *c;
int i;
+ if (!name && !devname)
+ return -EINVAL;
+
/*
* We use a signed short index for struct console for device drivers to
* indicate a not yet assigned index or port. However, a negative index
- * value is not valid for preferred console.
+ * value is not valid when the console name and index are defined on
+ * the command line.
*/
- if (idx < 0)
+ if (name && idx < 0)
return -EINVAL;
/*
@@ -2448,9 +2453,10 @@ static int __add_preferred_console(const char *name, const short idx, char *opti
* if we have a slot free.
*/
for (i = 0, c = console_cmdline;
- i < MAX_CMDLINECONSOLES && c->name[0];
+ i < MAX_CMDLINECONSOLES && (c->name[0] || c->devname[0]);
i++, c++) {
- if (strcmp(c->name, name) == 0 && c->index == idx) {
+ if ((name && strcmp(c->name, name) == 0 && c->index == idx) ||
+ (devname && strcmp(c->devname, devname) == 0)) {
if (!brl_options)
preferred_console = i;
set_user_specified(c, user_specified);
@@ -2461,7 +2467,10 @@ static int __add_preferred_console(const char *name, const short idx, char *opti
return -E2BIG;
if (!brl_options)
preferred_console = i;
- strscpy(c->name, name, sizeof(c->name));
+ if (name)
+ strscpy(c->name, name);
+ if (devname)
+ strscpy(c->devname, devname);
c->options = options;
set_user_specified(c, user_specified);
braille_set_options(c, brl_options);
@@ -2486,8 +2495,13 @@ __setup("console_msg_format=", console_msg_format_setup);
*/
static int __init console_setup(char *str)
{
- char buf[sizeof(console_cmdline[0].name) + 4]; /* 4 for "ttyS" */
- char *s, *options, *brl_options = NULL;
+ static_assert(sizeof(console_cmdline[0].devname) >= sizeof(console_cmdline[0].name) + 4);
+ char buf[sizeof(console_cmdline[0].devname)];
+ char *brl_options = NULL;
+ char *ttyname = NULL;
+ char *devname = NULL;
+ char *options;
+ char *s;
int idx;
/*
@@ -2496,17 +2510,23 @@ static int __init console_setup(char *str)
* for exactly this purpose.
*/
if (str[0] == 0 || strcmp(str, "null") == 0) {
- __add_preferred_console("ttynull", 0, NULL, NULL, true);
+ __add_preferred_console("ttynull", 0, NULL, NULL, NULL, true);
return 1;
}
if (_braille_console_setup(&str, &brl_options))
return 1;
+ /* For a DEVNAME:0.0 style console the character device is unknown early */
+ if (strchr(str, ':'))
+ devname = buf;
+ else
+ ttyname = buf;
+
/*
* Decode str into name, index, options.
*/
- if (isdigit(str[0]))
+ if (ttyname && isdigit(str[0]))
scnprintf(buf, sizeof(buf), "ttyS%s", str);
else
strscpy(buf, str);
@@ -2523,12 +2543,18 @@ static int __init console_setup(char *str)
#endif
for (s = buf; *s; s++)
- if (isdigit(*s) || *s == ',')
+ if ((ttyname && isdigit(*s)) || *s == ',')
break;
- idx = simple_strtoul(s, NULL, 10);
+
+ /* @idx will get defined when devname matches. */
+ if (devname)
+ idx = -1;
+ else
+ idx = simple_strtoul(s, NULL, 10);
+
*s = 0;
- __add_preferred_console(buf, idx, options, brl_options, true);
+ __add_preferred_console(ttyname, idx, devname, options, brl_options, true);
return 1;
}
__setup("console=", console_setup);
@@ -2548,7 +2574,51 @@ __setup("console=", console_setup);
*/
int add_preferred_console(const char *name, const short idx, char *options)
{
- return __add_preferred_console(name, idx, options, NULL, false);
+ return __add_preferred_console(name, idx, NULL, options, NULL, false);
+}
+
+/**
+ * match_devname_and_update_preferred_console - Update a preferred console
+ * when matching devname is found.
+ * @devname: DEVNAME:0.0 style device name
+ * @name: Name of the corresponding console driver, e.g. "ttyS"
+ * @idx: Console index, e.g. port number.
+ *
+ * The function checks whether a device with the given @devname is
+ * preferred via the console=DEVNAME:0.0 command line option.
+ * It fills the missing console driver name and console index
+ * so that a later register_console() call could find (match)
+ * and enable this device.
+ *
+ * It might be used when a driver subsystem initializes particular
+ * devices with already known DEVNAME:0.0 style names, and can
+ * predict which console driver name and index this device
+ * will later be associated with.
+ *
+ * Return: 0 on success, negative error code on failure.
+ */
+int match_devname_and_update_preferred_console(const char *devname,
+ const char *name,
+ const short idx)
+{
+ struct console_cmdline *c = console_cmdline;
+ int i;
+
+ if (!devname || !strlen(devname) || !name || !strlen(name) || idx < 0)
+ return -EINVAL;
+
+ for (i = 0; i < MAX_CMDLINECONSOLES && (c->name[0] || c->devname[0]);
+ i++, c++) {
+ if (!strcmp(devname, c->devname)) {
+ pr_info("associate the preferred console \"%s\" with \"%s%d\"\n",
+ devname, name, idx);
+ strscpy(c->name, name);
+ c->index = idx;
+ return 0;
+ }
+ }
+
+ return -ENOENT;
}
bool console_suspend_enabled = true;
@@ -3318,8 +3388,11 @@ static int try_enable_preferred_console(struct console *newcon,
int i, err;
for (i = 0, c = console_cmdline;
- i < MAX_CMDLINECONSOLES && c->name[0];
+ i < MAX_CMDLINECONSOLES && (c->name[0] || c->devname[0]);
i++, c++) {
+ /* Console not yet initialized? */
+ if (!c->name[0])
+ continue;
if (c->user_specified != user_specified)
continue;
if (!newcon->match ||
@@ -4299,15 +4372,15 @@ void kmsg_dump_rewind(struct kmsg_dump_iter *iter)
EXPORT_SYMBOL_GPL(kmsg_dump_rewind);
/**
- * console_replay_all - replay kernel log on consoles
+ * console_try_replay_all - try to replay kernel log on consoles
*
* Try to obtain lock on console subsystem and replay all
* available records in printk buffer on the consoles.
* Does nothing if lock is not obtained.
*
- * Context: Any context.
+ * Context: Any, except for NMI.
*/
-void console_replay_all(void)
+void console_try_replay_all(void)
{
if (console_trylock()) {
__console_rewind_all();
diff --git a/kernel/printk/sysctl.c b/kernel/printk/sysctl.c
index 3e47dedce9e5..f5072dc85f7a 100644
--- a/kernel/printk/sysctl.c
+++ b/kernel/printk/sysctl.c
@@ -11,7 +11,7 @@
static const int ten_thousand = 10000;
-static int proc_dointvec_minmax_sysadmin(struct ctl_table *table, int write,
+static int proc_dointvec_minmax_sysadmin(const struct ctl_table *table, int write,
void *buffer, size_t *lenp, loff_t *ppos)
{
if (write && !capable(CAP_SYS_ADMIN))
diff --git a/kernel/rcu/rcuscale.c b/kernel/rcu/rcuscale.c
index 8db4fedaaa1e..b53a9e8f5904 100644
--- a/kernel/rcu/rcuscale.c
+++ b/kernel/rcu/rcuscale.c
@@ -42,6 +42,7 @@
#include "rcu.h"
+MODULE_DESCRIPTION("Read-Copy Update module-based scalability-test facility");
MODULE_LICENSE("GPL");
MODULE_AUTHOR("Paul E. McKenney <paulmck@linux.ibm.com>");
diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c
index 807fbf6123a7..08bf7c669dd3 100644
--- a/kernel/rcu/rcutorture.c
+++ b/kernel/rcu/rcutorture.c
@@ -51,6 +51,7 @@
#include "rcu.h"
+MODULE_DESCRIPTION("Read-Copy Update module-based torture test facility");
MODULE_LICENSE("GPL");
MODULE_AUTHOR("Paul E. McKenney <paulmck@linux.ibm.com> and Josh Triplett <josh@joshtriplett.org>");
@@ -390,6 +391,7 @@ struct rcu_torture_ops {
int extendables;
int slow_gps;
int no_pi_lock;
+ int debug_objects;
const char *name;
};
@@ -577,6 +579,7 @@ static struct rcu_torture_ops rcu_ops = {
.irq_capable = 1,
.can_boost = IS_ENABLED(CONFIG_RCU_BOOST),
.extendables = RCUTORTURE_MAX_EXTEND,
+ .debug_objects = 1,
.name = "rcu"
};
@@ -747,6 +750,7 @@ static struct rcu_torture_ops srcu_ops = {
.cbflood_max = 50000,
.irq_capable = 1,
.no_pi_lock = IS_ENABLED(CONFIG_TINY_SRCU),
+ .debug_objects = 1,
.name = "srcu"
};
@@ -786,6 +790,7 @@ static struct rcu_torture_ops srcud_ops = {
.cbflood_max = 50000,
.irq_capable = 1,
.no_pi_lock = IS_ENABLED(CONFIG_TINY_SRCU),
+ .debug_objects = 1,
.name = "srcud"
};
@@ -2626,7 +2631,7 @@ static void rcu_torture_fwd_cb_cr(struct rcu_head *rhp)
spin_lock_irqsave(&rfp->rcu_fwd_lock, flags);
rfcpp = rfp->rcu_fwd_cb_tail;
rfp->rcu_fwd_cb_tail = &rfcp->rfc_next;
- WRITE_ONCE(*rfcpp, rfcp);
+ smp_store_release(rfcpp, rfcp);
WRITE_ONCE(rfp->n_launders_cb, rfp->n_launders_cb + 1);
i = ((jiffies - rfp->rcu_fwd_startat) / (HZ / FWD_CBS_HIST_DIV));
if (i >= ARRAY_SIZE(rfp->n_launders_hist))
@@ -3455,7 +3460,6 @@ rcu_torture_cleanup(void)
cur_ops->gp_slow_unregister(NULL);
}
-#ifdef CONFIG_DEBUG_OBJECTS_RCU_HEAD
static void rcu_torture_leak_cb(struct rcu_head *rhp)
{
}
@@ -3473,7 +3477,6 @@ static void rcu_torture_err_cb(struct rcu_head *rhp)
*/
pr_alert("%s: duplicated callback was invoked.\n", KBUILD_MODNAME);
}
-#endif /* #ifdef CONFIG_DEBUG_OBJECTS_RCU_HEAD */
/*
* Verify that double-free causes debug-objects to complain, but only
@@ -3482,39 +3485,43 @@ static void rcu_torture_err_cb(struct rcu_head *rhp)
*/
static void rcu_test_debug_objects(void)
{
-#ifdef CONFIG_DEBUG_OBJECTS_RCU_HEAD
struct rcu_head rh1;
struct rcu_head rh2;
+ int idx;
+
+ if (!IS_ENABLED(CONFIG_DEBUG_OBJECTS_RCU_HEAD)) {
+ pr_alert("%s: !CONFIG_DEBUG_OBJECTS_RCU_HEAD, not testing duplicate call_%s()\n",
+ KBUILD_MODNAME, cur_ops->name);
+ return;
+ }
+
+ if (WARN_ON_ONCE(cur_ops->debug_objects &&
+ (!cur_ops->call || !cur_ops->cb_barrier)))
+ return;
+
struct rcu_head *rhp = kmalloc(sizeof(*rhp), GFP_KERNEL);
init_rcu_head_on_stack(&rh1);
init_rcu_head_on_stack(&rh2);
- pr_alert("%s: WARN: Duplicate call_rcu() test starting.\n", KBUILD_MODNAME);
+ pr_alert("%s: WARN: Duplicate call_%s() test starting.\n", KBUILD_MODNAME, cur_ops->name);
/* Try to queue the rh2 pair of callbacks for the same grace period. */
- preempt_disable(); /* Prevent preemption from interrupting test. */
- rcu_read_lock(); /* Make it impossible to finish a grace period. */
- call_rcu_hurry(&rh1, rcu_torture_leak_cb); /* Start grace period. */
- local_irq_disable(); /* Make it harder to start a new grace period. */
- call_rcu_hurry(&rh2, rcu_torture_leak_cb);
- call_rcu_hurry(&rh2, rcu_torture_err_cb); /* Duplicate callback. */
+ idx = cur_ops->readlock(); /* Make it impossible to finish a grace period. */
+ cur_ops->call(&rh1, rcu_torture_leak_cb); /* Start grace period. */
+ cur_ops->call(&rh2, rcu_torture_leak_cb);
+ cur_ops->call(&rh2, rcu_torture_err_cb); /* Duplicate callback. */
if (rhp) {
- call_rcu_hurry(rhp, rcu_torture_leak_cb);
- call_rcu_hurry(rhp, rcu_torture_err_cb); /* Another duplicate callback. */
+ cur_ops->call(rhp, rcu_torture_leak_cb);
+ cur_ops->call(rhp, rcu_torture_err_cb); /* Another duplicate callback. */
}
- local_irq_enable();
- rcu_read_unlock();
- preempt_enable();
+ cur_ops->readunlock(idx);
/* Wait for them all to get done so we can safely return. */
- rcu_barrier();
- pr_alert("%s: WARN: Duplicate call_rcu() test complete.\n", KBUILD_MODNAME);
+ cur_ops->cb_barrier();
+ pr_alert("%s: WARN: Duplicate call_%s() test complete.\n", KBUILD_MODNAME, cur_ops->name);
destroy_rcu_head_on_stack(&rh1);
destroy_rcu_head_on_stack(&rh2);
kfree(rhp);
-#else /* #ifdef CONFIG_DEBUG_OBJECTS_RCU_HEAD */
- pr_alert("%s: !CONFIG_DEBUG_OBJECTS_RCU_HEAD, not testing duplicate call_rcu()\n", KBUILD_MODNAME);
-#endif /* #else #ifdef CONFIG_DEBUG_OBJECTS_RCU_HEAD */
}
static void rcutorture_sync(void)
diff --git a/kernel/rcu/refscale.c b/kernel/rcu/refscale.c
index 2c2648a3ad30..f4ea5b1ec068 100644
--- a/kernel/rcu/refscale.c
+++ b/kernel/rcu/refscale.c
@@ -63,6 +63,7 @@ do { \
#define SCALEOUT_ERRSTRING(s, x...) pr_alert("%s" SCALE_FLAG "!!! " s "\n", scale_type, ## x)
+MODULE_DESCRIPTION("Scalability test for object reference mechanisms");
MODULE_LICENSE("GPL");
MODULE_AUTHOR("Joel Fernandes (Google) <joel@joelfernandes.org>");
diff --git a/kernel/rcu/srcutiny.c b/kernel/rcu/srcutiny.c
index 5afd5cf494db..549c03336ee9 100644
--- a/kernel/rcu/srcutiny.c
+++ b/kernel/rcu/srcutiny.c
@@ -277,7 +277,8 @@ bool poll_state_synchronize_srcu(struct srcu_struct *ssp, unsigned long cookie)
unsigned long cur_s = READ_ONCE(ssp->srcu_idx);
barrier();
- return ULONG_CMP_GE(cur_s, cookie) || ULONG_CMP_LT(cur_s, cookie - 3);
+ return cookie == SRCU_GET_STATE_COMPLETED ||
+ ULONG_CMP_GE(cur_s, cookie) || ULONG_CMP_LT(cur_s, cookie - 3);
}
EXPORT_SYMBOL_GPL(poll_state_synchronize_srcu);
diff --git a/kernel/rcu/srcutree.c b/kernel/rcu/srcutree.c
index bc4b58b0204e..b24db425f16d 100644
--- a/kernel/rcu/srcutree.c
+++ b/kernel/rcu/srcutree.c
@@ -667,7 +667,10 @@ void cleanup_srcu_struct(struct srcu_struct *ssp)
pr_info("%s: Active srcu_struct %p read state: %d gp state: %lu/%lu\n",
__func__, ssp, rcu_seq_state(READ_ONCE(sup->srcu_gp_seq)),
rcu_seq_current(&sup->srcu_gp_seq), sup->srcu_gp_seq_needed);
- return; /* Caller forgot to stop doing call_srcu()? */
+ return; // Caller forgot to stop doing call_srcu()?
+ // Or caller invoked start_poll_synchronize_srcu()
+ // and then cleanup_srcu_struct() before that grace
+ // period ended?
}
kfree(sup->node);
sup->node = NULL;
@@ -845,7 +848,6 @@ static void srcu_gp_end(struct srcu_struct *ssp)
bool cbs;
bool last_lvl;
int cpu;
- unsigned long flags;
unsigned long gpseq;
int idx;
unsigned long mask;
@@ -907,12 +909,12 @@ static void srcu_gp_end(struct srcu_struct *ssp)
if (!(gpseq & counter_wrap_check))
for_each_possible_cpu(cpu) {
sdp = per_cpu_ptr(ssp->sda, cpu);
- spin_lock_irqsave_rcu_node(sdp, flags);
+ spin_lock_irq_rcu_node(sdp);
if (ULONG_CMP_GE(gpseq, sdp->srcu_gp_seq_needed + 100))
sdp->srcu_gp_seq_needed = gpseq;
if (ULONG_CMP_GE(gpseq, sdp->srcu_gp_seq_needed_exp + 100))
sdp->srcu_gp_seq_needed_exp = gpseq;
- spin_unlock_irqrestore_rcu_node(sdp, flags);
+ spin_unlock_irq_rcu_node(sdp);
}
/* Callback initiation done, allow grace periods after next. */
@@ -1540,7 +1542,8 @@ EXPORT_SYMBOL_GPL(start_poll_synchronize_srcu);
*/
bool poll_state_synchronize_srcu(struct srcu_struct *ssp, unsigned long cookie)
{
- if (!rcu_seq_done(&ssp->srcu_sup->srcu_gp_seq, cookie))
+ if (cookie != SRCU_GET_STATE_COMPLETED &&
+ !rcu_seq_done(&ssp->srcu_sup->srcu_gp_seq, cookie))
return false;
// Ensure that the end of the SRCU grace period happens before
// any subsequent code that the caller might execute.
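
A minimal polling sketch built only from the APIs touched above (the my_* names are hypothetical): a cookie pre-initialized to SRCU_GET_STATE_COMPLETED now reports completion immediately, so it can double as a "nothing pending" sentinel.

static unsigned long my_cookie = SRCU_GET_STATE_COMPLETED;

static void my_start_gp(struct srcu_struct *ssp)
{
	my_cookie = start_poll_synchronize_srcu(ssp);	/* kick off a GP */
}

static bool my_gp_done(struct srcu_struct *ssp)
{
	/* True for a completed cookie or for the COMPLETED sentinel. */
	return poll_state_synchronize_srcu(ssp, my_cookie);
}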
diff --git a/kernel/rcu/sync.c b/kernel/rcu/sync.c
index 6c2bd9001adc..da60a9947c00 100644
--- a/kernel/rcu/sync.c
+++ b/kernel/rcu/sync.c
@@ -122,7 +122,7 @@ void rcu_sync_enter(struct rcu_sync *rsp)
* we are called at early boot time but this shouldn't happen.
*/
}
- WRITE_ONCE(rsp->gp_count, rsp->gp_count + 1);
+ rsp->gp_count++;
spin_unlock_irq(&rsp->rss_lock);
if (gp_state == GP_IDLE) {
@@ -151,15 +151,11 @@ void rcu_sync_enter(struct rcu_sync *rsp)
*/
void rcu_sync_exit(struct rcu_sync *rsp)
{
- int gpc;
-
WARN_ON_ONCE(READ_ONCE(rsp->gp_state) == GP_IDLE);
- WARN_ON_ONCE(READ_ONCE(rsp->gp_count) == 0);
spin_lock_irq(&rsp->rss_lock);
- gpc = rsp->gp_count - 1;
- WRITE_ONCE(rsp->gp_count, gpc);
- if (!gpc) {
+ WARN_ON_ONCE(rsp->gp_count == 0);
+ if (!--rsp->gp_count) {
if (rsp->gp_state == GP_PASSED) {
WRITE_ONCE(rsp->gp_state, GP_EXIT);
rcu_sync_call(rsp);
@@ -178,10 +174,10 @@ void rcu_sync_dtor(struct rcu_sync *rsp)
{
int gp_state;
- WARN_ON_ONCE(READ_ONCE(rsp->gp_count));
WARN_ON_ONCE(READ_ONCE(rsp->gp_state) == GP_PASSED);
spin_lock_irq(&rsp->rss_lock);
+ WARN_ON_ONCE(rsp->gp_count);
if (rsp->gp_state == GP_REPLAY)
WRITE_ONCE(rsp->gp_state, GP_EXIT);
gp_state = rsp->gp_state;
diff --git a/kernel/rcu/tasks.h b/kernel/rcu/tasks.h
index e1bf33018e6d..ba3440a45b6d 100644
--- a/kernel/rcu/tasks.h
+++ b/kernel/rcu/tasks.h
@@ -858,7 +858,7 @@ static void rcu_tasks_wait_gp(struct rcu_tasks *rtp)
// not know to synchronize with this RCU Tasks grace period) have
// completed exiting. The synchronize_rcu() in rcu_tasks_postgp()
// will take care of any tasks stuck in the non-preemptible region
-// of do_exit() following its call to exit_tasks_rcu_stop().
+// of do_exit() following its call to exit_tasks_rcu_finish().
// check_all_holdout_tasks(), repeatedly until holdout list is empty:
// Scans the holdout list, attempting to identify a quiescent state
// for each task on the list. If there is a quiescent state, the
@@ -1220,7 +1220,7 @@ void exit_tasks_rcu_start(void)
* Remove the task from the "yet another list" because do_exit() is now
* non-preemptible, allowing synchronize_rcu() to wait beyond this point.
*/
-void exit_tasks_rcu_stop(void)
+void exit_tasks_rcu_finish(void)
{
unsigned long flags;
struct rcu_tasks_percpu *rtpcp;
@@ -1231,22 +1231,12 @@ void exit_tasks_rcu_stop(void)
raw_spin_lock_irqsave_rcu_node(rtpcp, flags);
list_del_init(&t->rcu_tasks_exit_list);
raw_spin_unlock_irqrestore_rcu_node(rtpcp, flags);
-}
-/*
- * Contribute to protect against tasklist scan blind spot while the
- * task is exiting and may be removed from the tasklist. See
- * corresponding synchronize_srcu() for further details.
- */
-void exit_tasks_rcu_finish(void)
-{
- exit_tasks_rcu_stop();
- exit_tasks_rcu_finish_trace(current);
+ exit_tasks_rcu_finish_trace(t);
}
#else /* #ifdef CONFIG_TASKS_RCU */
void exit_tasks_rcu_start(void) { }
-void exit_tasks_rcu_stop(void) { }
void exit_tasks_rcu_finish(void) { exit_tasks_rcu_finish_trace(current); }
#endif /* #else #ifdef CONFIG_TASKS_RCU */
@@ -1757,6 +1747,16 @@ static void rcu_tasks_trace_pregp_step(struct list_head *hop)
// allow safe access to the hop list.
for_each_online_cpu(cpu) {
rcu_read_lock();
+ // Note that cpu_curr_snapshot() picks up the target
+ // CPU's current task while its runqueue is locked with
+ // an smp_mb__after_spinlock(). This ensures that either
+ // the grace-period kthread will see that task's read-side
+ // critical section or the task will see the updater's pre-GP
+ // accesses. The trailing smp_mb() in cpu_curr_snapshot()
+ // does not currently play a role other than to simplify
+ // that function's ordering semantics. If these simplified
+ // ordering semantics continue to be redundant, that smp_mb()
+ // might be removed.
t = cpu_curr_snapshot(cpu);
if (rcu_tasks_trace_pertask_prep(t, true))
trc_add_holdout(t, hop);
diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index 28c7031711a3..e641cc681901 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -96,6 +96,7 @@ static struct rcu_state rcu_state = {
.ofl_lock = __ARCH_SPIN_LOCK_UNLOCKED,
.srs_cleanup_work = __WORK_INITIALIZER(rcu_state.srs_cleanup_work,
rcu_sr_normal_gp_cleanup_work),
+ .srs_cleanups_pending = ATOMIC_INIT(0),
};
/* Dump rcu_node combining tree at boot to verify correct setup. */
@@ -175,6 +176,9 @@ static int gp_init_delay;
module_param(gp_init_delay, int, 0444);
static int gp_cleanup_delay;
module_param(gp_cleanup_delay, int, 0444);
+static int nohz_full_patience_delay;
+module_param(nohz_full_patience_delay, int, 0444);
+static int nohz_full_patience_delay_jiffies;
// Add delay to rcu_read_unlock() for strict grace periods.
static int rcu_unlock_delay;
@@ -296,16 +300,6 @@ static void rcu_dynticks_eqs_online(void)
}
/*
- * Snapshot the ->dynticks counter with full ordering so as to allow
- * stable comparison of this counter with past and future snapshots.
- */
-static int rcu_dynticks_snap(int cpu)
-{
- smp_mb(); // Fundamental RCU ordering guarantee.
- return ct_dynticks_cpu_acquire(cpu);
-}
-
-/*
* Return true if the snapshot returned from rcu_dynticks_snap()
* indicates that RCU is in an extended quiescent state.
*/
@@ -321,7 +315,15 @@ static bool rcu_dynticks_in_eqs(int snap)
*/
static bool rcu_dynticks_in_eqs_since(struct rcu_data *rdp, int snap)
{
- return snap != rcu_dynticks_snap(rdp->cpu);
+ /*
+ * The first failing snapshot is already ordered against the accesses
+ * performed by the remote CPU after it exits idle.
+ *
+ * The second snapshot therefore only needs to order against accesses
+ * performed by the remote CPU prior to entering idle and therefore can
+ * rely solely on acquire semantics.
+ */
+ return snap != ct_dynticks_cpu_acquire(rdp->cpu);
}
/*
@@ -769,7 +771,18 @@ static void rcu_gpnum_ovf(struct rcu_node *rnp, struct rcu_data *rdp)
*/
static int dyntick_save_progress_counter(struct rcu_data *rdp)
{
- rdp->dynticks_snap = rcu_dynticks_snap(rdp->cpu);
+ /*
+ * Full ordering between remote CPU's post idle accesses and updater's
+ * accesses prior to current GP (and also the started GP sequence number)
+ * is enforced by rcu_seq_start() implicit barrier and even further by
+ * smp_mb__after_unlock_lock() barriers chained all the way throughout the
+ * rnp locking tree since rcu_gp_init() and up to the current leaf rnp
+ * locking.
+ *
+ * Ordering between remote CPU's pre idle accesses and post grace period
+ * updater's accesses is enforced by the below acquire semantic.
+ */
+ rdp->dynticks_snap = ct_dynticks_cpu_acquire(rdp->cpu);
if (rcu_dynticks_in_eqs(rdp->dynticks_snap)) {
trace_rcu_fqs(rcu_state.name, rdp->gp_seq, rdp->cpu, TPS("dti"));
rcu_gpnum_ovf(rdp->mynode, rdp);
@@ -1660,6 +1673,9 @@ static void rcu_sr_normal_gp_cleanup_work(struct work_struct *work)
rcu_sr_put_wait_head(rcu);
}
+
+ /* Order list manipulations with atomic access. */
+ atomic_dec_return_release(&rcu_state.srs_cleanups_pending);
}
/*
@@ -1667,7 +1683,7 @@ static void rcu_sr_normal_gp_cleanup_work(struct work_struct *work)
*/
static void rcu_sr_normal_gp_cleanup(void)
{
- struct llist_node *wait_tail, *next, *rcu;
+ struct llist_node *wait_tail, *next = NULL, *rcu = NULL;
int done = 0;
wait_tail = rcu_state.srs_wait_tail;
@@ -1693,16 +1709,34 @@ static void rcu_sr_normal_gp_cleanup(void)
break;
}
- // concurrent sr_normal_gp_cleanup work might observe this update.
- smp_store_release(&rcu_state.srs_done_tail, wait_tail);
+ /*
+ * Fast path: no more users to process except putting the second-last
+ * wait head, and only if there are no in-flight workers. If there are
+ * in-flight workers, they will remove the last wait head.
+ *
+ * Note that the ACQUIRE orders atomic access with list manipulation.
+ */
+ if (wait_tail->next && wait_tail->next->next == NULL &&
+ rcu_sr_is_wait_head(wait_tail->next) &&
+ !atomic_read_acquire(&rcu_state.srs_cleanups_pending)) {
+ rcu_sr_put_wait_head(wait_tail->next);
+ wait_tail->next = NULL;
+ }
+
+ /* Concurrent sr_normal_gp_cleanup work might observe this update. */
ASSERT_EXCLUSIVE_WRITER(rcu_state.srs_done_tail);
+ smp_store_release(&rcu_state.srs_done_tail, wait_tail);
/*
* We schedule work in order to perform a final processing of
* outstanding users (if any are still left) and to release the
* wait-heads added by rcu_sr_normal_gp_init().
*/
- queue_work(sync_wq, &rcu_state.srs_cleanup_work);
+ if (wait_tail->next) {
+ atomic_inc(&rcu_state.srs_cleanups_pending);
+ if (!queue_work(sync_wq, &rcu_state.srs_cleanup_work))
+ atomic_dec(&rcu_state.srs_cleanups_pending);
+ }
}
/*
@@ -1810,7 +1844,7 @@ static noinline_for_stack bool rcu_gp_init(void)
WRITE_ONCE(rcu_state.gp_state, RCU_GP_ONOFF);
/* Exclude CPU hotplug operations. */
rcu_for_each_leaf_node(rnp) {
- local_irq_save(flags);
+ local_irq_disable();
arch_spin_lock(&rcu_state.ofl_lock);
raw_spin_lock_rcu_node(rnp);
if (rnp->qsmaskinit == rnp->qsmaskinitnext &&
@@ -1818,7 +1852,7 @@ static noinline_for_stack bool rcu_gp_init(void)
/* Nothing to do on this leaf rcu_node structure. */
raw_spin_unlock_rcu_node(rnp);
arch_spin_unlock(&rcu_state.ofl_lock);
- local_irq_restore(flags);
+ local_irq_enable();
continue;
}
@@ -1855,7 +1889,7 @@ static noinline_for_stack bool rcu_gp_init(void)
raw_spin_unlock_rcu_node(rnp);
arch_spin_unlock(&rcu_state.ofl_lock);
- local_irq_restore(flags);
+ local_irq_enable();
}
rcu_gp_slow(gp_preinit_delay); /* Races with CPU hotplug. */
@@ -4313,11 +4347,15 @@ static int rcu_pending(int user)
return 1;
/* Is this a nohz_full CPU in userspace or idle? (Ignore RCU if so.) */
- if ((user || rcu_is_cpu_rrupt_from_idle()) && rcu_nohz_full_cpu())
+ gp_in_progress = rcu_gp_in_progress();
+ if ((user || rcu_is_cpu_rrupt_from_idle() ||
+ (gp_in_progress &&
+ time_before(jiffies, READ_ONCE(rcu_state.gp_start) +
+ nohz_full_patience_delay_jiffies))) &&
+ rcu_nohz_full_cpu())
return 0;
/* Is the RCU core waiting for a quiescent state from this CPU? */
- gp_in_progress = rcu_gp_in_progress();
if (rdp->core_needs_qs && !rdp->cpu_no_qs.b.norm && gp_in_progress)
return 1;
@@ -4767,7 +4805,7 @@ rcu_boot_init_percpu_data(int cpu)
rdp->grpmask = leaf_node_cpu_bit(rdp->mynode, cpu);
INIT_WORK(&rdp->strict_work, strict_work_handler);
WARN_ON_ONCE(ct->dynticks_nesting != 1);
- WARN_ON_ONCE(rcu_dynticks_in_eqs(rcu_dynticks_snap(cpu)));
+ WARN_ON_ONCE(rcu_dynticks_in_eqs(ct_dynticks_cpu(cpu)));
rdp->barrier_seq_snap = rcu_state.barrier_sequence;
rdp->rcu_ofl_gp_seq = rcu_state.gp_seq;
rdp->rcu_ofl_gp_state = RCU_GP_CLEANED;
@@ -5110,11 +5148,15 @@ void rcutree_migrate_callbacks(int cpu)
struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu);
bool needwake;
- if (rcu_rdp_is_offloaded(rdp) ||
- rcu_segcblist_empty(&rdp->cblist))
- return; /* No callbacks to migrate. */
+ if (rcu_rdp_is_offloaded(rdp))
+ return;
raw_spin_lock_irqsave(&rcu_state.barrier_lock, flags);
+ if (rcu_segcblist_empty(&rdp->cblist)) {
+ raw_spin_unlock_irqrestore(&rcu_state.barrier_lock, flags);
+ return; /* No callbacks to migrate. */
+ }
+
WARN_ON_ONCE(rcu_rdp_cpu_online(rdp));
rcu_barrier_entrain(rdp);
my_rdp = this_cpu_ptr(&rcu_data);
diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h
index bae7925c497f..fcf2b4aa3441 100644
--- a/kernel/rcu/tree.h
+++ b/kernel/rcu/tree.h
@@ -223,7 +223,6 @@ struct rcu_data {
struct swait_queue_head nocb_state_wq; /* For offloading state changes */
struct task_struct *nocb_gp_kthread;
raw_spinlock_t nocb_lock; /* Guard following pair of fields. */
- atomic_t nocb_lock_contended; /* Contention experienced. */
int nocb_defer_wakeup; /* Defer wakeup of nocb_kthread. */
struct timer_list nocb_timer; /* Enforce finite deferral. */
unsigned long nocb_gp_adv_time; /* Last call_rcu() CB adv (jiffies). */
@@ -420,6 +419,7 @@ struct rcu_state {
struct llist_node *srs_done_tail; /* ready for GP users. */
struct sr_wait_node srs_wait_nodes[SR_NORMAL_GP_WAIT_HEAD_MAX];
struct work_struct srs_cleanup_work;
+ atomic_t srs_cleanups_pending; /* srs inflight worker cleanups. */
};
/* Values for rcu_state structure's gp_flags field. */
diff --git a/kernel/rcu/tree_exp.h b/kernel/rcu/tree_exp.h
index 8a1d9c8bd9f7..4acd29d16fdb 100644
--- a/kernel/rcu/tree_exp.h
+++ b/kernel/rcu/tree_exp.h
@@ -265,7 +265,12 @@ static bool sync_exp_work_done(unsigned long s)
{
if (rcu_exp_gp_seq_done(s)) {
trace_rcu_exp_grace_period(rcu_state.name, s, TPS("done"));
- smp_mb(); /* Ensure test happens before caller kfree(). */
+ /*
+ * Order GP completion with preceding accesses. Order also GP
+ * completion with post GP update side accesses. Pairs with
+ * rcu_seq_end().
+ */
+ smp_mb();
return true;
}
return false;
@@ -357,7 +362,21 @@ static void __sync_rcu_exp_select_node_cpus(struct rcu_exp_work *rewp)
!(rnp->qsmaskinitnext & mask)) {
mask_ofl_test |= mask;
} else {
- snap = rcu_dynticks_snap(cpu);
+ /*
+ * Full ordering between remote CPU's post idle accesses
+ * and updater's accesses prior to current GP (and also
+ * the started GP sequence number) is enforced by
+ * rcu_seq_start() implicit barrier, relayed by kworkers
+ * locking and even further by smp_mb__after_unlock_lock()
+ * barriers chained all the way throughout the rnp locking
+ * tree since sync_exp_reset_tree() and up to the current
+ * leaf rnp locking.
+ *
+ * Ordering between remote CPU's pre idle accesses and
+ * post grace period updater's accesses is enforced by the
+ * below acquire semantic.
+ */
+ snap = ct_dynticks_cpu_acquire(cpu);
if (rcu_dynticks_in_eqs(snap))
mask_ofl_test |= mask;
else
@@ -953,7 +972,6 @@ void synchronize_rcu_expedited(void)
rnp = rcu_get_root();
wait_event(rnp->exp_wq[rcu_seq_ctr(s) & 0x3],
sync_exp_work_done(s));
- smp_mb(); /* Work actions happen before return. */
/* Let the next expedited grace period start. */
mutex_unlock(&rcu_state.exp_mutex);
diff --git a/kernel/rcu/tree_nocb.h b/kernel/rcu/tree_nocb.h
index 3f85577bddd4..3ce30841119a 100644
--- a/kernel/rcu/tree_nocb.h
+++ b/kernel/rcu/tree_nocb.h
@@ -91,8 +91,7 @@ module_param(nocb_nobypass_lim_per_jiffy, int, 0);
/*
* Acquire the specified rcu_data structure's ->nocb_bypass_lock. If the
- * lock isn't immediately available, increment ->nocb_lock_contended to
- * flag the contention.
+ * lock isn't immediately available, perform a minimal sanity check.
*/
static void rcu_nocb_bypass_lock(struct rcu_data *rdp)
__acquires(&rdp->nocb_bypass_lock)
@@ -100,29 +99,12 @@ static void rcu_nocb_bypass_lock(struct rcu_data *rdp)
lockdep_assert_irqs_disabled();
if (raw_spin_trylock(&rdp->nocb_bypass_lock))
return;
- atomic_inc(&rdp->nocb_lock_contended);
+ /*
+ * Contention is expected only when a local enqueue collides with
+ * a remote flush from kthreads.
+ */
WARN_ON_ONCE(smp_processor_id() != rdp->cpu);
- smp_mb__after_atomic(); /* atomic_inc() before lock. */
raw_spin_lock(&rdp->nocb_bypass_lock);
- smp_mb__before_atomic(); /* atomic_dec() after lock. */
- atomic_dec(&rdp->nocb_lock_contended);
-}
-
-/*
- * Spinwait until the specified rcu_data structure's ->nocb_lock is
- * not contended. Please note that this is extremely special-purpose,
- * relying on the fact that at most two kthreads and one CPU contend for
- * this lock, and also that the two kthreads are guaranteed to have frequent
- * grace-period-duration time intervals between successive acquisitions
- * of the lock. This allows us to use an extremely simple throttling
- * mechanism, and further to apply it only to the CPU doing floods of
- * call_rcu() invocations. Don't try this at home!
- */
-static void rcu_nocb_wait_contended(struct rcu_data *rdp)
-{
- WARN_ON_ONCE(smp_processor_id() != rdp->cpu);
- while (WARN_ON_ONCE(atomic_read(&rdp->nocb_lock_contended)))
- cpu_relax();
}
/*
@@ -510,7 +492,6 @@ static bool rcu_nocb_try_bypass(struct rcu_data *rdp, struct rcu_head *rhp,
}
// We need to use the bypass.
- rcu_nocb_wait_contended(rdp);
rcu_nocb_bypass_lock(rdp);
ncbs = rcu_cblist_n_cbs(&rdp->nocb_bypass);
rcu_segcblist_inc_len(&rdp->cblist); /* Must precede enqueue. */
@@ -635,8 +616,7 @@ static void call_rcu_nocb(struct rcu_data *rdp, struct rcu_head *head,
}
}
-static int nocb_gp_toggle_rdp(struct rcu_data *rdp,
- bool *wake_state)
+static int nocb_gp_toggle_rdp(struct rcu_data *rdp)
{
struct rcu_segcblist *cblist = &rdp->cblist;
unsigned long flags;
@@ -650,8 +630,6 @@ static int nocb_gp_toggle_rdp(struct rcu_data *rdp,
* We will handle this rdp until it ever gets de-offloaded.
*/
rcu_segcblist_set_flags(cblist, SEGCBLIST_KTHREAD_GP);
- if (rcu_segcblist_test_flags(cblist, SEGCBLIST_KTHREAD_CB))
- *wake_state = true;
ret = 1;
} else if (!rcu_segcblist_test_flags(cblist, SEGCBLIST_OFFLOADED) &&
rcu_segcblist_test_flags(cblist, SEGCBLIST_KTHREAD_GP)) {
@@ -660,8 +638,6 @@ static int nocb_gp_toggle_rdp(struct rcu_data *rdp,
* We will ignore this rdp until it ever gets re-offloaded.
*/
rcu_segcblist_clear_flags(cblist, SEGCBLIST_KTHREAD_GP);
- if (!rcu_segcblist_test_flags(cblist, SEGCBLIST_KTHREAD_CB))
- *wake_state = true;
ret = 0;
} else {
WARN_ON_ONCE(1);
@@ -877,16 +853,15 @@ static void nocb_gp_wait(struct rcu_data *my_rdp)
}
if (rdp_toggling) {
- bool wake_state = false;
int ret;
- ret = nocb_gp_toggle_rdp(rdp_toggling, &wake_state);
+ ret = nocb_gp_toggle_rdp(rdp_toggling);
if (ret == 1)
list_add_tail(&rdp_toggling->nocb_entry_rdp, &my_rdp->nocb_head_rdp);
else if (ret == 0)
list_del(&rdp_toggling->nocb_entry_rdp);
- if (wake_state)
- swake_up_one(&rdp_toggling->nocb_state_wq);
+
+ swake_up_one(&rdp_toggling->nocb_state_wq);
}
my_rdp->nocb_gp_seq = -1;
@@ -913,16 +888,9 @@ static int rcu_nocb_gp_kthread(void *arg)
return 0;
}
-static inline bool nocb_cb_can_run(struct rcu_data *rdp)
-{
- u8 flags = SEGCBLIST_OFFLOADED | SEGCBLIST_KTHREAD_CB;
-
- return rcu_segcblist_test_flags(&rdp->cblist, flags);
-}
-
static inline bool nocb_cb_wait_cond(struct rcu_data *rdp)
{
- return nocb_cb_can_run(rdp) && !READ_ONCE(rdp->nocb_cb_sleep);
+ return !READ_ONCE(rdp->nocb_cb_sleep) || kthread_should_park();
}
/*
@@ -934,21 +902,19 @@ static void nocb_cb_wait(struct rcu_data *rdp)
struct rcu_segcblist *cblist = &rdp->cblist;
unsigned long cur_gp_seq;
unsigned long flags;
- bool needwake_state = false;
bool needwake_gp = false;
- bool can_sleep = true;
struct rcu_node *rnp = rdp->mynode;
- do {
- swait_event_interruptible_exclusive(rdp->nocb_cb_wq,
- nocb_cb_wait_cond(rdp));
-
- if (READ_ONCE(rdp->nocb_cb_sleep)) {
- WARN_ON(signal_pending(current));
- trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("WokeEmpty"));
- }
- } while (!nocb_cb_can_run(rdp));
+ swait_event_interruptible_exclusive(rdp->nocb_cb_wq,
+ nocb_cb_wait_cond(rdp));
+ if (kthread_should_park()) {
+ kthread_parkme();
+ } else if (READ_ONCE(rdp->nocb_cb_sleep)) {
+ WARN_ON(signal_pending(current));
+ trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("WokeEmpty"));
+ }
+ WARN_ON_ONCE(!rcu_rdp_is_offloaded(rdp));
local_irq_save(flags);
rcu_momentary_dyntick_idle();
@@ -971,37 +937,16 @@ static void nocb_cb_wait(struct rcu_data *rdp)
raw_spin_unlock_rcu_node(rnp); /* irqs remain disabled. */
}
- if (rcu_segcblist_test_flags(cblist, SEGCBLIST_OFFLOADED)) {
- if (!rcu_segcblist_test_flags(cblist, SEGCBLIST_KTHREAD_CB)) {
- rcu_segcblist_set_flags(cblist, SEGCBLIST_KTHREAD_CB);
- if (rcu_segcblist_test_flags(cblist, SEGCBLIST_KTHREAD_GP))
- needwake_state = true;
- }
- if (rcu_segcblist_ready_cbs(cblist))
- can_sleep = false;
+ if (!rcu_segcblist_ready_cbs(cblist)) {
+ WRITE_ONCE(rdp->nocb_cb_sleep, true);
+ trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("CBSleep"));
} else {
- /*
- * De-offloading. Clear our flag and notify the de-offload worker.
- * We won't touch the callbacks and keep sleeping until we ever
- * get re-offloaded.
- */
- WARN_ON_ONCE(!rcu_segcblist_test_flags(cblist, SEGCBLIST_KTHREAD_CB));
- rcu_segcblist_clear_flags(cblist, SEGCBLIST_KTHREAD_CB);
- if (!rcu_segcblist_test_flags(cblist, SEGCBLIST_KTHREAD_GP))
- needwake_state = true;
+ WRITE_ONCE(rdp->nocb_cb_sleep, false);
}
- WRITE_ONCE(rdp->nocb_cb_sleep, can_sleep);
-
- if (rdp->nocb_cb_sleep)
- trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("CBSleep"));
-
rcu_nocb_unlock_irqrestore(rdp, flags);
if (needwake_gp)
rcu_gp_kthread_wake();
-
- if (needwake_state)
- swake_up_one(&rdp->nocb_state_wq);
}
/*
@@ -1094,17 +1039,8 @@ static int rdp_offload_toggle(struct rcu_data *rdp,
bool wake_gp = false;
rcu_segcblist_offload(cblist, offload);
-
- if (rdp->nocb_cb_sleep)
- rdp->nocb_cb_sleep = false;
rcu_nocb_unlock_irqrestore(rdp, flags);
- /*
- * Ignore former value of nocb_cb_sleep and force wake up as it could
- * have been spuriously set to false already.
- */
- swake_up_one(&rdp->nocb_cb_wq);
-
raw_spin_lock_irqsave(&rdp_gp->nocb_gp_lock, flags);
// Queue this rdp for add/del to/from the list to iterate on rcuog
WRITE_ONCE(rdp_gp->nocb_toggling_rdp, rdp);
@@ -1161,19 +1097,11 @@ static long rcu_nocb_rdp_deoffload(void *arg)
if (wake_gp)
wake_up_process(rdp_gp->nocb_gp_kthread);
- /*
- * If rcuo[p] kthread spawn failed, directly remove SEGCBLIST_KTHREAD_CB.
- * Just wait SEGCBLIST_KTHREAD_GP to be cleared by rcuog.
- */
- if (!rdp->nocb_cb_kthread) {
- rcu_nocb_lock_irqsave(rdp, flags);
- rcu_segcblist_clear_flags(&rdp->cblist, SEGCBLIST_KTHREAD_CB);
- rcu_nocb_unlock_irqrestore(rdp, flags);
- }
-
swait_event_exclusive(rdp->nocb_state_wq,
- !rcu_segcblist_test_flags(cblist,
- SEGCBLIST_KTHREAD_CB | SEGCBLIST_KTHREAD_GP));
+ !rcu_segcblist_test_flags(cblist,
+ SEGCBLIST_KTHREAD_GP));
+ if (rdp->nocb_cb_kthread)
+ kthread_park(rdp->nocb_cb_kthread);
} else {
/*
* No kthread to clear the flags for us or remove the rdp from the nocb list
@@ -1181,8 +1109,7 @@ static long rcu_nocb_rdp_deoffload(void *arg)
* but we stick to paranoia in this rare path.
*/
rcu_nocb_lock_irqsave(rdp, flags);
- rcu_segcblist_clear_flags(&rdp->cblist,
- SEGCBLIST_KTHREAD_CB | SEGCBLIST_KTHREAD_GP);
+ rcu_segcblist_clear_flags(&rdp->cblist, SEGCBLIST_KTHREAD_GP);
rcu_nocb_unlock_irqrestore(rdp, flags);
list_del(&rdp->nocb_entry_rdp);
@@ -1282,8 +1209,10 @@ static long rcu_nocb_rdp_offload(void *arg)
wake_gp = rdp_offload_toggle(rdp, true, flags);
if (wake_gp)
wake_up_process(rdp_gp->nocb_gp_kthread);
+
+ kthread_unpark(rdp->nocb_cb_kthread);
+
swait_event_exclusive(rdp->nocb_state_wq,
- rcu_segcblist_test_flags(cblist, SEGCBLIST_KTHREAD_CB) &&
rcu_segcblist_test_flags(cblist, SEGCBLIST_KTHREAD_GP));
/*
@@ -1468,7 +1397,7 @@ void __init rcu_init_nohz(void)
if (rcu_segcblist_empty(&rdp->cblist))
rcu_segcblist_init(&rdp->cblist);
rcu_segcblist_offload(&rdp->cblist, true);
- rcu_segcblist_set_flags(&rdp->cblist, SEGCBLIST_KTHREAD_CB | SEGCBLIST_KTHREAD_GP);
+ rcu_segcblist_set_flags(&rdp->cblist, SEGCBLIST_KTHREAD_GP);
rcu_segcblist_clear_flags(&rdp->cblist, SEGCBLIST_RCU_CORE);
}
rcu_organize_nocb_kthreads();
@@ -1526,11 +1455,16 @@ static void rcu_spawn_cpu_nocb_kthread(int cpu)
mutex_unlock(&rdp_gp->nocb_gp_kthread_mutex);
/* Spawn the kthread for this CPU. */
- t = kthread_run(rcu_nocb_cb_kthread, rdp,
- "rcuo%c/%d", rcu_state.abbr, cpu);
+ t = kthread_create(rcu_nocb_cb_kthread, rdp,
+ "rcuo%c/%d", rcu_state.abbr, cpu);
if (WARN_ONCE(IS_ERR(t), "%s: Could not start rcuo CB kthread, OOM is now expected behavior\n", __func__))
goto end;
+ if (rcu_rdp_is_offloaded(rdp))
+ wake_up_process(t);
+ else
+ kthread_park(t);
+
if (IS_ENABLED(CONFIG_RCU_NOCB_CPU_CB_BOOST) && kthread_prio)
sched_setscheduler_nocheck(t, SCHED_FIFO, &sp);
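
For context on the park/unpark conversion above, this is the generic pattern (a sketch, not the rcuo kthread itself; do_work() is a hypothetical stand-in): the controller creates the thread and gates it with kthread_park()/kthread_unpark(), while the thread cooperates via kthread_should_park()/kthread_parkme().

static void do_work(void *arg) { /* hypothetical payload */ }

static int my_worker(void *arg)
{
	while (!kthread_should_stop()) {
		if (kthread_should_park())
			kthread_parkme();	/* sleeps until kthread_unpark() */
		do_work(arg);
	}
	return 0;
}

The hunk above follows the same shape: the kthread is created without being woken, woken only when the rdp is offloaded, and parked otherwise.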
@@ -1678,12 +1612,11 @@ static void show_rcu_nocb_state(struct rcu_data *rdp)
sprintf(bufw, "%ld", rsclp->gp_seq[RCU_WAIT_TAIL]);
sprintf(bufr, "%ld", rsclp->gp_seq[RCU_NEXT_READY_TAIL]);
- pr_info(" CB %d^%d->%d %c%c%c%c%c%c F%ld L%ld C%d %c%c%s%c%s%c%c q%ld %c CPU %d%s\n",
+ pr_info(" CB %d^%d->%d %c%c%c%c%c F%ld L%ld C%d %c%c%s%c%s%c%c q%ld %c CPU %d%s\n",
rdp->cpu, rdp->nocb_gp_rdp->cpu,
nocb_next_rdp ? nocb_next_rdp->cpu : -1,
"kK"[!!rdp->nocb_cb_kthread],
"bB"[raw_spin_is_locked(&rdp->nocb_bypass_lock)],
- "cC"[!!atomic_read(&rdp->nocb_lock_contended)],
"lL"[raw_spin_is_locked(&rdp->nocb_lock)],
"sS"[!!rdp->nocb_cb_sleep],
".W"[swait_active(&rdp->nocb_cb_wq)],
diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h
index 340bbefe5f65..c569da65b421 100644
--- a/kernel/rcu/tree_plugin.h
+++ b/kernel/rcu/tree_plugin.h
@@ -28,8 +28,8 @@ static bool rcu_rdp_is_offloaded(struct rcu_data *rdp)
!(lockdep_is_held(&rcu_state.barrier_mutex) ||
(IS_ENABLED(CONFIG_HOTPLUG_CPU) && lockdep_is_cpus_held()) ||
rcu_lockdep_is_held_nocb(rdp) ||
- (rdp == this_cpu_ptr(&rcu_data) &&
- !(IS_ENABLED(CONFIG_PREEMPT_COUNT) && preemptible())) ||
+ (!(IS_ENABLED(CONFIG_PREEMPT_COUNT) && preemptible()) &&
+ rdp == this_cpu_ptr(&rcu_data)) ||
rcu_current_is_nocb_kthread(rdp)),
"Unsafe read of RCU_NOCB offloaded state"
);
@@ -93,6 +93,16 @@ static void __init rcu_bootup_announce_oddness(void)
pr_info("\tRCU debug GP init slowdown %d jiffies.\n", gp_init_delay);
if (gp_cleanup_delay)
pr_info("\tRCU debug GP cleanup slowdown %d jiffies.\n", gp_cleanup_delay);
+ if (nohz_full_patience_delay < 0) {
+ pr_info("\tRCU NOCB CPU patience negative (%d), resetting to zero.\n", nohz_full_patience_delay);
+ nohz_full_patience_delay = 0;
+ } else if (nohz_full_patience_delay > 5 * MSEC_PER_SEC) {
+ pr_info("\tRCU NOCB CPU patience too large (%d), resetting to %ld.\n", nohz_full_patience_delay, 5 * MSEC_PER_SEC);
+ nohz_full_patience_delay = 5 * MSEC_PER_SEC;
+ } else if (nohz_full_patience_delay) {
+ pr_info("\tRCU NOCB CPU patience set to %d milliseconds.\n", nohz_full_patience_delay);
+ }
+ nohz_full_patience_delay_jiffies = msecs_to_jiffies(nohz_full_patience_delay);
if (!use_softirq)
pr_info("\tRCU_SOFTIRQ processing moved to rcuc kthreads.\n");
if (IS_ENABLED(CONFIG_RCU_EQS_DEBUG))
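
In practice the new knob is a module parameter of the RCU core, so (assuming the usual rcutree. prefix for kernel/rcu/tree.c parameters) it would typically be set on the kernel command line, for example rcutree.nohz_full_patience_delay=1000 for one second of patience; values are clamped to the 0..5000 ms range as shown above.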
diff --git a/kernel/rcu/tree_stall.h b/kernel/rcu/tree_stall.h
index 460efecd077b..4b0e9d7c4c68 100644
--- a/kernel/rcu/tree_stall.h
+++ b/kernel/rcu/tree_stall.h
@@ -501,7 +501,7 @@ static void print_cpu_stall_info(int cpu)
}
delta = rcu_seq_ctr(rdp->mynode->gp_seq - rdp->rcu_iw_gp_seq);
falsepositive = rcu_is_gp_kthread_starving(NULL) &&
- rcu_dynticks_in_eqs(rcu_dynticks_snap(cpu));
+ rcu_dynticks_in_eqs(ct_dynticks_cpu(cpu));
rcuc_starved = rcu_is_rcuc_kthread_starving(rdp, &j);
if (rcuc_starved)
// Print signed value, as negative values indicate a probable bug.
@@ -515,7 +515,7 @@ static void print_cpu_stall_info(int cpu)
rdp->rcu_iw_pending ? (int)min(delta, 9UL) + '0' :
"!."[!delta],
ticks_value, ticks_title,
- rcu_dynticks_snap(cpu) & 0xffff,
+ ct_dynticks_cpu(cpu) & 0xffff,
ct_dynticks_nesting_cpu(cpu), ct_dynticks_nmi_nesting_cpu(cpu),
rdp->softirq_snap, kstat_softirqs_cpu(RCU_SOFTIRQ, cpu),
data_race(rcu_state.n_force_qs) - rcu_state.n_force_qs_gpstart,
diff --git a/kernel/resource.c b/kernel/resource.c
index fcbca39dbc45..14777afb0a99 100644
--- a/kernel/resource.c
+++ b/kernel/resource.c
@@ -48,14 +48,6 @@ struct resource iomem_resource = {
};
EXPORT_SYMBOL(iomem_resource);
-/* constraints to be met while allocating resources */
-struct resource_constraint {
- resource_size_t min, max, align;
- resource_size_t (*alignf)(void *, const struct resource *,
- resource_size_t, resource_size_t);
- void *alignf_data;
-};
-
static DEFINE_RWLOCK(resource_lock);
static struct resource *next_resource(struct resource *p, bool skip_children)
@@ -610,14 +602,6 @@ void __weak arch_remove_reservations(struct resource *avail)
{
}
-static resource_size_t simple_align_resource(void *data,
- const struct resource *avail,
- resource_size_t size,
- resource_size_t align)
-{
- return avail->start;
-}
-
static void resource_clip(struct resource *res, resource_size_t min,
resource_size_t max)
{
@@ -628,16 +612,16 @@ static void resource_clip(struct resource *res, resource_size_t min,
}
/*
- * Find empty slot in the resource tree with the given range and
+ * Find empty space in the resource tree with the given range and
* alignment constraints
*/
-static int __find_resource(struct resource *root, struct resource *old,
- struct resource *new,
- resource_size_t size,
- struct resource_constraint *constraint)
+static int __find_resource_space(struct resource *root, struct resource *old,
+ struct resource *new, resource_size_t size,
+ struct resource_constraint *constraint)
{
struct resource *this = root->child;
struct resource tmp = *new, avail, alloc;
+ resource_alignf alignf = constraint->alignf;
tmp.start = root->start;
/*
@@ -666,8 +650,12 @@ static int __find_resource(struct resource *root, struct resource *old,
avail.flags = new->flags & ~IORESOURCE_UNSET;
if (avail.start >= tmp.start) {
alloc.flags = avail.flags;
- alloc.start = constraint->alignf(constraint->alignf_data, &avail,
- size, constraint->align);
+ if (alignf) {
+ alloc.start = alignf(constraint->alignf_data,
+ &avail, size, constraint->align);
+ } else {
+ alloc.start = avail.start;
+ }
alloc.end = alloc.start + size - 1;
if (alloc.start <= alloc.end &&
resource_contains(&avail, &alloc)) {
@@ -687,15 +675,27 @@ next: if (!this || this->end == root->end)
return -EBUSY;
}
-/*
- * Find empty slot in the resource tree given range and alignment.
+/**
+ * find_resource_space - Find empty space in the resource tree
+ * @root: Root resource descriptor
+ * @new: Resource descriptor awaiting an empty resource space
+ * @size: The minimum size of the empty space
+ * @constraint: The range and alignment constraints to be met
+ *
+ * Finds an empty space under @root in the resource tree that satisfies the
+ * range and alignment @constraint.
+ *
+ * Return:
+ * * %0 - if successful, @new members start, end, and flags are altered.
+ * * %-EBUSY - if no empty space was found.
*/
-static int find_resource(struct resource *root, struct resource *new,
+int find_resource_space(struct resource *root, struct resource *new,
resource_size_t size,
- struct resource_constraint *constraint)
+ struct resource_constraint *constraint)
{
- return __find_resource(root, NULL, new, size, constraint);
+ return __find_resource_space(root, NULL, new, size, constraint);
}
+EXPORT_SYMBOL_GPL(find_resource_space);
/**
* reallocate_resource - allocate a slot in the resource tree given range & alignment.
@@ -717,7 +717,7 @@ static int reallocate_resource(struct resource *root, struct resource *old,
write_lock(&resource_lock);
- if ((err = __find_resource(root, old, &new, newsize, constraint)))
+ if ((err = __find_resource_space(root, old, &new, newsize, constraint)))
goto out;
if (resource_contains(&new, old)) {
@@ -761,18 +761,12 @@ out:
int allocate_resource(struct resource *root, struct resource *new,
resource_size_t size, resource_size_t min,
resource_size_t max, resource_size_t align,
- resource_size_t (*alignf)(void *,
- const struct resource *,
- resource_size_t,
- resource_size_t),
+ resource_alignf alignf,
void *alignf_data)
{
int err;
struct resource_constraint constraint;
- if (!alignf)
- alignf = simple_align_resource;
-
constraint.min = min;
constraint.max = max;
constraint.align = align;
@@ -786,7 +780,7 @@ int allocate_resource(struct resource *root, struct resource *new,
}
write_lock(&resource_lock);
- err = find_resource(root, new, size, &constraint);
+ err = find_resource_space(root, new, size, &constraint);
if (err >= 0 && __request_resource(root, new))
err = -EBUSY;
write_unlock(&resource_lock);
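
A minimal caller sketch of the reworked interface (the my_* names are hypothetical; the key point is that alignf may now be NULL instead of the removed simple_align_resource()):

static struct resource my_res = {
	.name  = "my-device",
	.flags = IORESOURCE_MEM,
};

static int my_claim_region(void)
{
	/* 4 KiB anywhere below 4 GiB, 4 KiB aligned, no alignf callback. */
	return allocate_resource(&iomem_resource, &my_res, SZ_4K,
				 0, 0xffffffff, SZ_4K, NULL, NULL);
}

Since the constraint structure was removed from resource.c and find_resource_space() is now exported, callers outside this file can presumably build a struct resource_constraint themselves and perform the search without issuing the request.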
diff --git a/kernel/resource_kunit.c b/kernel/resource_kunit.c
index 58ab9f914602..0e509985a44a 100644
--- a/kernel/resource_kunit.c
+++ b/kernel/resource_kunit.c
@@ -149,4 +149,5 @@ static struct kunit_suite resource_test_suite = {
};
kunit_test_suite(resource_test_suite);
+MODULE_DESCRIPTION("I/O Port & Memory Resource manager unit tests");
MODULE_LICENSE("GPL");
diff --git a/kernel/scftorture.c b/kernel/scftorture.c
index 59032aaccd18..44e83a646264 100644
--- a/kernel/scftorture.c
+++ b/kernel/scftorture.c
@@ -43,6 +43,7 @@
#define SCFTORTOUT_ERRSTRING(s, x...) pr_alert(SCFTORT_FLAG "!!! " s "\n", ## x)
+MODULE_DESCRIPTION("Torture tests on the smp_call_function() family of primitives");
MODULE_LICENSE("GPL");
MODULE_AUTHOR("Paul E. McKenney <paulmck@kernel.org>");
@@ -67,7 +68,7 @@ torture_param(int, weight_many_wait, -1, "Testing weight for multi-CPU operation
torture_param(int, weight_all, -1, "Testing weight for all-CPU no-wait operations.");
torture_param(int, weight_all_wait, -1, "Testing weight for all-CPU operations.");
-char *torture_type = "";
+static char *torture_type = "";
#ifdef MODULE
# define SCFTORT_SHUTDOWN 0
diff --git a/kernel/sched/build_policy.c b/kernel/sched/build_policy.c
index d9dc9ab3773f..39c315182b35 100644
--- a/kernel/sched/build_policy.c
+++ b/kernel/sched/build_policy.c
@@ -52,3 +52,4 @@
#include "cputime.c"
#include "deadline.c"
+#include "syscalls.c"
diff --git a/kernel/sched/clock.c b/kernel/sched/clock.c
index 3c6193de9cde..a09655b48140 100644
--- a/kernel/sched/clock.c
+++ b/kernel/sched/clock.c
@@ -340,7 +340,7 @@ again:
this_clock = sched_clock_local(my_scd);
/*
* We must enforce atomic readout on 32-bit, otherwise the
- * update on the remote CPU can hit inbetween the readout of
+ * update on the remote CPU can hit in between the readout of
* the low 32-bit and the high 32-bit portion.
*/
remote_clock = cmpxchg64(&scd->clock, 0, 0);
@@ -444,7 +444,7 @@ notrace void sched_clock_tick_stable(void)
}
/*
- * We are going deep-idle (irqs are disabled):
+ * We are going deep-idle (IRQs are disabled):
*/
notrace void sched_clock_idle_sleep_event(void)
{
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index bcf2c4cc0522..a9f655025607 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -2,9 +2,10 @@
/*
* kernel/sched/core.c
*
- * Core kernel scheduler code and related syscalls
+ * Core kernel CPU scheduler code
*
* Copyright (C) 1991-2002 Linus Torvalds
+ * Copyright (C) 1998-2024 Ingo Molnar, Red Hat
*/
#include <linux/highmem.h>
#include <linux/hrtimer_api.h>
@@ -706,14 +707,14 @@ static void update_rq_clock_task(struct rq *rq, s64 delta)
/*
* Since irq_time is only updated on {soft,}irq_exit, we might run into
* this case when a previous update_rq_clock() happened inside a
- * {soft,}irq region.
+ * {soft,}IRQ region.
*
* When this happens, we stop ->clock_task and only update the
* prev_irq_time stamp to account for the part that fit, so that a next
* update will consume the rest. This ensures ->clock_task is
* monotonic.
*
- * It does however cause some slight miss-attribution of {soft,}irq
+ * It does, however, cause some slight misattribution of {soft,}IRQ
* time, a more accurate solution would be to update the irq_time using
* the current rq->clock timestamp, except that would require using
* atomic ops.
@@ -723,7 +724,6 @@ static void update_rq_clock_task(struct rq *rq, s64 delta)
rq->prev_irq_time += irq_delta;
delta -= irq_delta;
- psi_account_irqtime(rq->curr, irq_delta);
delayacct_irq(rq->curr, irq_delta);
#endif
#ifdef CONFIG_PARAVIRT_TIME_ACCOUNTING
@@ -826,7 +826,7 @@ static void __hrtick_start(void *arg)
/*
* Called to set the hrtick timer state.
*
- * called with rq->lock held and irqs disabled
+ * called with rq->lock held and IRQs disabled
*/
void hrtick_start(struct rq *rq, u64 delay)
{
@@ -850,7 +850,7 @@ void hrtick_start(struct rq *rq, u64 delay)
/*
* Called to set the hrtick timer state.
*
- * called with rq->lock held and irqs disabled
+ * called with rq->lock held and IRQs disabled
*/
void hrtick_start(struct rq *rq, u64 delay)
{
@@ -884,7 +884,7 @@ static inline void hrtick_rq_init(struct rq *rq)
#endif /* CONFIG_SCHED_HRTICK */
/*
- * cmpxchg based fetch_or, macro so it works for different integer types
+ * try_cmpxchg based fetch_or() macro so it works for different integer types:
*/
#define fetch_or(ptr, mask) \
({ \
@@ -1081,7 +1081,7 @@ void resched_cpu(int cpu)
*
* We don't do similar optimization for completely idle system, as
* selecting an idle CPU will add more delays to the timers than intended
- * (as that CPU's timer base may not be uptodate wrt jiffies etc).
+ * (as that CPU's timer base may not be up to date wrt jiffies etc).
*/
int get_nohz_timer_target(void)
{
@@ -1141,7 +1141,7 @@ static void wake_up_idle_cpu(int cpu)
* nohz functions that would need to follow TIF_NR_POLLING
* clearing:
*
- * - On most archs, a simple fetch_or on ti::flags with a
+ * - On most architectures, a simple fetch_or on ti::flags with a
* "0" value would be enough to know if an IPI needs to be sent.
*
* - x86 needs to perform a last need_resched() check between
@@ -1324,30 +1324,27 @@ int tg_nop(struct task_group *tg, void *data)
}
#endif
-static void set_load_weight(struct task_struct *p, bool update_load)
+void set_load_weight(struct task_struct *p, bool update_load)
{
int prio = p->static_prio - MAX_RT_PRIO;
- struct load_weight *load = &p->se.load;
+ struct load_weight lw;
- /*
- * SCHED_IDLE tasks get minimal weight:
- */
if (task_has_idle_policy(p)) {
- load->weight = scale_load(WEIGHT_IDLEPRIO);
- load->inv_weight = WMULT_IDLEPRIO;
- return;
+ lw.weight = scale_load(WEIGHT_IDLEPRIO);
+ lw.inv_weight = WMULT_IDLEPRIO;
+ } else {
+ lw.weight = scale_load(sched_prio_to_weight[prio]);
+ lw.inv_weight = sched_prio_to_wmult[prio];
}
/*
* SCHED_OTHER tasks have to update their load when changing their
* weight
*/
- if (update_load && p->sched_class == &fair_sched_class) {
- reweight_task(p, prio);
- } else {
- load->weight = scale_load(sched_prio_to_weight[prio]);
- load->inv_weight = sched_prio_to_wmult[prio];
- }
+ if (update_load && p->sched_class == &fair_sched_class)
+ reweight_task(p, &lw);
+ else
+ p->se.load = lw;
}
#ifdef CONFIG_UCLAMP_TASK
@@ -1384,7 +1381,7 @@ static unsigned int __maybe_unused sysctl_sched_uclamp_util_max = SCHED_CAPACITY
* This knob will not override the system default sched_util_clamp_min defined
* above.
*/
-static unsigned int sysctl_sched_uclamp_util_min_rt_default = SCHED_CAPACITY_SCALE;
+unsigned int sysctl_sched_uclamp_util_min_rt_default = SCHED_CAPACITY_SCALE;
/* All clamps are required to be less or equal than these values */
static struct uclamp_se uclamp_default[UCLAMP_CNT];
@@ -1409,32 +1406,6 @@ static struct uclamp_se uclamp_default[UCLAMP_CNT];
*/
DEFINE_STATIC_KEY_FALSE(sched_uclamp_used);
-/* Integer rounded range for each bucket */
-#define UCLAMP_BUCKET_DELTA DIV_ROUND_CLOSEST(SCHED_CAPACITY_SCALE, UCLAMP_BUCKETS)
-
-#define for_each_clamp_id(clamp_id) \
- for ((clamp_id) = 0; (clamp_id) < UCLAMP_CNT; (clamp_id)++)
-
-static inline unsigned int uclamp_bucket_id(unsigned int clamp_value)
-{
- return min_t(unsigned int, clamp_value / UCLAMP_BUCKET_DELTA, UCLAMP_BUCKETS - 1);
-}
-
-static inline unsigned int uclamp_none(enum uclamp_id clamp_id)
-{
- if (clamp_id == UCLAMP_MIN)
- return 0;
- return SCHED_CAPACITY_SCALE;
-}
-
-static inline void uclamp_se_set(struct uclamp_se *uc_se,
- unsigned int value, bool user_defined)
-{
- uc_se->value = value;
- uc_se->bucket_id = uclamp_bucket_id(value);
- uc_se->user_defined = user_defined;
-}
-
static inline unsigned int
uclamp_idle_value(struct rq *rq, enum uclamp_id clamp_id,
unsigned int clamp_value)
@@ -1676,7 +1647,7 @@ static inline void uclamp_rq_dec_id(struct rq *rq, struct task_struct *p,
rq_clamp = uclamp_rq_get(rq, clamp_id);
/*
* Defensive programming: this should never happen. If it happens,
- * e.g. due to future modification, warn and fixup the expected value.
+ * e.g. due to future modification, warn and fix up the expected value.
*/
SCHED_WARN_ON(bucket->value > rq_clamp);
if (bucket->value >= rq_clamp) {
@@ -1835,7 +1806,7 @@ static void uclamp_sync_util_min_rt_default(void)
uclamp_update_util_min_rt_default(p);
}
-static int sysctl_sched_uclamp_handler(struct ctl_table *table, int write,
+static int sysctl_sched_uclamp_handler(const struct ctl_table *table, int write,
void *buffer, size_t *lenp, loff_t *ppos)
{
bool update_root_tg = false;
@@ -1898,107 +1869,6 @@ undo:
}
#endif
-static int uclamp_validate(struct task_struct *p,
- const struct sched_attr *attr)
-{
- int util_min = p->uclamp_req[UCLAMP_MIN].value;
- int util_max = p->uclamp_req[UCLAMP_MAX].value;
-
- if (attr->sched_flags & SCHED_FLAG_UTIL_CLAMP_MIN) {
- util_min = attr->sched_util_min;
-
- if (util_min + 1 > SCHED_CAPACITY_SCALE + 1)
- return -EINVAL;
- }
-
- if (attr->sched_flags & SCHED_FLAG_UTIL_CLAMP_MAX) {
- util_max = attr->sched_util_max;
-
- if (util_max + 1 > SCHED_CAPACITY_SCALE + 1)
- return -EINVAL;
- }
-
- if (util_min != -1 && util_max != -1 && util_min > util_max)
- return -EINVAL;
-
- /*
- * We have valid uclamp attributes; make sure uclamp is enabled.
- *
- * We need to do that here, because enabling static branches is a
- * blocking operation which obviously cannot be done while holding
- * scheduler locks.
- */
- static_branch_enable(&sched_uclamp_used);
-
- return 0;
-}
-
-static bool uclamp_reset(const struct sched_attr *attr,
- enum uclamp_id clamp_id,
- struct uclamp_se *uc_se)
-{
- /* Reset on sched class change for a non user-defined clamp value. */
- if (likely(!(attr->sched_flags & SCHED_FLAG_UTIL_CLAMP)) &&
- !uc_se->user_defined)
- return true;
-
- /* Reset on sched_util_{min,max} == -1. */
- if (clamp_id == UCLAMP_MIN &&
- attr->sched_flags & SCHED_FLAG_UTIL_CLAMP_MIN &&
- attr->sched_util_min == -1) {
- return true;
- }
-
- if (clamp_id == UCLAMP_MAX &&
- attr->sched_flags & SCHED_FLAG_UTIL_CLAMP_MAX &&
- attr->sched_util_max == -1) {
- return true;
- }
-
- return false;
-}
-
-static void __setscheduler_uclamp(struct task_struct *p,
- const struct sched_attr *attr)
-{
- enum uclamp_id clamp_id;
-
- for_each_clamp_id(clamp_id) {
- struct uclamp_se *uc_se = &p->uclamp_req[clamp_id];
- unsigned int value;
-
- if (!uclamp_reset(attr, clamp_id, uc_se))
- continue;
-
- /*
- * RT by default have a 100% boost value that could be modified
- * at runtime.
- */
- if (unlikely(rt_task(p) && clamp_id == UCLAMP_MIN))
- value = sysctl_sched_uclamp_util_min_rt_default;
- else
- value = uclamp_none(clamp_id);
-
- uclamp_se_set(uc_se, value, false);
-
- }
-
- if (likely(!(attr->sched_flags & SCHED_FLAG_UTIL_CLAMP)))
- return;
-
- if (attr->sched_flags & SCHED_FLAG_UTIL_CLAMP_MIN &&
- attr->sched_util_min != -1) {
- uclamp_se_set(&p->uclamp_req[UCLAMP_MIN],
- attr->sched_util_min, true);
- }
-
- if (attr->sched_flags & SCHED_FLAG_UTIL_CLAMP_MAX &&
- attr->sched_util_max != -1) {
- uclamp_se_set(&p->uclamp_req[UCLAMP_MAX],
- attr->sched_util_max, true);
- }
-}
-
static void uclamp_fork(struct task_struct *p)
{
enum uclamp_id clamp_id;
@@ -2066,13 +1936,6 @@ static void __init init_uclamp(void)
#else /* !CONFIG_UCLAMP_TASK */
static inline void uclamp_rq_inc(struct rq *rq, struct task_struct *p) { }
static inline void uclamp_rq_dec(struct rq *rq, struct task_struct *p) { }
-static inline int uclamp_validate(struct task_struct *p,
- const struct sched_attr *attr)
-{
- return -EOPNOTSUPP;
-}
-static void __setscheduler_uclamp(struct task_struct *p,
- const struct sched_attr *attr) { }
static inline void uclamp_fork(struct task_struct *p) { }
static inline void uclamp_post_fork(struct task_struct *p) { }
static inline void init_uclamp(void) { }
@@ -2102,7 +1965,7 @@ unsigned long get_wchan(struct task_struct *p)
return ip;
}
-static inline void enqueue_task(struct rq *rq, struct task_struct *p, int flags)
+void enqueue_task(struct rq *rq, struct task_struct *p, int flags)
{
if (!(flags & ENQUEUE_NOCLOCK))
update_rq_clock(rq);
@@ -2119,7 +1982,7 @@ static inline void enqueue_task(struct rq *rq, struct task_struct *p, int flags)
sched_core_enqueue(rq, p);
}
-static inline void dequeue_task(struct rq *rq, struct task_struct *p, int flags)
+void dequeue_task(struct rq *rq, struct task_struct *p, int flags)
{
if (sched_core_enabled(rq))
sched_core_dequeue(rq, p, flags);
@@ -2157,52 +2020,6 @@ void deactivate_task(struct rq *rq, struct task_struct *p, int flags)
dequeue_task(rq, p, flags);
}
-static inline int __normal_prio(int policy, int rt_prio, int nice)
-{
- int prio;
-
- if (dl_policy(policy))
- prio = MAX_DL_PRIO - 1;
- else if (rt_policy(policy))
- prio = MAX_RT_PRIO - 1 - rt_prio;
- else
- prio = NICE_TO_PRIO(nice);
-
- return prio;
-}
-
-/*
- * Calculate the expected normal priority: i.e. priority
- * without taking RT-inheritance into account. Might be
- * boosted by interactivity modifiers. Changes upon fork,
- * setprio syscalls, and whenever the interactivity
- * estimator recalculates.
- */
-static inline int normal_prio(struct task_struct *p)
-{
- return __normal_prio(p->policy, p->rt_priority, PRIO_TO_NICE(p->static_prio));
-}
-
-/*
- * Calculate the current priority, i.e. the priority
- * taken into account by the scheduler. This value might
- * be boosted by RT tasks, or might be boosted by
- * interactivity modifiers. Will be RT if the task got
- * RT-boosted. If not then it returns p->normal_prio.
- */
-static int effective_prio(struct task_struct *p)
-{
- p->normal_prio = normal_prio(p);
- /*
- * If we are RT tasks or we were boosted to RT priority,
- * keep the priority unchanged. Otherwise, update priority
- * to the normal priority:
- */
- if (!rt_prio(p->prio))
- return p->normal_prio;
- return p->prio;
-}
-
/**
* task_curr - is this task currently executing on a CPU?
* @p: the task in question.
@@ -2221,9 +2038,9 @@ inline int task_curr(const struct task_struct *p)
* this means any call to check_class_changed() must be followed by a call to
* balance_callback().
*/
-static inline void check_class_changed(struct rq *rq, struct task_struct *p,
- const struct sched_class *prev_class,
- int oldprio)
+void check_class_changed(struct rq *rq, struct task_struct *p,
+ const struct sched_class *prev_class,
+ int oldprio)
{
if (prev_class != p->sched_class) {
if (prev_class->switched_from)
@@ -2392,9 +2209,6 @@ unsigned long wait_task_inactive(struct task_struct *p, unsigned int match_state
static void
__do_set_cpus_allowed(struct task_struct *p, struct affinity_context *ctx);
-static int __set_cpus_allowed_ptr(struct task_struct *p,
- struct affinity_context *ctx);
-
static void migrate_disable_switch(struct rq *rq, struct task_struct *p)
{
struct affinity_context ac = {
@@ -2409,7 +2223,7 @@ static void migrate_disable_switch(struct rq *rq, struct task_struct *p)
return;
/*
- * Violates locking rules! see comment in __do_set_cpus_allowed().
+ * Violates locking rules! See comment in __do_set_cpus_allowed().
*/
__do_set_cpus_allowed(p, &ac);
}
@@ -2576,7 +2390,7 @@ static struct rq *__migrate_task(struct rq *rq, struct rq_flags *rf,
}
/*
- * migration_cpu_stop - this will be executed by a highprio stopper thread
+ * migration_cpu_stop - this will be executed by a high-prio stopper thread
* and performs thread migration by bumping thread off CPU then
* 'pushing' onto another runqueue.
*/
@@ -2821,16 +2635,6 @@ void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask)
kfree_rcu((union cpumask_rcuhead *)ac.user_mask, rcu);
}
-static cpumask_t *alloc_user_cpus_ptr(int node)
-{
- /*
- * See do_set_cpus_allowed() above for the rcu_head usage.
- */
- int size = max_t(int, cpumask_size(), sizeof(struct rcu_head));
-
- return kmalloc_node(size, GFP_KERNEL, node);
-}
-
int dup_user_cpus_ptr(struct task_struct *dst, struct task_struct *src,
int node)
{
@@ -3199,8 +3003,7 @@ out:
* task must not exit() & deallocate itself prematurely. The
* call is not atomic; no spinlocks may be held.
*/
-static int __set_cpus_allowed_ptr(struct task_struct *p,
- struct affinity_context *ctx)
+int __set_cpus_allowed_ptr(struct task_struct *p, struct affinity_context *ctx)
{
struct rq_flags rf;
struct rq *rq;
@@ -3319,9 +3122,6 @@ out_free_mask:
free_cpumask_var(new_mask);
}
-static int
-__sched_setaffinity(struct task_struct *p, struct affinity_context *ctx);
-
/*
* Restore the affinity of a task @p which was previously restricted by a
* call to force_compatible_cpus_allowed_ptr().
@@ -3701,12 +3501,6 @@ void sched_set_stop_task(int cpu, struct task_struct *stop)
#else /* CONFIG_SMP */
-static inline int __set_cpus_allowed_ptr(struct task_struct *p,
- struct affinity_context *ctx)
-{
- return set_cpus_allowed_ptr(p, ctx->new_mask);
-}
-
static inline void migrate_disable_switch(struct rq *rq, struct task_struct *p) { }
static inline bool rq_has_pinned_tasks(struct rq *rq)
@@ -3714,11 +3508,6 @@ static inline bool rq_has_pinned_tasks(struct rq *rq)
return false;
}
-static inline cpumask_t *alloc_user_cpus_ptr(int node)
-{
- return NULL;
-}
-
#endif /* !CONFIG_SMP */
static void
@@ -3901,8 +3690,8 @@ void sched_ttwu_pending(void *arg)
* it is possible for select_idle_siblings() to stack a number
* of tasks on this CPU during that window.
*
- * It is ok to clear ttwu_pending when another task pending.
- * We will receive IPI after local irq enabled and then enqueue it.
+ * It is OK to clear ttwu_pending when another task is pending.
+ * We will receive an IPI after local IRQs are enabled and then enqueue it.
* Since now nr_running > 0, idle_cpu() will always get correct result.
*/
WRITE_ONCE(rq->ttwu_pending, 0);
@@ -4467,12 +4256,7 @@ int task_call_func(struct task_struct *p, task_call_f func, void *arg)
* @cpu: The CPU on which to snapshot the task.
*
* Returns the task_struct pointer of the task "currently" running on
- * the specified CPU. If the same task is running on that CPU throughout,
- * the return value will be a pointer to that task's task_struct structure.
- * If the CPU did any context switches even vaguely concurrently with the
- * execution of this function, the return value will be a pointer to the
- * task_struct structure of a randomly chosen task that was running on
- * that CPU somewhere around the time that this function was executing.
+ * the specified CPU.
*
* If the specified CPU was offline, the return value is whatever it
* is, perhaps a pointer to the task_struct structure of that CPU's idle
@@ -4486,11 +4270,16 @@ int task_call_func(struct task_struct *p, task_call_f func, void *arg)
*/
struct task_struct *cpu_curr_snapshot(int cpu)
{
+ struct rq *rq = cpu_rq(cpu);
struct task_struct *t;
+ struct rq_flags rf;
- smp_mb(); /* Pairing determined by caller's synchronization design. */
+ rq_lock_irqsave(rq, &rf);
+ smp_mb__after_spinlock(); /* Pairing determined by caller's synchronization design. */
t = rcu_dereference(cpu_curr(cpu));
+ rq_unlock_irqrestore(rq, &rf);
smp_mb(); /* Pairing determined by caller's synchronization design. */
+
return t;
}
@@ -4603,7 +4392,7 @@ static void reset_memory_tiering(void)
}
}
-static int sysctl_numa_balancing(struct ctl_table *table, int write,
+static int sysctl_numa_balancing(const struct ctl_table *table, int write,
void *buffer, size_t *lenp, loff_t *ppos)
{
struct ctl_table t;
@@ -4672,7 +4461,7 @@ out:
__setup("schedstats=", setup_schedstats);
#ifdef CONFIG_PROC_SYSCTL
-static int sysctl_schedstats(struct ctl_table *table, int write, void *buffer,
+static int sysctl_schedstats(const struct ctl_table *table, int write, void *buffer,
size_t *lenp, loff_t *ppos)
{
struct ctl_table t;
@@ -5095,7 +4884,7 @@ __splice_balance_callbacks(struct rq *rq, bool split)
return head;
}
-static inline struct balance_callback *splice_balance_callbacks(struct rq *rq)
+struct balance_callback *splice_balance_callbacks(struct rq *rq)
{
return __splice_balance_callbacks(rq, true);
}
@@ -5105,7 +4894,7 @@ static void __balance_callbacks(struct rq *rq)
do_balance_callbacks(rq, __splice_balance_callbacks(rq, false));
}
-static inline void balance_callbacks(struct rq *rq, struct balance_callback *head)
+void balance_callbacks(struct rq *rq, struct balance_callback *head)
{
unsigned long flags;
@@ -5122,15 +4911,6 @@ static inline void __balance_callbacks(struct rq *rq)
{
}
-static inline struct balance_callback *splice_balance_callbacks(struct rq *rq)
-{
- return NULL;
-}
-
-static inline void balance_callbacks(struct rq *rq, struct balance_callback *head)
-{
-}
-
#endif
static inline void
@@ -5233,7 +5013,7 @@ prepare_task_switch(struct rq *rq, struct task_struct *prev,
*
 * The context switch has flipped the stack from under us and restored the
* local variables which were saved when this task called schedule() in the
- * past. prev == current is still correct but we need to recalculate this_rq
+ * past. 'prev == current' is still correct but we need to recalculate this_rq
* because prev may have moved to another CPU.
*/
static struct rq *finish_task_switch(struct task_struct *prev)
@@ -5556,9 +5336,9 @@ EXPORT_PER_CPU_SYMBOL(kernel_cpustat);
static inline void prefetch_curr_exec_start(struct task_struct *p)
{
#ifdef CONFIG_FAIR_GROUP_SCHED
- struct sched_entity *curr = (&p->se)->cfs_rq->curr;
+ struct sched_entity *curr = p->se.cfs_rq->curr;
#else
- struct sched_entity *curr = (&task_rq(p)->cfs)->curr;
+ struct sched_entity *curr = task_rq(p)->cfs.curr;
#endif
prefetch(curr);
prefetch(&curr->exec_start);
@@ -5579,7 +5359,7 @@ unsigned long long task_sched_runtime(struct task_struct *p)
/*
* 64-bit doesn't need locks to atomically read a 64-bit value.
 * So we have an optimization chance when the task's delta_exec is 0.
- * Reading ->on_cpu is racy, but this is ok.
+ * Reading ->on_cpu is racy, but this is OK.
*
* If we race with it leaving CPU, we'll take a lock. So we're correct.
* If we race with it entering CPU, unaccounted time is 0. This is
@@ -5665,7 +5445,7 @@ void sched_tick(void)
{
int cpu = smp_processor_id();
struct rq *rq = cpu_rq(cpu);
- struct task_struct *curr = rq->curr;
+ struct task_struct *curr;
struct rq_flags rf;
unsigned long hw_pressure;
u64 resched_latency;
@@ -5677,6 +5457,9 @@ void sched_tick(void)
rq_lock(rq, &rf);
+ curr = rq->curr;
+ psi_account_irqtime(rq, curr, NULL);
+
update_rq_clock(rq);
hw_pressure = arch_scale_hw_pressure(cpu_of(rq));
update_hw_load_avg(rq_clock_task(rq), rq, hw_pressure);
@@ -6737,6 +6520,7 @@ static void __sched notrace __schedule(unsigned int sched_mode)
++*switch_count;
migrate_disable_switch(rq, prev);
+ psi_account_irqtime(rq, prev, next);
psi_sched_switch(prev, next, !task_on_rq_queued(prev));
trace_sched_switch(sched_mode & SM_MASK_PREEMPT, prev, next, prev_state);
@@ -6853,7 +6637,7 @@ void __sched schedule_idle(void)
{
/*
* As this skips calling sched_submit_work(), which the idle task does
- * regardless because that function is a nop when the task is in a
+ * regardless because that function is a NOP when the task is in a
* TASK_RUNNING state, make sure this isn't used someplace that the
* current task can be in any other state. Note, idle is always in the
* TASK_RUNNING state.
@@ -7048,9 +6832,9 @@ EXPORT_SYMBOL(dynamic_preempt_schedule_notrace);
/*
* This is the entry point to schedule() from kernel preemption
- * off of irq context.
- * Note, that this is called and return with irqs disabled. This will
- * protect us against recursive calling from irq.
+ * off of IRQ context.
+ * Note that this is called and returns with IRQs disabled. This will
+ * protect us against recursive calls from IRQ contexts.
*/
asmlinkage __visible void __sched preempt_schedule_irq(void)
{
@@ -7080,7 +6864,7 @@ int default_wake_function(wait_queue_entry_t *curr, unsigned mode, int wake_flag
}
EXPORT_SYMBOL(default_wake_function);
-static void __setscheduler_prio(struct task_struct *p, int prio)
+void __setscheduler_prio(struct task_struct *p, int prio)
{
if (dl_prio(prio))
p->sched_class = &dl_sched_class;
@@ -7120,21 +6904,6 @@ void rt_mutex_post_schedule(void)
lockdep_assert(fetch_and_set(current->sched_rt_mutex, 0));
}
-static inline int __rt_effective_prio(struct task_struct *pi_task, int prio)
-{
- if (pi_task)
- prio = min(prio, pi_task->prio);
-
- return prio;
-}
-
-static inline int rt_effective_prio(struct task_struct *p, int prio)
-{
- struct task_struct *pi_task = rt_mutex_get_top_task(p);
-
- return __rt_effective_prio(pi_task, prio);
-}
-
/*
* rt_mutex_setprio - set the current priority of a task
* @p: task to boost
@@ -7184,7 +6953,7 @@ void rt_mutex_setprio(struct task_struct *p, struct task_struct *pi_task)
goto out_unlock;
/*
- * Idle task boosting is a nono in general. There is one
+ * Idle task boosting is a no-no in general. There is one
* exception, when PREEMPT_RT and NOHZ is active:
*
* The idle task calls get_next_timer_interrupt() and holds
@@ -7263,1325 +7032,8 @@ out_unlock:
preempt_enable();
}
-#else
-static inline int rt_effective_prio(struct task_struct *p, int prio)
-{
- return prio;
-}
#endif
-void set_user_nice(struct task_struct *p, long nice)
-{
- bool queued, running;
- struct rq *rq;
- int old_prio;
-
- if (task_nice(p) == nice || nice < MIN_NICE || nice > MAX_NICE)
- return;
- /*
- * We have to be careful, if called from sys_setpriority(),
- * the task might be in the middle of scheduling on another CPU.
- */
- CLASS(task_rq_lock, rq_guard)(p);
- rq = rq_guard.rq;
-
- update_rq_clock(rq);
-
- /*
- * The RT priorities are set via sched_setscheduler(), but we still
- * allow the 'normal' nice value to be set - but as expected
- * it won't have any effect on scheduling until the task is
- * SCHED_DEADLINE, SCHED_FIFO or SCHED_RR:
- */
- if (task_has_dl_policy(p) || task_has_rt_policy(p)) {
- p->static_prio = NICE_TO_PRIO(nice);
- return;
- }
-
- queued = task_on_rq_queued(p);
- running = task_current(rq, p);
- if (queued)
- dequeue_task(rq, p, DEQUEUE_SAVE | DEQUEUE_NOCLOCK);
- if (running)
- put_prev_task(rq, p);
-
- p->static_prio = NICE_TO_PRIO(nice);
- set_load_weight(p, true);
- old_prio = p->prio;
- p->prio = effective_prio(p);
-
- if (queued)
- enqueue_task(rq, p, ENQUEUE_RESTORE | ENQUEUE_NOCLOCK);
- if (running)
- set_next_task(rq, p);
-
- /*
- * If the task increased its priority or is running and
- * lowered its priority, then reschedule its CPU:
- */
- p->sched_class->prio_changed(rq, p, old_prio);
-}
-EXPORT_SYMBOL(set_user_nice);
-
-/*
- * is_nice_reduction - check if nice value is an actual reduction
- *
- * Similar to can_nice() but does not perform a capability check.
- *
- * @p: task
- * @nice: nice value
- */
-static bool is_nice_reduction(const struct task_struct *p, const int nice)
-{
- /* Convert nice value [19,-20] to rlimit style value [1,40]: */
- int nice_rlim = nice_to_rlimit(nice);
-
- return (nice_rlim <= task_rlimit(p, RLIMIT_NICE));
-}
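For reference, the [19,-20] to [1,40] mapping mentioned in the comment above can be reproduced with a tiny standalone program (a sketch that mirrors the kernel's nice_to_rlimit() helper; MAX_NICE = 19 is assumed):

#include <stdio.h>

#define MAX_NICE 19

/* Map a nice value in [19, -20] to an RLIMIT_NICE style value in [1, 40]. */
static long nice_to_rlimit(long nice)
{
	return MAX_NICE - nice + 1;
}

int main(void)
{
	printf("%ld %ld %ld\n",
	       nice_to_rlimit(19),	/* ->  1 */
	       nice_to_rlimit(0),	/* -> 20 */
	       nice_to_rlimit(-20));	/* -> 40 */
	return 0;
}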
-
-/*
- * can_nice - check if a task can reduce its nice value
- * @p: task
- * @nice: nice value
- */
-int can_nice(const struct task_struct *p, const int nice)
-{
- return is_nice_reduction(p, nice) || capable(CAP_SYS_NICE);
-}
-
-#ifdef __ARCH_WANT_SYS_NICE
-
-/*
- * sys_nice - change the priority of the current process.
- * @increment: priority increment
- *
- * sys_setpriority is a more generic, but much slower function that
- * does similar things.
- */
-SYSCALL_DEFINE1(nice, int, increment)
-{
- long nice, retval;
-
- /*
- * Setpriority might change our priority at the same moment.
- * We don't have to worry. Conceptually one call occurs first
- * and we have a single winner.
- */
- increment = clamp(increment, -NICE_WIDTH, NICE_WIDTH);
- nice = task_nice(current) + increment;
-
- nice = clamp_val(nice, MIN_NICE, MAX_NICE);
- if (increment < 0 && !can_nice(current, nice))
- return -EPERM;
-
- retval = security_task_setnice(current, nice);
- if (retval)
- return retval;
-
- set_user_nice(current, nice);
- return 0;
-}
-
-#endif
-
-/**
- * task_prio - return the priority value of a given task.
- * @p: the task in question.
- *
- * Return: The priority value as seen by users in /proc.
- *
- * sched policy return value kernel prio user prio/nice
- *
- * normal, batch, idle [0 ... 39] [100 ... 139] 0/[-20 ... 19]
- * fifo, rr [-2 ... -100] [98 ... 0] [1 ... 99]
- * deadline -101 -1 0
- */
-int task_prio(const struct task_struct *p)
-{
- return p->prio - MAX_RT_PRIO;
-}
-
-/**
- * idle_cpu - is a given CPU idle currently?
- * @cpu: the processor in question.
- *
- * Return: 1 if the CPU is currently idle. 0 otherwise.
- */
-int idle_cpu(int cpu)
-{
- struct rq *rq = cpu_rq(cpu);
-
- if (rq->curr != rq->idle)
- return 0;
-
- if (rq->nr_running)
- return 0;
-
-#ifdef CONFIG_SMP
- if (rq->ttwu_pending)
- return 0;
-#endif
-
- return 1;
-}
-
-/**
- * available_idle_cpu - is a given CPU idle for enqueuing work.
- * @cpu: the CPU in question.
- *
- * Return: 1 if the CPU is currently idle. 0 otherwise.
- */
-int available_idle_cpu(int cpu)
-{
- if (!idle_cpu(cpu))
- return 0;
-
- if (vcpu_is_preempted(cpu))
- return 0;
-
- return 1;
-}
-
-/**
- * idle_task - return the idle task for a given CPU.
- * @cpu: the processor in question.
- *
- * Return: The idle task for the CPU @cpu.
- */
-struct task_struct *idle_task(int cpu)
-{
- return cpu_rq(cpu)->idle;
-}
-
-#ifdef CONFIG_SCHED_CORE
-int sched_core_idle_cpu(int cpu)
-{
- struct rq *rq = cpu_rq(cpu);
-
- if (sched_core_enabled(rq) && rq->curr == rq->idle)
- return 1;
-
- return idle_cpu(cpu);
-}
-
-#endif
-
-#ifdef CONFIG_SMP
-/*
- * This function computes an effective utilization for the given CPU, to be
- * used for frequency selection given the linear relation: f = u * f_max.
- *
- * The scheduler tracks the following metrics:
- *
- * cpu_util_{cfs,rt,dl,irq}()
- * cpu_bw_dl()
- *
- * Where the cfs,rt and dl util numbers are tracked with the same metric and
- * synchronized windows and are thus directly comparable.
- *
- * The cfs,rt,dl utilizations are the running times measured with rq->clock_task
- * which excludes things like IRQ and steal-time. These latter are then accrued
- * in the irq utilization.
- *
- * The DL bandwidth number otoh is not a measured metric but a value computed
- * based on the task model parameters and gives the minimal utilization
- * required to meet deadlines.
- */
-unsigned long effective_cpu_util(int cpu, unsigned long util_cfs,
- unsigned long *min,
- unsigned long *max)
-{
- unsigned long util, irq, scale;
- struct rq *rq = cpu_rq(cpu);
-
- scale = arch_scale_cpu_capacity(cpu);
-
- /*
- * Early check to see if IRQ/steal time saturates the CPU, can be
- * because of inaccuracies in how we track these -- see
- * update_irq_load_avg().
- */
- irq = cpu_util_irq(rq);
- if (unlikely(irq >= scale)) {
- if (min)
- *min = scale;
- if (max)
- *max = scale;
- return scale;
- }
-
- if (min) {
- /*
- * The minimum utilization returns the highest level between:
- * - the computed DL bandwidth needed with the IRQ pressure which
- * steals time from the deadline task.
- * - The minimum performance requirement for CFS and/or RT.
- */
- *min = max(irq + cpu_bw_dl(rq), uclamp_rq_get(rq, UCLAMP_MIN));
-
- /*
- * When an RT task is runnable and uclamp is not used, we must
- * ensure that the task will run at maximum compute capacity.
- */
- if (!uclamp_is_used() && rt_rq_is_runnable(&rq->rt))
- *min = max(*min, scale);
- }
-
- /*
- * Because the time spent on RT/DL tasks is visible as 'lost' time to
- * CFS tasks and we use the same metric to track the effective
- * utilization (PELT windows are synchronized) we can directly add them
- * to obtain the CPU's actual utilization.
- */
- util = util_cfs + cpu_util_rt(rq);
- util += cpu_util_dl(rq);
-
- /*
- * The maximum hint is a soft bandwidth requirement, which can be lower
- * than the actual utilization because of uclamp_max requirements.
- */
- if (max)
- *max = min(scale, uclamp_rq_get(rq, UCLAMP_MAX));
-
- if (util >= scale)
- return scale;
-
- /*
- * There is still idle time; further improve the number by using the
- * irq metric. Because IRQ/steal time is hidden from the task clock we
- * need to scale the task numbers:
- *
- * max - irq
- * U' = irq + --------- * U
- * max
- */
- util = scale_irq_capacity(util, irq, scale);
- util += irq;
-
- return min(scale, util);
-}
-
-unsigned long sched_cpu_util(int cpu)
-{
- return effective_cpu_util(cpu, cpu_util_cfs(cpu), NULL, NULL);
-}
-#endif /* CONFIG_SMP */
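The scaling step documented above, U' = irq + ((max - irq) / max) * U, can be checked with a small standalone program (a sketch of the arithmetic only; the helper below imitates the kernel's scale_irq_capacity() and the example numbers are made up):

#include <stdio.h>

/* U' = irq + ((max - irq) / max) * U, as in the comment above. */
static unsigned long scale_irq_capacity(unsigned long util,
					unsigned long irq,
					unsigned long max)
{
	util *= (max - irq);
	util /= max;
	return util;
}

int main(void)
{
	unsigned long scale = 1024;	/* CPU capacity             */
	unsigned long irq   = 256;	/* IRQ/steal utilization    */
	unsigned long util  = 512;	/* CFS+RT+DL utilization    */

	util = scale_irq_capacity(util, irq, scale) + irq;
	if (util > scale)
		util = scale;

	printf("%lu\n", util);	/* 512 * 768/1024 + 256 = 640 */
	return 0;
}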
-
-/**
- * find_process_by_pid - find a process with a matching PID value.
- * @pid: the pid in question.
- *
- * The task of @pid, if found. %NULL otherwise.
- */
-static struct task_struct *find_process_by_pid(pid_t pid)
-{
- return pid ? find_task_by_vpid(pid) : current;
-}
-
-static struct task_struct *find_get_task(pid_t pid)
-{
- struct task_struct *p;
- guard(rcu)();
-
- p = find_process_by_pid(pid);
- if (likely(p))
- get_task_struct(p);
-
- return p;
-}
-
-DEFINE_CLASS(find_get_task, struct task_struct *, if (_T) put_task_struct(_T),
- find_get_task(pid), pid_t pid)
-
-/*
- * sched_setparam() passes in -1 for its policy, to let the functions
- * it calls know not to change it.
- */
-#define SETPARAM_POLICY -1
-
-static void __setscheduler_params(struct task_struct *p,
- const struct sched_attr *attr)
-{
- int policy = attr->sched_policy;
-
- if (policy == SETPARAM_POLICY)
- policy = p->policy;
-
- p->policy = policy;
-
- if (dl_policy(policy))
- __setparam_dl(p, attr);
- else if (fair_policy(policy))
- p->static_prio = NICE_TO_PRIO(attr->sched_nice);
-
- /*
- * __sched_setscheduler() ensures attr->sched_priority == 0 when
- * !rt_policy. Always setting this ensures that things like
- * getparam()/getattr() don't report silly values for !rt tasks.
- */
- p->rt_priority = attr->sched_priority;
- p->normal_prio = normal_prio(p);
- set_load_weight(p, true);
-}
-
-/*
- * Check the target process has a UID that matches the current process's:
- */
-static bool check_same_owner(struct task_struct *p)
-{
- const struct cred *cred = current_cred(), *pcred;
- guard(rcu)();
-
- pcred = __task_cred(p);
- return (uid_eq(cred->euid, pcred->euid) ||
- uid_eq(cred->euid, pcred->uid));
-}
-
-/*
- * Allow unprivileged RT tasks to decrease priority.
- * Only issue a capable test if needed and only once to avoid an audit
- * event on permitted non-privileged operations:
- */
-static int user_check_sched_setscheduler(struct task_struct *p,
- const struct sched_attr *attr,
- int policy, int reset_on_fork)
-{
- if (fair_policy(policy)) {
- if (attr->sched_nice < task_nice(p) &&
- !is_nice_reduction(p, attr->sched_nice))
- goto req_priv;
- }
-
- if (rt_policy(policy)) {
- unsigned long rlim_rtprio = task_rlimit(p, RLIMIT_RTPRIO);
-
- /* Can't set/change the rt policy: */
- if (policy != p->policy && !rlim_rtprio)
- goto req_priv;
-
- /* Can't increase priority: */
- if (attr->sched_priority > p->rt_priority &&
- attr->sched_priority > rlim_rtprio)
- goto req_priv;
- }
-
- /*
- * Can't set/change SCHED_DEADLINE policy at all for now
- * (safest behavior); in the future we would like to allow
- * unprivileged DL tasks to increase their relative deadline
- * or reduce their runtime (both ways reducing utilization)
- */
- if (dl_policy(policy))
- goto req_priv;
-
- /*
- * Treat SCHED_IDLE as nice 20. Only allow a switch to
- * SCHED_NORMAL if the RLIMIT_NICE would normally permit it.
- */
- if (task_has_idle_policy(p) && !idle_policy(policy)) {
- if (!is_nice_reduction(p, task_nice(p)))
- goto req_priv;
- }
-
- /* Can't change other user's priorities: */
- if (!check_same_owner(p))
- goto req_priv;
-
- /* Normal users shall not reset the sched_reset_on_fork flag: */
- if (p->sched_reset_on_fork && !reset_on_fork)
- goto req_priv;
-
- return 0;
-
-req_priv:
- if (!capable(CAP_SYS_NICE))
- return -EPERM;
-
- return 0;
-}
-
-static int __sched_setscheduler(struct task_struct *p,
- const struct sched_attr *attr,
- bool user, bool pi)
-{
- int oldpolicy = -1, policy = attr->sched_policy;
- int retval, oldprio, newprio, queued, running;
- const struct sched_class *prev_class;
- struct balance_callback *head;
- struct rq_flags rf;
- int reset_on_fork;
- int queue_flags = DEQUEUE_SAVE | DEQUEUE_MOVE | DEQUEUE_NOCLOCK;
- struct rq *rq;
- bool cpuset_locked = false;
-
- /* The pi code expects interrupts enabled */
- BUG_ON(pi && in_interrupt());
-recheck:
- /* Double check policy once rq lock held: */
- if (policy < 0) {
- reset_on_fork = p->sched_reset_on_fork;
- policy = oldpolicy = p->policy;
- } else {
- reset_on_fork = !!(attr->sched_flags & SCHED_FLAG_RESET_ON_FORK);
-
- if (!valid_policy(policy))
- return -EINVAL;
- }
-
- if (attr->sched_flags & ~(SCHED_FLAG_ALL | SCHED_FLAG_SUGOV))
- return -EINVAL;
-
- /*
- * Valid priorities for SCHED_FIFO and SCHED_RR are
- * 1..MAX_RT_PRIO-1, valid priority for SCHED_NORMAL,
- * SCHED_BATCH and SCHED_IDLE is 0.
- */
- if (attr->sched_priority > MAX_RT_PRIO-1)
- return -EINVAL;
- if ((dl_policy(policy) && !__checkparam_dl(attr)) ||
- (rt_policy(policy) != (attr->sched_priority != 0)))
- return -EINVAL;
-
- if (user) {
- retval = user_check_sched_setscheduler(p, attr, policy, reset_on_fork);
- if (retval)
- return retval;
-
- if (attr->sched_flags & SCHED_FLAG_SUGOV)
- return -EINVAL;
-
- retval = security_task_setscheduler(p);
- if (retval)
- return retval;
- }
-
- /* Update task specific "requested" clamps */
- if (attr->sched_flags & SCHED_FLAG_UTIL_CLAMP) {
- retval = uclamp_validate(p, attr);
- if (retval)
- return retval;
- }
-
- /*
- * SCHED_DEADLINE bandwidth accounting relies on stable cpusets
- * information.
- */
- if (dl_policy(policy) || dl_policy(p->policy)) {
- cpuset_locked = true;
- cpuset_lock();
- }
-
- /*
- * Make sure no PI-waiters arrive (or leave) while we are
- * changing the priority of the task:
- *
- * To be able to change p->policy safely, the appropriate
- * runqueue lock must be held.
- */
- rq = task_rq_lock(p, &rf);
- update_rq_clock(rq);
-
- /*
- * Changing the policy of the stop threads is a very bad idea:
- */
- if (p == rq->stop) {
- retval = -EINVAL;
- goto unlock;
- }
-
- /*
- * If not changing anything there's no need to proceed further,
- * but store a possible modification of reset_on_fork.
- */
- if (unlikely(policy == p->policy)) {
- if (fair_policy(policy) && attr->sched_nice != task_nice(p))
- goto change;
- if (rt_policy(policy) && attr->sched_priority != p->rt_priority)
- goto change;
- if (dl_policy(policy) && dl_param_changed(p, attr))
- goto change;
- if (attr->sched_flags & SCHED_FLAG_UTIL_CLAMP)
- goto change;
-
- p->sched_reset_on_fork = reset_on_fork;
- retval = 0;
- goto unlock;
- }
-change:
-
- if (user) {
-#ifdef CONFIG_RT_GROUP_SCHED
- /*
- * Do not allow realtime tasks into groups that have no runtime
- * assigned.
- */
- if (rt_bandwidth_enabled() && rt_policy(policy) &&
- task_group(p)->rt_bandwidth.rt_runtime == 0 &&
- !task_group_is_autogroup(task_group(p))) {
- retval = -EPERM;
- goto unlock;
- }
-#endif
-#ifdef CONFIG_SMP
- if (dl_bandwidth_enabled() && dl_policy(policy) &&
- !(attr->sched_flags & SCHED_FLAG_SUGOV)) {
- cpumask_t *span = rq->rd->span;
-
- /*
- * Don't allow tasks with an affinity mask smaller than
- * the entire root_domain to become SCHED_DEADLINE. We
- * will also fail if there's no bandwidth available.
- */
- if (!cpumask_subset(span, p->cpus_ptr) ||
- rq->rd->dl_bw.bw == 0) {
- retval = -EPERM;
- goto unlock;
- }
- }
-#endif
- }
-
- /* Re-check policy now with rq lock held: */
- if (unlikely(oldpolicy != -1 && oldpolicy != p->policy)) {
- policy = oldpolicy = -1;
- task_rq_unlock(rq, p, &rf);
- if (cpuset_locked)
- cpuset_unlock();
- goto recheck;
- }
-
- /*
- * If setscheduling to SCHED_DEADLINE (or changing the parameters
- * of a SCHED_DEADLINE task) we need to check if enough bandwidth
- * is available.
- */
- if ((dl_policy(policy) || dl_task(p)) && sched_dl_overflow(p, policy, attr)) {
- retval = -EBUSY;
- goto unlock;
- }
-
- p->sched_reset_on_fork = reset_on_fork;
- oldprio = p->prio;
-
- newprio = __normal_prio(policy, attr->sched_priority, attr->sched_nice);
- if (pi) {
- /*
- * Take priority boosted tasks into account. If the new
- * effective priority is unchanged, we just store the new
- * normal parameters and do not touch the scheduler class and
- * the runqueue. This will be done when the task deboosts
- * itself.
- */
- newprio = rt_effective_prio(p, newprio);
- if (newprio == oldprio)
- queue_flags &= ~DEQUEUE_MOVE;
- }
-
- queued = task_on_rq_queued(p);
- running = task_current(rq, p);
- if (queued)
- dequeue_task(rq, p, queue_flags);
- if (running)
- put_prev_task(rq, p);
-
- prev_class = p->sched_class;
-
- if (!(attr->sched_flags & SCHED_FLAG_KEEP_PARAMS)) {
- __setscheduler_params(p, attr);
- __setscheduler_prio(p, newprio);
- }
- __setscheduler_uclamp(p, attr);
-
- if (queued) {
- /*
- * We enqueue to tail when the priority of a task is
- * increased (user space view).
- */
- if (oldprio < p->prio)
- queue_flags |= ENQUEUE_HEAD;
-
- enqueue_task(rq, p, queue_flags);
- }
- if (running)
- set_next_task(rq, p);
-
- check_class_changed(rq, p, prev_class, oldprio);
-
- /* Avoid rq from going away on us: */
- preempt_disable();
- head = splice_balance_callbacks(rq);
- task_rq_unlock(rq, p, &rf);
-
- if (pi) {
- if (cpuset_locked)
- cpuset_unlock();
- rt_mutex_adjust_pi(p);
- }
-
- /* Run balance callbacks after we've adjusted the PI chain: */
- balance_callbacks(rq, head);
- preempt_enable();
-
- return 0;
-
-unlock:
- task_rq_unlock(rq, p, &rf);
- if (cpuset_locked)
- cpuset_unlock();
- return retval;
-}
-
-static int _sched_setscheduler(struct task_struct *p, int policy,
- const struct sched_param *param, bool check)
-{
- struct sched_attr attr = {
- .sched_policy = policy,
- .sched_priority = param->sched_priority,
- .sched_nice = PRIO_TO_NICE(p->static_prio),
- };
-
- /* Fixup the legacy SCHED_RESET_ON_FORK hack. */
- if ((policy != SETPARAM_POLICY) && (policy & SCHED_RESET_ON_FORK)) {
- attr.sched_flags |= SCHED_FLAG_RESET_ON_FORK;
- policy &= ~SCHED_RESET_ON_FORK;
- attr.sched_policy = policy;
- }
-
- return __sched_setscheduler(p, &attr, check, true);
-}
-/**
- * sched_setscheduler - change the scheduling policy and/or RT priority of a thread.
- * @p: the task in question.
- * @policy: new policy.
- * @param: structure containing the new RT priority.
- *
- * Use sched_set_fifo(), read its comment.
- *
- * Return: 0 on success. An error code otherwise.
- *
- * NOTE that the task may be already dead.
- */
-int sched_setscheduler(struct task_struct *p, int policy,
- const struct sched_param *param)
-{
- return _sched_setscheduler(p, policy, param, true);
-}
-
-int sched_setattr(struct task_struct *p, const struct sched_attr *attr)
-{
- return __sched_setscheduler(p, attr, true, true);
-}
-
-int sched_setattr_nocheck(struct task_struct *p, const struct sched_attr *attr)
-{
- return __sched_setscheduler(p, attr, false, true);
-}
-EXPORT_SYMBOL_GPL(sched_setattr_nocheck);
-
-/**
- * sched_setscheduler_nocheck - change the scheduling policy and/or RT priority of a thread from kernelspace.
- * @p: the task in question.
- * @policy: new policy.
- * @param: structure containing the new RT priority.
- *
- * Just like sched_setscheduler, only don't bother checking if the
- * current context has permission. For example, this is needed in
- * stop_machine(): we create temporary high priority worker threads,
- * but our caller might not have that capability.
- *
- * Return: 0 on success. An error code otherwise.
- */
-int sched_setscheduler_nocheck(struct task_struct *p, int policy,
- const struct sched_param *param)
-{
- return _sched_setscheduler(p, policy, param, false);
-}
-
-/*
- * SCHED_FIFO is a broken scheduler model; that is, it is fundamentally
- * incapable of resource management, which is the one thing an OS really should
- * be doing.
- *
- * This is of course the reason it is limited to privileged users only.
- *
- * Worse still; it is fundamentally impossible to compose static priority
- * workloads. You cannot take two correctly working static prio workloads
- * and smash them together and still expect them to work.
- *
- * For this reason 'all' FIFO tasks the kernel creates are basically at:
- *
- * MAX_RT_PRIO / 2
- *
- * The administrator _MUST_ configure the system, the kernel simply doesn't
- * know enough information to make a sensible choice.
- */
-void sched_set_fifo(struct task_struct *p)
-{
- struct sched_param sp = { .sched_priority = MAX_RT_PRIO / 2 };
- WARN_ON_ONCE(sched_setscheduler_nocheck(p, SCHED_FIFO, &sp) != 0);
-}
-EXPORT_SYMBOL_GPL(sched_set_fifo);
-
-/*
- * For when you don't much care about FIFO, but want to be above SCHED_NORMAL.
- */
-void sched_set_fifo_low(struct task_struct *p)
-{
- struct sched_param sp = { .sched_priority = 1 };
- WARN_ON_ONCE(sched_setscheduler_nocheck(p, SCHED_FIFO, &sp) != 0);
-}
-EXPORT_SYMBOL_GPL(sched_set_fifo_low);
-
-void sched_set_normal(struct task_struct *p, int nice)
-{
- struct sched_attr attr = {
- .sched_policy = SCHED_NORMAL,
- .sched_nice = nice,
- };
- WARN_ON_ONCE(sched_setattr_nocheck(p, &attr) != 0);
-}
-EXPORT_SYMBOL_GPL(sched_set_normal);
-
-static int
-do_sched_setscheduler(pid_t pid, int policy, struct sched_param __user *param)
-{
- struct sched_param lparam;
-
- if (!param || pid < 0)
- return -EINVAL;
- if (copy_from_user(&lparam, param, sizeof(struct sched_param)))
- return -EFAULT;
-
- CLASS(find_get_task, p)(pid);
- if (!p)
- return -ESRCH;
-
- return sched_setscheduler(p, policy, &lparam);
-}
-
-/*
- * Mimics kernel/events/core.c perf_copy_attr().
- */
-static int sched_copy_attr(struct sched_attr __user *uattr, struct sched_attr *attr)
-{
- u32 size;
- int ret;
-
- /* Zero the full structure, so that a short copy will be nice: */
- memset(attr, 0, sizeof(*attr));
-
- ret = get_user(size, &uattr->size);
- if (ret)
- return ret;
-
- /* ABI compatibility quirk: */
- if (!size)
- size = SCHED_ATTR_SIZE_VER0;
- if (size < SCHED_ATTR_SIZE_VER0 || size > PAGE_SIZE)
- goto err_size;
-
- ret = copy_struct_from_user(attr, sizeof(*attr), uattr, size);
- if (ret) {
- if (ret == -E2BIG)
- goto err_size;
- return ret;
- }
-
- if ((attr->sched_flags & SCHED_FLAG_UTIL_CLAMP) &&
- size < SCHED_ATTR_SIZE_VER1)
- return -EINVAL;
-
- /*
- * XXX: Do we want to be lenient like existing syscalls; or do we want
- * to be strict and return an error on out-of-bounds values?
- */
- attr->sched_nice = clamp(attr->sched_nice, MIN_NICE, MAX_NICE);
-
- return 0;
-
-err_size:
- put_user(sizeof(*attr), &uattr->size);
- return -E2BIG;
-}
-
-static void get_params(struct task_struct *p, struct sched_attr *attr)
-{
- if (task_has_dl_policy(p))
- __getparam_dl(p, attr);
- else if (task_has_rt_policy(p))
- attr->sched_priority = p->rt_priority;
- else
- attr->sched_nice = task_nice(p);
-}
-
-/**
- * sys_sched_setscheduler - set/change the scheduler policy and RT priority
- * @pid: the pid in question.
- * @policy: new policy.
- * @param: structure containing the new RT priority.
- *
- * Return: 0 on success. An error code otherwise.
- */
-SYSCALL_DEFINE3(sched_setscheduler, pid_t, pid, int, policy, struct sched_param __user *, param)
-{
- if (policy < 0)
- return -EINVAL;
-
- return do_sched_setscheduler(pid, policy, param);
-}
-
-/**
- * sys_sched_setparam - set/change the RT priority of a thread
- * @pid: the pid in question.
- * @param: structure containing the new RT priority.
- *
- * Return: 0 on success. An error code otherwise.
- */
-SYSCALL_DEFINE2(sched_setparam, pid_t, pid, struct sched_param __user *, param)
-{
- return do_sched_setscheduler(pid, SETPARAM_POLICY, param);
-}
-
-/**
- * sys_sched_setattr - same as above, but with extended sched_attr
- * @pid: the pid in question.
- * @uattr: structure containing the extended parameters.
- * @flags: for future extension.
- */
-SYSCALL_DEFINE3(sched_setattr, pid_t, pid, struct sched_attr __user *, uattr,
- unsigned int, flags)
-{
- struct sched_attr attr;
- int retval;
-
- if (!uattr || pid < 0 || flags)
- return -EINVAL;
-
- retval = sched_copy_attr(uattr, &attr);
- if (retval)
- return retval;
-
- if ((int)attr.sched_policy < 0)
- return -EINVAL;
- if (attr.sched_flags & SCHED_FLAG_KEEP_POLICY)
- attr.sched_policy = SETPARAM_POLICY;
-
- CLASS(find_get_task, p)(pid);
- if (!p)
- return -ESRCH;
-
- if (attr.sched_flags & SCHED_FLAG_KEEP_PARAMS)
- get_params(p, &attr);
-
- return sched_setattr(p, &attr);
-}
-
-/**
- * sys_sched_getscheduler - get the policy (scheduling class) of a thread
- * @pid: the pid in question.
- *
- * Return: On success, the policy of the thread. Otherwise, a negative error
- * code.
- */
-SYSCALL_DEFINE1(sched_getscheduler, pid_t, pid)
-{
- struct task_struct *p;
- int retval;
-
- if (pid < 0)
- return -EINVAL;
-
- guard(rcu)();
- p = find_process_by_pid(pid);
- if (!p)
- return -ESRCH;
-
- retval = security_task_getscheduler(p);
- if (!retval) {
- retval = p->policy;
- if (p->sched_reset_on_fork)
- retval |= SCHED_RESET_ON_FORK;
- }
- return retval;
-}
-
-/**
- * sys_sched_getparam - get the RT priority of a thread
- * @pid: the pid in question.
- * @param: structure containing the RT priority.
- *
- * Return: On success, 0 and the RT priority is in @param. Otherwise, an error
- * code.
- */
-SYSCALL_DEFINE2(sched_getparam, pid_t, pid, struct sched_param __user *, param)
-{
- struct sched_param lp = { .sched_priority = 0 };
- struct task_struct *p;
- int retval;
-
- if (!param || pid < 0)
- return -EINVAL;
-
- scoped_guard (rcu) {
- p = find_process_by_pid(pid);
- if (!p)
- return -ESRCH;
-
- retval = security_task_getscheduler(p);
- if (retval)
- return retval;
-
- if (task_has_rt_policy(p))
- lp.sched_priority = p->rt_priority;
- }
-
- /*
- * This one might sleep, we cannot do it with a spinlock held ...
- */
- return copy_to_user(param, &lp, sizeof(*param)) ? -EFAULT : 0;
-}
-
-/*
- * Copy the kernel size attribute structure (which might be larger
- * than what user-space knows about) to user-space.
- *
- * Note that all cases are valid: user-space buffer can be larger or
- * smaller than the kernel-space buffer. The usual case is that both
- * have the same size.
- */
-static int
-sched_attr_copy_to_user(struct sched_attr __user *uattr,
- struct sched_attr *kattr,
- unsigned int usize)
-{
- unsigned int ksize = sizeof(*kattr);
-
- if (!access_ok(uattr, usize))
- return -EFAULT;
-
- /*
- * sched_getattr() ABI forwards and backwards compatibility:
- *
- * If usize == ksize then we just copy everything to user-space and all is good.
- *
- * If usize < ksize then we only copy as much as user-space has space for,
- * this keeps ABI compatibility as well. We skip the rest.
- *
- * If usize > ksize then user-space is using a newer version of the ABI,
- * parts of which the kernel doesn't know about. Just ignore it - tooling can
- * detect the kernel's knowledge of attributes from the attr->size value
- * which is set to ksize in this case.
- */
- kattr->size = min(usize, ksize);
-
- if (copy_to_user(uattr, kattr, kattr->size))
- return -EFAULT;
-
- return 0;
-}
-
-/**
- * sys_sched_getattr - similar to sched_getparam, but with sched_attr
- * @pid: the pid in question.
- * @uattr: structure containing the extended parameters.
- * @usize: sizeof(attr) for fwd/bwd comp.
- * @flags: for future extension.
- */
-SYSCALL_DEFINE4(sched_getattr, pid_t, pid, struct sched_attr __user *, uattr,
- unsigned int, usize, unsigned int, flags)
-{
- struct sched_attr kattr = { };
- struct task_struct *p;
- int retval;
-
- if (!uattr || pid < 0 || usize > PAGE_SIZE ||
- usize < SCHED_ATTR_SIZE_VER0 || flags)
- return -EINVAL;
-
- scoped_guard (rcu) {
- p = find_process_by_pid(pid);
- if (!p)
- return -ESRCH;
-
- retval = security_task_getscheduler(p);
- if (retval)
- return retval;
-
- kattr.sched_policy = p->policy;
- if (p->sched_reset_on_fork)
- kattr.sched_flags |= SCHED_FLAG_RESET_ON_FORK;
- get_params(p, &kattr);
- kattr.sched_flags &= SCHED_FLAG_ALL;
-
-#ifdef CONFIG_UCLAMP_TASK
- /*
- * This could race with another potential updater, but this is fine
- * because it'll correctly read the old or the new value. We don't need
- * to guarantee who wins the race as long as it doesn't return garbage.
- */
- kattr.sched_util_min = p->uclamp_req[UCLAMP_MIN].value;
- kattr.sched_util_max = p->uclamp_req[UCLAMP_MAX].value;
-#endif
- }
-
- return sched_attr_copy_to_user(uattr, &kattr, usize);
-}
-
-#ifdef CONFIG_SMP
-int dl_task_check_affinity(struct task_struct *p, const struct cpumask *mask)
-{
- /*
- * If the task isn't a deadline task or admission control is
- * disabled then we don't care about affinity changes.
- */
- if (!task_has_dl_policy(p) || !dl_bandwidth_enabled())
- return 0;
-
- /*
- * Since bandwidth control happens on root_domain basis,
- * if admission test is enabled, we only admit -deadline
- * tasks allowed to run on all the CPUs in the task's
- * root_domain.
- */
- guard(rcu)();
- if (!cpumask_subset(task_rq(p)->rd->span, mask))
- return -EBUSY;
-
- return 0;
-}
-#endif
-
-static int
-__sched_setaffinity(struct task_struct *p, struct affinity_context *ctx)
-{
- int retval;
- cpumask_var_t cpus_allowed, new_mask;
-
- if (!alloc_cpumask_var(&cpus_allowed, GFP_KERNEL))
- return -ENOMEM;
-
- if (!alloc_cpumask_var(&new_mask, GFP_KERNEL)) {
- retval = -ENOMEM;
- goto out_free_cpus_allowed;
- }
-
- cpuset_cpus_allowed(p, cpus_allowed);
- cpumask_and(new_mask, ctx->new_mask, cpus_allowed);
-
- ctx->new_mask = new_mask;
- ctx->flags |= SCA_CHECK;
-
- retval = dl_task_check_affinity(p, new_mask);
- if (retval)
- goto out_free_new_mask;
-
- retval = __set_cpus_allowed_ptr(p, ctx);
- if (retval)
- goto out_free_new_mask;
-
- cpuset_cpus_allowed(p, cpus_allowed);
- if (!cpumask_subset(new_mask, cpus_allowed)) {
- /*
- * We must have raced with a concurrent cpuset update.
- * Just reset the cpumask to the cpuset's cpus_allowed.
- */
- cpumask_copy(new_mask, cpus_allowed);
-
- /*
- * If SCA_USER is set, a 2nd call to __set_cpus_allowed_ptr()
- * will restore the previous user_cpus_ptr value.
- *
- * In the unlikely event a previous user_cpus_ptr exists,
- * we need to further restrict the mask to what is allowed
- * by that old user_cpus_ptr.
- */
- if (unlikely((ctx->flags & SCA_USER) && ctx->user_mask)) {
- bool empty = !cpumask_and(new_mask, new_mask,
- ctx->user_mask);
-
- if (WARN_ON_ONCE(empty))
- cpumask_copy(new_mask, cpus_allowed);
- }
- __set_cpus_allowed_ptr(p, ctx);
- retval = -EINVAL;
- }
-
-out_free_new_mask:
- free_cpumask_var(new_mask);
-out_free_cpus_allowed:
- free_cpumask_var(cpus_allowed);
- return retval;
-}
-
-long sched_setaffinity(pid_t pid, const struct cpumask *in_mask)
-{
- struct affinity_context ac;
- struct cpumask *user_mask;
- int retval;
-
- CLASS(find_get_task, p)(pid);
- if (!p)
- return -ESRCH;
-
- if (p->flags & PF_NO_SETAFFINITY)
- return -EINVAL;
-
- if (!check_same_owner(p)) {
- guard(rcu)();
- if (!ns_capable(__task_cred(p)->user_ns, CAP_SYS_NICE))
- return -EPERM;
- }
-
- retval = security_task_setscheduler(p);
- if (retval)
- return retval;
-
- /*
- * With non-SMP configs, user_cpus_ptr/user_mask isn't used and
- * alloc_user_cpus_ptr() returns NULL.
- */
- user_mask = alloc_user_cpus_ptr(NUMA_NO_NODE);
- if (user_mask) {
- cpumask_copy(user_mask, in_mask);
- } else if (IS_ENABLED(CONFIG_SMP)) {
- return -ENOMEM;
- }
-
- ac = (struct affinity_context){
- .new_mask = in_mask,
- .user_mask = user_mask,
- .flags = SCA_USER,
- };
-
- retval = __sched_setaffinity(p, &ac);
- kfree(ac.user_mask);
-
- return retval;
-}
-
-static int get_user_cpu_mask(unsigned long __user *user_mask_ptr, unsigned len,
- struct cpumask *new_mask)
-{
- if (len < cpumask_size())
- cpumask_clear(new_mask);
- else if (len > cpumask_size())
- len = cpumask_size();
-
- return copy_from_user(new_mask, user_mask_ptr, len) ? -EFAULT : 0;
-}
-
-/**
- * sys_sched_setaffinity - set the CPU affinity of a process
- * @pid: pid of the process
- * @len: length in bytes of the bitmask pointed to by user_mask_ptr
- * @user_mask_ptr: user-space pointer to the new CPU mask
- *
- * Return: 0 on success. An error code otherwise.
- */
-SYSCALL_DEFINE3(sched_setaffinity, pid_t, pid, unsigned int, len,
- unsigned long __user *, user_mask_ptr)
-{
- cpumask_var_t new_mask;
- int retval;
-
- if (!alloc_cpumask_var(&new_mask, GFP_KERNEL))
- return -ENOMEM;
-
- retval = get_user_cpu_mask(user_mask_ptr, len, new_mask);
- if (retval == 0)
- retval = sched_setaffinity(pid, new_mask);
- free_cpumask_var(new_mask);
- return retval;
-}
-
-long sched_getaffinity(pid_t pid, struct cpumask *mask)
-{
- struct task_struct *p;
- int retval;
-
- guard(rcu)();
- p = find_process_by_pid(pid);
- if (!p)
- return -ESRCH;
-
- retval = security_task_getscheduler(p);
- if (retval)
- return retval;
-
- guard(raw_spinlock_irqsave)(&p->pi_lock);
- cpumask_and(mask, &p->cpus_mask, cpu_active_mask);
-
- return 0;
-}
-
-/**
- * sys_sched_getaffinity - get the CPU affinity of a process
- * @pid: pid of the process
- * @len: length in bytes of the bitmask pointed to by user_mask_ptr
- * @user_mask_ptr: user-space pointer to hold the current CPU mask
- *
- * Return: size of CPU mask copied to user_mask_ptr on success. An
- * error code otherwise.
- */
-SYSCALL_DEFINE3(sched_getaffinity, pid_t, pid, unsigned int, len,
- unsigned long __user *, user_mask_ptr)
-{
- int ret;
- cpumask_var_t mask;
-
- if ((len * BITS_PER_BYTE) < nr_cpu_ids)
- return -EINVAL;
- if (len & (sizeof(unsigned long)-1))
- return -EINVAL;
-
- if (!zalloc_cpumask_var(&mask, GFP_KERNEL))
- return -ENOMEM;
-
- ret = sched_getaffinity(pid, mask);
- if (ret == 0) {
- unsigned int retlen = min(len, cpumask_size());
-
- if (copy_to_user(user_mask_ptr, cpumask_bits(mask), retlen))
- ret = -EFAULT;
- else
- ret = retlen;
- }
- free_cpumask_var(mask);
-
- return ret;
-}
-
-static void do_sched_yield(void)
-{
- struct rq_flags rf;
- struct rq *rq;
-
- rq = this_rq_lock_irq(&rf);
-
- schedstat_inc(rq->yld_count);
- current->sched_class->yield_task(rq);
-
- preempt_disable();
- rq_unlock_irq(rq, &rf);
- sched_preempt_enable_no_resched();
-
- schedule();
-}
-
-/**
- * sys_sched_yield - yield the current processor to other threads.
- *
- * This function yields the current CPU to other tasks. If there are no
- * other threads running on this CPU then this function will return.
- *
- * Return: 0.
- */
-SYSCALL_DEFINE0(sched_yield)
-{
- do_sched_yield();
- return 0;
-}
-
#if !defined(CONFIG_PREEMPTION) || defined(CONFIG_PREEMPT_DYNAMIC)
int __sched __cond_resched(void)
{
@@ -8904,105 +7356,11 @@ PREEMPT_MODEL_ACCESSOR(none);
PREEMPT_MODEL_ACCESSOR(voluntary);
PREEMPT_MODEL_ACCESSOR(full);
-#else /* !CONFIG_PREEMPT_DYNAMIC */
+#else /* !CONFIG_PREEMPT_DYNAMIC: */
static inline void preempt_dynamic_init(void) { }
-#endif /* #ifdef CONFIG_PREEMPT_DYNAMIC */
-
-/**
- * yield - yield the current processor to other threads.
- *
- * Do not ever use this function, there's a 99% chance you're doing it wrong.
- *
- * The scheduler is at all times free to pick the calling task as the most
- * eligible task to run, if removing the yield() call from your code breaks
- * it, it's already broken.
- *
- * Typical broken usage is:
- *
- * while (!event)
- * yield();
- *
- * where one assumes that yield() will let 'the other' process run that will
- * make event true. If the current task is a SCHED_FIFO task that will never
- * happen. Never use yield() as a progress guarantee!!
- *
- * If you want to use yield() to wait for something, use wait_event().
- * If you want to use yield() to be 'nice' for others, use cond_resched().
- * If you still want to use yield(), do not!
- */
-void __sched yield(void)
-{
- set_current_state(TASK_RUNNING);
- do_sched_yield();
-}
-EXPORT_SYMBOL(yield);
-
-/**
- * yield_to - yield the current processor to another thread in
- * your thread group, or accelerate that thread toward the
- * processor it's on.
- * @p: target task
- * @preempt: whether task preemption is allowed or not
- *
- * It's the caller's job to ensure that the target task struct
- * can't go away on us before we can do any checks.
- *
- * Return:
- * true (>0) if we indeed boosted the target task.
- * false (0) if we failed to boost the target.
- * -ESRCH if there's no task to yield to.
- */
-int __sched yield_to(struct task_struct *p, bool preempt)
-{
- struct task_struct *curr = current;
- struct rq *rq, *p_rq;
- int yielded = 0;
-
- scoped_guard (irqsave) {
- rq = this_rq();
-
-again:
- p_rq = task_rq(p);
- /*
- * If we're the only runnable task on the rq and target rq also
- * has only one task, there's absolutely no point in yielding.
- */
- if (rq->nr_running == 1 && p_rq->nr_running == 1)
- return -ESRCH;
-
- guard(double_rq_lock)(rq, p_rq);
- if (task_rq(p) != p_rq)
- goto again;
-
- if (!curr->sched_class->yield_to_task)
- return 0;
-
- if (curr->sched_class != p->sched_class)
- return 0;
-
- if (task_on_cpu(p_rq, p) || !task_is_running(p))
- return 0;
-
- yielded = curr->sched_class->yield_to_task(rq, p);
- if (yielded) {
- schedstat_inc(rq->yld_count);
- /*
- * Make p's CPU reschedule; pick_next_entity
- * takes care of fairness.
- */
- if (preempt && rq != p_rq)
- resched_curr(p_rq);
- }
- }
-
- if (yielded)
- schedule();
-
- return yielded;
-}
-EXPORT_SYMBOL_GPL(yield_to);
+#endif /* CONFIG_PREEMPT_DYNAMIC */
int io_schedule_prepare(void)
{
@@ -9045,123 +7403,6 @@ void __sched io_schedule(void)
}
EXPORT_SYMBOL(io_schedule);
-/**
- * sys_sched_get_priority_max - return maximum RT priority.
- * @policy: scheduling class.
- *
- * Return: On success, this syscall returns the maximum
- * rt_priority that can be used by a given scheduling class.
- * On failure, a negative error code is returned.
- */
-SYSCALL_DEFINE1(sched_get_priority_max, int, policy)
-{
- int ret = -EINVAL;
-
- switch (policy) {
- case SCHED_FIFO:
- case SCHED_RR:
- ret = MAX_RT_PRIO-1;
- break;
- case SCHED_DEADLINE:
- case SCHED_NORMAL:
- case SCHED_BATCH:
- case SCHED_IDLE:
- ret = 0;
- break;
- }
- return ret;
-}
-
-/**
- * sys_sched_get_priority_min - return minimum RT priority.
- * @policy: scheduling class.
- *
- * Return: On success, this syscall returns the minimum
- * rt_priority that can be used by a given scheduling class.
- * On failure, a negative error code is returned.
- */
-SYSCALL_DEFINE1(sched_get_priority_min, int, policy)
-{
- int ret = -EINVAL;
-
- switch (policy) {
- case SCHED_FIFO:
- case SCHED_RR:
- ret = 1;
- break;
- case SCHED_DEADLINE:
- case SCHED_NORMAL:
- case SCHED_BATCH:
- case SCHED_IDLE:
- ret = 0;
- }
- return ret;
-}
-
-static int sched_rr_get_interval(pid_t pid, struct timespec64 *t)
-{
- unsigned int time_slice = 0;
- int retval;
-
- if (pid < 0)
- return -EINVAL;
-
- scoped_guard (rcu) {
- struct task_struct *p = find_process_by_pid(pid);
- if (!p)
- return -ESRCH;
-
- retval = security_task_getscheduler(p);
- if (retval)
- return retval;
-
- scoped_guard (task_rq_lock, p) {
- struct rq *rq = scope.rq;
- if (p->sched_class->get_rr_interval)
- time_slice = p->sched_class->get_rr_interval(rq, p);
- }
- }
-
- jiffies_to_timespec64(time_slice, t);
- return 0;
-}
-
-/**
- * sys_sched_rr_get_interval - return the default timeslice of a process.
- * @pid: pid of the process.
- * @interval: userspace pointer to the timeslice value.
- *
- * this syscall writes the default timeslice value of a given process
- * into the user-space timespec buffer. A value of '0' means infinity.
- *
- * Return: On success, 0 and the timeslice is in @interval. Otherwise,
- * an error code.
- */
-SYSCALL_DEFINE2(sched_rr_get_interval, pid_t, pid,
- struct __kernel_timespec __user *, interval)
-{
- struct timespec64 t;
- int retval = sched_rr_get_interval(pid, &t);
-
- if (retval == 0)
- retval = put_timespec64(&t, interval);
-
- return retval;
-}
-
-#ifdef CONFIG_COMPAT_32BIT_TIME
-SYSCALL_DEFINE2(sched_rr_get_interval_time32, pid_t, pid,
- struct old_timespec32 __user *, interval)
-{
- struct timespec64 t;
- int retval = sched_rr_get_interval(pid, &t);
-
- if (retval == 0)
- retval = put_old_timespec32(&t, interval);
- return retval;
-}
-#endif
-
void sched_show_task(struct task_struct *p)
{
unsigned long free = 0;
@@ -9729,7 +7970,7 @@ int sched_cpu_deactivate(unsigned int cpu)
* Specifically, we rely on ttwu to no longer target this CPU, see
* ttwu_queue_cond() and is_cpu_allowed().
*
- * Do sync before park smpboot threads to take care the rcu boost case.
+ * Do sync before parking smpboot threads to take care of the RCU boost case.
*/
synchronize_rcu();
@@ -9804,7 +8045,7 @@ int sched_cpu_wait_empty(unsigned int cpu)
* Since this CPU is going 'away' for a while, fold any nr_active delta we
* might have. Called from the CPU stopper task after ensuring that the
* stopper is the last running task on the CPU, so nr_active count is
- * stable. We need to take the teardown thread which is calling this into
+ * stable. We need to take the tear-down thread which is calling this into
* account, so we hand in adjust = 1 to the load calculation.
*
* Also see the comment "Global load-average calculations".
@@ -9998,7 +8239,7 @@ void __init sched_init(void)
/*
* How much CPU bandwidth does root_task_group get?
*
- * In case of task-groups formed thr' the cgroup filesystem, it
+ * In case of task-groups formed through the cgroup filesystem, it
* gets 100% of the CPU resources in the system. This overall
* system CPU resource is divided among the tasks of
* root_task_group and its child task-groups in a fair manner,
@@ -10300,7 +8541,7 @@ void normalize_rt_tasks(void)
#if defined(CONFIG_KGDB_KDB)
/*
- * These functions are only useful for kdb.
+ * These functions are only useful for KDB.
*
* They can only be called when the whole system has been
* stopped - every CPU needs to be quiescent, and no scheduling
@@ -10408,7 +8649,7 @@ void sched_online_group(struct task_group *tg, struct task_group *parent)
online_fair_sched_group(tg);
}
-/* rcu callback to free various structures associated with a task group */
+/* RCU callback to free various structures associated with a task group */
static void sched_unregister_group_rcu(struct rcu_head *rhp)
{
/* Now it should be safe to free those cfs_rqs: */
@@ -11526,10 +9767,10 @@ const int sched_prio_to_weight[40] = {
};
/*
- * Inverse (2^32/x) values of the sched_prio_to_weight[] array, precalculated.
+ * Inverse (2^32/x) values of the sched_prio_to_weight[] array, pre-calculated.
*
* In cases where the weight does not change often, we can use the
- * precalculated inverse to speed up arithmetics by turning divisions
+ * pre-calculated inverse to speed up arithmetic by turning divisions
* into multiplications:
*/
const u32 sched_prio_to_wmult[40] = {
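As a quick illustration of the division-to-multiplication trick described above: dividing by a weight w is equivalent to multiplying by its pre-calculated inverse 2^32/w and shifting right by 32. A standalone sketch, using the nice-0 values weight = 1024 and inverse = 4194304 from the two arrays:

#include <stdio.h>
#include <stdint.h>

int main(void)
{
	uint64_t delta  = 3000000;	/* some delta_exec, in ns              */
	uint32_t weight = 1024;		/* sched_prio_to_weight[20] (nice 0)   */
	uint32_t inv_w  = 4194304;	/* sched_prio_to_wmult[20] = 2^32/1024 */

	uint64_t by_div = delta / weight;
	uint64_t by_mul = (delta * (uint64_t)inv_w) >> 32;

	printf("%llu %llu\n",
	       (unsigned long long)by_div,
	       (unsigned long long)by_mul);	/* both print 2929 */
	return 0;
}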
@@ -11785,16 +10026,16 @@ void sched_mm_cid_migrate_to(struct rq *dst_rq, struct task_struct *t)
/*
* Move the src cid if the dst cid is unset. This keeps id
* allocation closest to 0 in cases where few threads migrate around
- * many cpus.
+ * many CPUs.
*
* If destination cid is already set, we may have to just clear
* the src cid to ensure compactness in frequent migrations
* scenarios.
*
* It is not useful to clear the src cid when the number of threads is
- * greater or equal to the number of allowed cpus, because user-space
+ * greater or equal to the number of allowed CPUs, because user-space
* can expect that the number of allowed cids can reach the number of
- * allowed cpus.
+ * allowed CPUs.
*/
dst_pcpu_cid = per_cpu_ptr(mm->pcpu_cid, cpu_of(dst_rq));
dst_cid = READ_ONCE(dst_pcpu_cid->cid);
diff --git a/kernel/sched/core_sched.c b/kernel/sched/core_sched.c
index a57fd8f27498..1ef98a93eb1d 100644
--- a/kernel/sched/core_sched.c
+++ b/kernel/sched/core_sched.c
@@ -279,7 +279,7 @@ void __sched_core_account_forceidle(struct rq *rq)
continue;
/*
- * Note: this will account forceidle to the current cpu, even
+ * Note: this will account forceidle to the current CPU, even
* if it comes from our SMT sibling.
*/
__account_forceidle_time(p, delta);
diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c
index aa48b2ec879d..a5e00293ae43 100644
--- a/kernel/sched/cputime.c
+++ b/kernel/sched/cputime.c
@@ -14,11 +14,11 @@
* They are only modified in vtime_account, on corresponding CPU
* with interrupts disabled. So, writes are safe.
* They are read and saved off onto struct rq in update_rq_clock().
- * This may result in other CPU reading this CPU's irq time and can
+ * This may result in another CPU reading this CPU's IRQ time and can
* race with irq/vtime_account on this CPU. We would either get old
- * or new value with a side effect of accounting a slice of irq time to wrong
- * task when irq is in progress while we read rq->clock. That is a worthy
- * compromise in place of having locks on each irq in account_system_time.
+ * or new value with a side effect of accounting a slice of IRQ time to the
+ * wrong task when an IRQ is in progress while we read rq->clock. That is a worthy
+ * compromise in place of having locks on each IRQ in account_system_time.
*/
DEFINE_PER_CPU(struct irqtime, cpu_irqtime);
@@ -269,7 +269,7 @@ static __always_inline u64 steal_account_process_time(u64 maxtime)
}
/*
- * Account how much elapsed time was spent in steal, irq, or softirq time.
+ * Account how much elapsed time was spent in steal, IRQ, or softirq time.
*/
static inline u64 account_other_time(u64 max)
{
@@ -370,7 +370,7 @@ void thread_group_cputime(struct task_struct *tsk, struct task_cputime *times)
* Check for hardirq is done both for system and user time as there is
* no timer going off while we are on hardirq and hence we may never get an
* opportunity to update it solely in system time.
- * p->stime and friends are only updated on system time and not on irq
+ * p->stime and friends are only updated on system time and not on IRQ
* softirq as those do not count in task exec_runtime any more.
*/
static void irqtime_account_process_tick(struct task_struct *p, int user_tick,
@@ -380,7 +380,7 @@ static void irqtime_account_process_tick(struct task_struct *p, int user_tick,
/*
* When returning from idle, many ticks can get accounted at
- * once, including some ticks of steal, irq, and softirq time.
+ * once, including some ticks of steal, IRQ, and softirq time.
* Subtract those ticks from the amount of time accounted to
* idle, or potentially user or system time. Due to rounding,
* other time can exceed ticks occasionally.
diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
index c75d1307d86d..f59e5c19d944 100644
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -708,7 +708,7 @@ static struct rq *dl_task_offline_migration(struct rq *rq, struct task_struct *p
}
/*
- * And we finally need to fixup root_domain(s) bandwidth accounting,
+ * And we finally need to fix up root_domain(s) bandwidth accounting,
* since p is still hanging out in the old (now moved to default) root
* domain.
*/
@@ -992,7 +992,7 @@ static inline bool dl_is_implicit(struct sched_dl_entity *dl_se)
* is detected, the runtime and deadline need to be updated.
*
* If the task has an implicit deadline, i.e., deadline == period, the Original
- * CBS is applied. the runtime is replenished and a new absolute deadline is
+ * CBS is applied. The runtime is replenished and a new absolute deadline is
* set, as in the previous cases.
*
* However, the Original CBS does not work properly for tasks with
@@ -1294,7 +1294,7 @@ int dl_runtime_exceeded(struct sched_dl_entity *dl_se)
* Since rq->dl.running_bw and rq->dl.this_bw contain utilizations multiplied
* by 2^BW_SHIFT, the result has to be shifted right by BW_SHIFT.
* Since rq->dl.bw_ratio contains 1 / Umax multiplied by 2^RATIO_SHIFT, dl_bw
- * is multiped by rq->dl.bw_ratio and shifted right by RATIO_SHIFT.
+ * is multiplied by rq->dl.bw_ratio and shifted right by RATIO_SHIFT.
* Since delta is a 64 bit variable, to have an overflow its value should be
* larger than 2^(64 - 20 - 8), which is more than 64 seconds. So, overflow is
* not an issue here.
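A standalone sketch of the fixed-point arithmetic described above (illustrative only, not the kernel's exact GRUB reclaim formula): utilizations carry BW_SHIFT = 20 fractional bits and bw_ratio carries RATIO_SHIFT = 8, so each multiplication is followed by the matching right shift, and the overflow bound quoted in the comment is 2^(64 - 20 - 8) ns, roughly 68.7 seconds.

#include <stdio.h>
#include <stdint.h>

#define BW_SHIFT	20	/* utilizations are stored as u * 2^20  */
#define RATIO_SHIFT	 8	/* bw_ratio is stored as (1/Umax) * 2^8 */

int main(void)
{
	uint64_t delta = 2000000;			/* 2 ms of runtime, in ns */
	uint64_t u     = 1 << (BW_SHIFT - 1);		/* utilization 0.5        */
	uint64_t ratio = (1 << RATIO_SHIFT) * 100 / 95;	/* ~1/0.95                */

	/* delta scaled by the running bandwidth, then by 1/Umax: */
	uint64_t scaled = ((delta * u) >> BW_SHIFT) * ratio >> RATIO_SHIFT;

	printf("%llu\n", (unsigned long long)scaled);	/* ~1.05 ms */

	/* Overflow bound from the comment: 2^(64 - 20 - 8) ns, ~68.7 s */
	printf("%llu\n",
	       (unsigned long long)(1ULL << (64 - BW_SHIFT - RATIO_SHIFT)));
	return 0;
}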
@@ -1804,8 +1804,13 @@ static void enqueue_task_dl(struct rq *rq, struct task_struct *p, int flags)
* The replenish timer needs to be canceled. No
* problem if it fires concurrently: boosted threads
* are ignored in dl_task_timer().
+ *
+ * If the timer callback was running (hrtimer_try_to_cancel == -1),
+ * it will eventually call put_task_struct().
*/
- hrtimer_try_to_cancel(&p->dl.dl_timer);
+ if (hrtimer_try_to_cancel(&p->dl.dl_timer) == 1 &&
+ !dl_server(&p->dl))
+ put_task_struct(p);
p->dl.dl_throttled = 0;
}
} else if (!dl_prio(p->normal_prio)) {
@@ -2488,7 +2493,7 @@ static void pull_dl_task(struct rq *this_rq)
src_rq = cpu_rq(cpu);
/*
- * It looks racy, abd it is! However, as in sched_rt.c,
+ * It looks racy, and it is! However, as in sched_rt.c,
* we are fine with this.
*/
if (this_rq->dl.dl_nr_running &&
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 8a5b1ae0aa55..9057584ec06d 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -61,7 +61,7 @@
* Options are:
*
* SCHED_TUNABLESCALING_NONE - unscaled, always *1
- * SCHED_TUNABLESCALING_LOG - scaled logarithmical, *1+ilog(ncpus)
+ * SCHED_TUNABLESCALING_LOG - scaled logarithmically, *1+ilog(ncpus)
 * SCHED_TUNABLESCALING_LINEAR - scaled linearly, *ncpus
*
* (default SCHED_TUNABLESCALING_LOG = *(1+ilog(ncpus))
@@ -3835,15 +3835,14 @@ static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se,
}
}
-void reweight_task(struct task_struct *p, int prio)
+void reweight_task(struct task_struct *p, const struct load_weight *lw)
{
struct sched_entity *se = &p->se;
struct cfs_rq *cfs_rq = cfs_rq_of(se);
struct load_weight *load = &se->load;
- unsigned long weight = scale_load(sched_prio_to_weight[prio]);
- reweight_entity(cfs_rq, se, weight);
- load->inv_weight = sched_prio_to_wmult[prio];
+ reweight_entity(cfs_rq, se, lw->weight);
+ load->inv_weight = lw->inv_weight;
}
static inline int throttled_hierarchy(struct cfs_rq *cfs_rq);
@@ -8719,7 +8718,7 @@ static bool yield_to_task_fair(struct rq *rq, struct task_struct *p)
* topology where each level pairs two lower groups (or better). This results
* in O(log n) layers. Furthermore we reduce the number of CPUs going up the
* tree to only the first of the previous level and we decrease the frequency
- * of load-balance at each level inv. proportional to the number of CPUs in
+ * of load-balance at each level inversely proportional to the number of CPUs in
* the groups.
*
* This yields:
@@ -9149,12 +9148,8 @@ static int detach_tasks(struct lb_env *env)
break;
env->loop++;
- /*
- * We've more or less seen every task there is, call it quits
- * unless we haven't found any movable task yet.
- */
- if (env->loop > env->loop_max &&
- !(env->flags & LBF_ALL_PINNED))
+ /* We've more or less seen every task there is, call it quits */
+ if (env->loop > env->loop_max)
break;
/* take a breather every nr_migrate tasks */
@@ -11393,9 +11388,7 @@ more_balance:
if (env.flags & LBF_NEED_BREAK) {
env.flags &= ~LBF_NEED_BREAK;
- /* Stop if we tried all running tasks */
- if (env.loop < busiest->nr_running)
- goto more_balance;
+ goto more_balance;
}
/*
@@ -11892,6 +11885,13 @@ static void kick_ilb(unsigned int flags)
return;
/*
+ * Don't bother if there are no new NOHZ balance work items for ilb_cpu,
+ * i.e. all bits in flags are already set in ilb_cpu.
+ */
+ if ((atomic_read(nohz_flags(ilb_cpu)) & flags) == flags)
+ return;
+
+ /*
* Access to rq::nohz_csd is serialized by NOHZ_KICK_MASK; he who sets
* the first flag owns it; cleared by nohz_csd_func().
*/
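The early return added above is a plain bit-subset test: the kick is skipped when every requested flag is already pending on the ILB CPU. A minimal standalone illustration (names hypothetical; the atomic word stands in for the per-CPU nohz flags):

#include <stdatomic.h>
#include <stdio.h>

static atomic_uint nohz_flags;	/* stand-in for the per-CPU nohz flags word */

/* Return 1 when every bit in @flags is already set in @nohz_flags. */
static int already_pending(unsigned int flags)
{
	return (atomic_load(&nohz_flags) & flags) == flags;
}

int main(void)
{
	atomic_store(&nohz_flags, 0x3);
	printf("%d %d\n",
	       already_pending(0x1),	/* 1: both bits already set   */
	       already_pending(0x5));	/* 0: bit 0x4 not yet pending */
	return 0;
}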
diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c
index 6135fbe83d68..6e78d071beb5 100644
--- a/kernel/sched/idle.c
+++ b/kernel/sched/idle.c
@@ -172,19 +172,13 @@ static void cpuidle_idle_call(void)
/*
* Check if the idle task must be rescheduled. If it is the
- * case, exit the function after re-enabling the local irq.
+ * case, exit the function after re-enabling the local IRQ.
*/
if (need_resched()) {
local_irq_enable();
return;
}
- /*
- * The RCU framework needs to be told that we are entering an idle
- * section, so no more rcu read side critical sections and one more
- * step to the grace period
- */
-
if (cpuidle_not_available(drv, dev)) {
tick_nohz_idle_stop_tick();
@@ -244,7 +238,7 @@ exit_idle:
__current_set_polling();
/*
- * It is up to the idle functions to reenable local interrupts
+ * It is up to the idle functions to re-enable local interrupts
*/
if (WARN_ON_ONCE(irqs_disabled()))
local_irq_enable();
@@ -320,7 +314,7 @@ static void do_idle(void)
rcu_nocb_flush_deferred_wakeup();
/*
- * In poll mode we reenable interrupts and spin. Also if we
+ * In poll mode we re-enable interrupts and spin. Also if we
* detected in the wakeup from idle path that the tick
* broadcast device expired for us, we don't want to go deep
* idle as we know that the IPI is going to arrive right away.
diff --git a/kernel/sched/loadavg.c b/kernel/sched/loadavg.c
index ca9da66cc894..c48900b856a2 100644
--- a/kernel/sched/loadavg.c
+++ b/kernel/sched/loadavg.c
@@ -45,7 +45,7 @@
* again, being late doesn't lose the delta, just wrecks the sample.
*
* - cpu_rq()->nr_uninterruptible isn't accurately tracked per-CPU because
- * this would add another cross-CPU cacheline miss and atomic operation
+ * this would add another cross-CPU cache-line miss and atomic operation
* to the wakeup path. Instead we increment on whatever CPU the task ran
* when it went into uninterruptible state and decrement on whatever CPU
* did the wakeup. This means that only the sum of nr_uninterruptible over
@@ -62,7 +62,7 @@ EXPORT_SYMBOL(avenrun); /* should be removed */
/**
* get_avenrun - get the load average array
- * @loads: pointer to dest load array
+ * @loads: pointer to destination load array
* @offset: offset to add
* @shift: shift count to shift the result left
*
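
The get_avenrun() kerneldoc touched above returns load averages in the kernel's FSHIFT fixed-point format (FIXED_1 == 1 << 11). A short sketch of how such values are conventionally decoded for display, along the lines of the LOAD_INT()/LOAD_FRAC() helpers used by /proc/loadavg (constants restated locally; the sample value is made up):

#include <stdio.h>

#define FSHIFT   11                   /* fixed-point shift for load averages */
#define FIXED_1  (1UL << FSHIFT)      /* 1.0 in fixed point, i.e. 2048 */

#define LOAD_INT(x)  ((x) >> FSHIFT)
#define LOAD_FRAC(x) LOAD_INT(((x) & (FIXED_1 - 1)) * 100)

int main(void)
{
	/* e.g. a 1-minute average of ~0.52 is stored as 0.52 * 2048 ~= 1065 */
	unsigned long avnrun = 1065;

	printf("load: %lu.%02lu\n", LOAD_INT(avnrun), LOAD_FRAC(avnrun));   /* load: 0.52 */
	return 0;
}
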
diff --git a/kernel/sched/pelt.c b/kernel/sched/pelt.c
index ef00382de595..fa52906a4478 100644
--- a/kernel/sched/pelt.c
+++ b/kernel/sched/pelt.c
@@ -417,7 +417,7 @@ int update_hw_load_avg(u64 now, struct rq *rq, u64 capacity)
#ifdef CONFIG_HAVE_SCHED_AVG_IRQ
/*
- * irq:
+ * IRQ:
*
* util_sum = \Sum se->avg.util_sum but se->avg.util_sum is not tracked
* util_sum = cpu_scale * load_sum
@@ -432,7 +432,7 @@ int update_irq_load_avg(struct rq *rq, u64 running)
int ret = 0;
/*
- * We can't use clock_pelt because irq time is not accounted in
+ * We can't use clock_pelt because IRQ time is not accounted in
* clock_task. Instead we directly scale the running time to
* reflect the real amount of computation
*/
diff --git a/kernel/sched/psi.c b/kernel/sched/psi.c
index 7b4aa5809c0f..020d58967d4e 100644
--- a/kernel/sched/psi.c
+++ b/kernel/sched/psi.c
@@ -41,7 +41,7 @@
* What it means for a task to be productive is defined differently
* for each resource. For IO, productive means a running task. For
* memory, productive means a running task that isn't a reclaimer. For
- * CPU, productive means an oncpu task.
+ * CPU, productive means an on-CPU task.
*
* Naturally, the FULL state doesn't exist for the CPU resource at the
* system level, but exists at the cgroup level. At the cgroup level,
@@ -49,7 +49,7 @@
* resource which is being used by others outside of the cgroup or
* throttled by the cgroup cpu.max configuration.
*
- * The percentage of wallclock time spent in those compound stall
+ * The percentage of wall clock time spent in those compound stall
* states gives pressure numbers between 0 and 100 for each resource,
* where the SOME percentage indicates workload slowdowns and the FULL
* percentage indicates reduced CPU utilization:
@@ -218,28 +218,32 @@ void __init psi_init(void)
group_init(&psi_system);
}
-static bool test_state(unsigned int *tasks, enum psi_states state, bool oncpu)
+static u32 test_states(unsigned int *tasks, u32 state_mask)
{
- switch (state) {
- case PSI_IO_SOME:
- return unlikely(tasks[NR_IOWAIT]);
- case PSI_IO_FULL:
- return unlikely(tasks[NR_IOWAIT] && !tasks[NR_RUNNING]);
- case PSI_MEM_SOME:
- return unlikely(tasks[NR_MEMSTALL]);
- case PSI_MEM_FULL:
- return unlikely(tasks[NR_MEMSTALL] &&
- tasks[NR_RUNNING] == tasks[NR_MEMSTALL_RUNNING]);
- case PSI_CPU_SOME:
- return unlikely(tasks[NR_RUNNING] > oncpu);
- case PSI_CPU_FULL:
- return unlikely(tasks[NR_RUNNING] && !oncpu);
- case PSI_NONIDLE:
- return tasks[NR_IOWAIT] || tasks[NR_MEMSTALL] ||
- tasks[NR_RUNNING];
- default:
- return false;
+ const bool oncpu = state_mask & PSI_ONCPU;
+
+ if (tasks[NR_IOWAIT]) {
+ state_mask |= BIT(PSI_IO_SOME);
+ if (!tasks[NR_RUNNING])
+ state_mask |= BIT(PSI_IO_FULL);
+ }
+
+ if (tasks[NR_MEMSTALL]) {
+ state_mask |= BIT(PSI_MEM_SOME);
+ if (tasks[NR_RUNNING] == tasks[NR_MEMSTALL_RUNNING])
+ state_mask |= BIT(PSI_MEM_FULL);
}
+
+ if (tasks[NR_RUNNING] > oncpu)
+ state_mask |= BIT(PSI_CPU_SOME);
+
+ if (tasks[NR_RUNNING] && !oncpu)
+ state_mask |= BIT(PSI_CPU_FULL);
+
+ if (tasks[NR_IOWAIT] || tasks[NR_MEMSTALL] || tasks[NR_RUNNING])
+ state_mask |= BIT(PSI_NONIDLE);
+
+ return state_mask;
}
static void get_recent_times(struct psi_group *group, int cpu,
@@ -345,7 +349,7 @@ static void collect_percpu_times(struct psi_group *group,
/*
* Collect the per-cpu time buckets and average them into a
- * single time sample that is normalized to wallclock time.
+ * single time sample that is normalized to wall clock time.
*
* For averaging, each CPU is weighted by its non-idle time in
* the sampling period. This eliminates artifacts from uneven
@@ -770,9 +774,9 @@ static void psi_group_change(struct psi_group *group, int cpu,
{
struct psi_group_cpu *groupc;
unsigned int t, m;
- enum psi_states s;
u32 state_mask;
+ lockdep_assert_rq_held(cpu_rq(cpu));
groupc = per_cpu_ptr(group->pcpu, cpu);
/*
@@ -841,10 +845,7 @@ static void psi_group_change(struct psi_group *group, int cpu,
return;
}
- for (s = 0; s < NR_PSI_STATES; s++) {
- if (test_state(groupc->tasks, s, state_mask & PSI_ONCPU))
- state_mask |= (1 << s);
- }
+ state_mask = test_states(groupc->tasks, state_mask);
/*
* Since we care about lost potential, a memstall is FULL
@@ -991,22 +992,32 @@ void psi_task_switch(struct task_struct *prev, struct task_struct *next,
}
#ifdef CONFIG_IRQ_TIME_ACCOUNTING
-void psi_account_irqtime(struct task_struct *task, u32 delta)
+void psi_account_irqtime(struct rq *rq, struct task_struct *curr, struct task_struct *prev)
{
- int cpu = task_cpu(task);
+ int cpu = task_cpu(curr);
struct psi_group *group;
struct psi_group_cpu *groupc;
- u64 now;
+ u64 now, irq;
+ s64 delta;
if (static_branch_likely(&psi_disabled))
return;
- if (!task->pid)
+ if (!curr->pid)
+ return;
+
+ lockdep_assert_rq_held(rq);
+ group = task_psi_group(curr);
+ if (prev && task_psi_group(prev) == group)
return;
now = cpu_clock(cpu);
+ irq = irq_time_read(cpu);
+ delta = (s64)(irq - rq->psi_irq_time);
+ if (delta < 0)
+ return;
+ rq->psi_irq_time = irq;
- group = task_psi_group(task);
do {
if (!group->enabled)
continue;
@@ -1194,7 +1205,7 @@ void psi_cgroup_restart(struct psi_group *group)
/*
* After we disable psi_group->enabled, we don't actually
* stop percpu tasks accounting in each psi_group_cpu,
- * instead only stop test_state() loop, record_times()
+ * instead only stop test_states() loop, record_times()
* and averaging worker, see psi_group_change() for details.
*
* When disabling cgroup PSI, this function has nothing to sync
@@ -1202,7 +1213,7 @@ void psi_cgroup_restart(struct psi_group *group)
* would see !psi_group->enabled and only do task accounting.
*
* When re-enabling cgroup PSI, this function uses psi_group_change()
- * to get correct state mask from test_state() loop on tasks[],
+ * to get correct state mask from test_states() loop on tasks[],
* and restart groupc->state_start from now, use .clear = .set = 0
* here since no task status really changed.
*/
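
The reworked psi_account_irqtime() above derives the IRQ stall delta itself instead of receiving it from the caller: it reads the CPU's cumulative IRQ time, subtracts the per-runqueue snapshot in rq->psi_irq_time, discards apparent negative deltas, and advances the snapshot. A minimal user-space model of that bookkeeping (the counter source is a stand-in for irq_time_read(), not the kernel code):

#include <stdint.h>
#include <stdio.h>

struct rq_model {
	uint64_t psi_irq_time;      /* last cumulative IRQ time accounted */
};

/*
 * Returns the IRQ time newly accumulated since the previous call,
 * or 0 when the counter appears to have gone backwards.
 */
static int64_t psi_irq_delta(struct rq_model *rq, uint64_t irq_now)
{
	int64_t delta = (int64_t)(irq_now - rq->psi_irq_time);

	if (delta < 0)
		return 0;

	rq->psi_irq_time = irq_now;   /* advance the snapshot */
	return delta;
}

int main(void)
{
	struct rq_model rq = { .psi_irq_time = 1000 };

	printf("%lld\n", (long long)psi_irq_delta(&rq, 1500)); /* 500 */
	printf("%lld\n", (long long)psi_irq_delta(&rq, 1400)); /* 0: ignored, snapshot kept */
	printf("%lld\n", (long long)psi_irq_delta(&rq, 1600)); /* 100 */
	return 0;
}
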
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
index aa4c1c874fa4..310523c1b9e3 100644
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -26,9 +26,9 @@ int sysctl_sched_rt_runtime = 950000;
#ifdef CONFIG_SYSCTL
static int sysctl_sched_rr_timeslice = (MSEC_PER_SEC * RR_TIMESLICE) / HZ;
-static int sched_rt_handler(struct ctl_table *table, int write, void *buffer,
+static int sched_rt_handler(const struct ctl_table *table, int write, void *buffer,
size_t *lenp, loff_t *ppos);
-static int sched_rr_handler(struct ctl_table *table, int write, void *buffer,
+static int sched_rr_handler(const struct ctl_table *table, int write, void *buffer,
size_t *lenp, loff_t *ppos);
static struct ctl_table sched_rt_sysctls[] = {
{
@@ -140,7 +140,7 @@ void init_rt_rq(struct rt_rq *rt_rq)
INIT_LIST_HEAD(array->queue + i);
__clear_bit(i, array->bitmap);
}
- /* delimiter for bitsearch: */
+ /* delimiter for bit-search: */
__set_bit(MAX_RT_PRIO, array->bitmap);
#if defined CONFIG_SMP
@@ -1135,7 +1135,7 @@ dec_rt_prio(struct rt_rq *rt_rq, int prio)
/*
* This may have been our highest task, and therefore
- * we may have some recomputation to do
+ * we may have some re-computation to do
*/
if (prio == prev_prio) {
struct rt_prio_array *array = &rt_rq->active;
@@ -1571,7 +1571,7 @@ select_task_rq_rt(struct task_struct *p, int cpu, int flags)
*
* For equal prio tasks, we just let the scheduler sort it out.
*
- * Otherwise, just let it ride on the affined RQ and the
+ * Otherwise, just let it ride on the affine RQ and the
* post-schedule router will push the preempted task away
*
* This test is optimistic, if we get it wrong the load-balancer
@@ -2147,14 +2147,14 @@ static void push_rt_tasks(struct rq *rq)
* if it's the only CPU with multiple RT tasks queued, and a large number
* of CPUs scheduling a lower priority task at the same time.
*
- * Each root domain has its own irq work function that can iterate over
+ * Each root domain has its own IRQ work function that can iterate over
* all CPUs with RT overloaded tasks. Since all CPUs with overloaded RT
* tasks must be checked if there's one or many CPUs that are lowering
- * their priority, there's a single irq work iterator that will try to
+ * their priority, there's a single IRQ work iterator that will try to
* push off RT tasks that are waiting to run.
*
* When a CPU schedules a lower priority task, it will kick off the
- * irq work iterator that will jump to each CPU with overloaded RT tasks.
+ * IRQ work iterator that will jump to each CPU with overloaded RT tasks.
* As it only takes the first CPU that schedules a lower priority task
* to start the process, the rto_start variable is incremented and if
* the atomic result is one, then that CPU will try to take the rto_lock.
@@ -2162,7 +2162,7 @@ static void push_rt_tasks(struct rq *rq)
* CPUs scheduling lower priority tasks.
*
* All CPUs that are scheduling a lower priority task will increment the
- * rt_loop_next variable. This will make sure that the irq work iterator
+ * rt_loop_next variable. This will make sure that the IRQ work iterator
* checks all RT overloaded CPUs whenever a CPU schedules a new lower
* priority task, even if the iterator is in the middle of a scan. Incrementing
* the rt_loop_next will cause the iterator to perform another scan.
@@ -2242,7 +2242,7 @@ static void tell_cpu_to_push(struct rq *rq)
* The rto_cpu is updated under the lock, if it has a valid CPU
* then the IPI is still running and will continue due to the
* update to loop_next, and nothing needs to be done here.
- * Otherwise it is finishing up and an ipi needs to be sent.
+ * Otherwise it is finishing up and an IPI needs to be sent.
*/
if (rq->rd->rto_cpu < 0)
cpu = rto_next_cpu(rq->rd);
@@ -2594,7 +2594,7 @@ static void task_tick_rt(struct rq *rq, struct task_struct *p, int queued)
watchdog(rq, p);
/*
- * RR tasks need a special form of timeslice management.
+ * RR tasks need a special form of time-slice management.
* FIFO tasks have no timeslices.
*/
if (p->policy != SCHED_RR)
@@ -2900,7 +2900,7 @@ static int sched_rt_global_constraints(void)
int sched_rt_can_attach(struct task_group *tg, struct task_struct *tsk)
{
- /* Don't accept realtime tasks when there is no way for them to run */
+ /* Don't accept real-time tasks when there is no way for them to run */
if (rt_task(tsk) && tg->rt_bandwidth.rt_runtime == 0)
return 0;
@@ -2952,7 +2952,7 @@ static void sched_rt_do_global(void)
raw_spin_unlock_irqrestore(&def_rt_bandwidth.rt_runtime_lock, flags);
}
-static int sched_rt_handler(struct ctl_table *table, int write, void *buffer,
+static int sched_rt_handler(const struct ctl_table *table, int write, void *buffer,
size_t *lenp, loff_t *ppos)
{
int old_period, old_runtime;
@@ -2991,7 +2991,7 @@ undo:
return ret;
}
-static int sched_rr_handler(struct ctl_table *table, int write, void *buffer,
+static int sched_rr_handler(const struct ctl_table *table, int write, void *buffer,
size_t *lenp, loff_t *ppos)
{
int ret;
@@ -3001,7 +3001,7 @@ static int sched_rr_handler(struct ctl_table *table, int write, void *buffer,
ret = proc_dointvec(table, write, buffer, lenp, ppos);
/*
* Make sure that internally we keep jiffies.
- * Also, writing zero resets the timeslice to default:
+ * Also, writing zero resets the time-slice to default:
*/
if (!ret && write) {
sched_rr_timeslice =
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index a831af102070..4c36cc680361 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -74,6 +74,12 @@
#include "../workqueue_internal.h"
+struct rq;
+struct cfs_rq;
+struct rt_rq;
+struct sched_group;
+struct cpuidle_state;
+
#ifdef CONFIG_PARAVIRT
# include <asm/paravirt.h>
# include <asm/paravirt_api_clock.h>
@@ -90,9 +96,6 @@
# define SCHED_WARN_ON(x) ({ (void)(x), 0; })
#endif
-struct rq;
-struct cpuidle_state;
-
/* task_struct::on_rq states: */
#define TASK_ON_RQ_QUEUED 1
#define TASK_ON_RQ_MIGRATING 2
@@ -128,12 +131,12 @@ extern struct list_head asym_cap_list;
/*
* Helpers for converting nanosecond timing to jiffy resolution
*/
-#define NS_TO_JIFFIES(TIME) ((unsigned long)(TIME) / (NSEC_PER_SEC / HZ))
+#define NS_TO_JIFFIES(time) ((unsigned long)(time) / (NSEC_PER_SEC/HZ))
/*
* Increase resolution of nice-level calculations for 64-bit architectures.
* The extra resolution improves shares distribution and load balancing of
- * low-weight task groups (eg. nice +19 on an autogroup), deeper taskgroup
+ * low-weight task groups (eg. nice +19 on an autogroup), deeper task-group
* hierarchies, especially on larger systems. This is not a user-visible change
* and does not change the user-interface for setting shares/weights.
*
@@ -147,12 +150,13 @@ extern struct list_head asym_cap_list;
#ifdef CONFIG_64BIT
# define NICE_0_LOAD_SHIFT (SCHED_FIXEDPOINT_SHIFT + SCHED_FIXEDPOINT_SHIFT)
# define scale_load(w) ((w) << SCHED_FIXEDPOINT_SHIFT)
-# define scale_load_down(w) \
-({ \
- unsigned long __w = (w); \
- if (__w) \
- __w = max(2UL, __w >> SCHED_FIXEDPOINT_SHIFT); \
- __w; \
+# define scale_load_down(w) \
+({ \
+ unsigned long __w = (w); \
+ \
+ if (__w) \
+ __w = max(2UL, __w >> SCHED_FIXEDPOINT_SHIFT); \
+ __w; \
})
#else
# define NICE_0_LOAD_SHIFT (SCHED_FIXEDPOINT_SHIFT)
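
The scale_load_down() change above only reflows the macro; the behaviour is unchanged: on 64-bit, task weights are shifted up by SCHED_FIXEDPOINT_SHIFT for extra resolution and clamped to a minimum of 2 on the way back down, while 32-bit keeps the plain weights. A small illustrative model of the 64-bit path (constants restated locally for illustration, not pulled from the kernel headers):

#include <stdio.h>

#define SCHED_FIXEDPOINT_SHIFT 10      /* extra resolution used for 64-bit load scaling */

static unsigned long scale_load(unsigned long w)
{
	return w << SCHED_FIXEDPOINT_SHIFT;
}

static unsigned long scale_load_down(unsigned long w)
{
	if (w) {
		unsigned long s = w >> SCHED_FIXEDPOINT_SHIFT;

		w = s > 2 ? s : 2;     /* max(2UL, w >> SCHED_FIXEDPOINT_SHIFT) */
	}
	return w;
}

int main(void)
{
	unsigned long nice0 = 1024;                      /* nice-0 weight */
	unsigned long hi = scale_load(nice0);            /* 1048576 */

	printf("%lu %lu\n", hi, scale_load_down(hi));    /* 1048576 1024: round trip */
	printf("%lu\n", scale_load_down(3));             /* 2: tiny weights clamp to 2 */
	return 0;
}
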
@@ -187,6 +191,7 @@ static inline int idle_policy(int policy)
{
return policy == SCHED_IDLE;
}
+
static inline int fair_policy(int policy)
{
return policy == SCHED_NORMAL || policy == SCHED_BATCH;
@@ -201,6 +206,7 @@ static inline int dl_policy(int policy)
{
return policy == SCHED_DEADLINE;
}
+
static inline bool valid_policy(int policy)
{
return idle_policy(policy) || fair_policy(policy) ||
@@ -222,11 +228,12 @@ static inline int task_has_dl_policy(struct task_struct *p)
return dl_policy(p->policy);
}
-#define cap_scale(v, s) ((v)*(s) >> SCHED_CAPACITY_SHIFT)
+#define cap_scale(v, s) ((v)*(s) >> SCHED_CAPACITY_SHIFT)
static inline void update_avg(u64 *avg, u64 sample)
{
s64 diff = sample - *avg;
+
*avg += diff / 8;
}
@@ -251,7 +258,7 @@ static inline void update_avg(u64 *avg, u64 sample)
*/
#define SCHED_FLAG_SUGOV 0x10000000
-#define SCHED_DL_FLAGS (SCHED_FLAG_RECLAIM | SCHED_FLAG_DL_OVERRUN | SCHED_FLAG_SUGOV)
+#define SCHED_DL_FLAGS (SCHED_FLAG_RECLAIM | SCHED_FLAG_DL_OVERRUN | SCHED_FLAG_SUGOV)
static inline bool dl_entity_is_special(const struct sched_dl_entity *dl_se)
{
@@ -358,9 +365,6 @@ extern void dl_server_init(struct sched_dl_entity *dl_se, struct rq *rq,
#ifdef CONFIG_CGROUP_SCHED
-struct cfs_rq;
-struct rt_rq;
-
extern struct list_head task_groups;
struct cfs_bandwidth {
@@ -406,7 +410,7 @@ struct task_group {
#ifdef CONFIG_SMP
/*
* load_avg can be heavily contended at clock tick time, so put
- * it in its own cacheline separated from the fields above which
+ * it in its own cache-line separated from the fields above which
* will also be accessed at each tick.
*/
atomic_long_t load_avg ____cacheline_aligned;
@@ -536,6 +540,7 @@ static inline void set_task_rq_fair(struct sched_entity *se,
#else /* CONFIG_CGROUP_SCHED */
struct cfs_bandwidth { };
+
static inline bool cfs_task_bw_constrained(struct task_struct *p) { return false; }
#endif /* CONFIG_CGROUP_SCHED */
@@ -551,8 +556,8 @@ extern int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent
* applicable for 32-bits architectures.
*/
#ifdef CONFIG_64BIT
-# define u64_u32_load_copy(var, copy) var
-# define u64_u32_store_copy(var, copy, val) (var = val)
+# define u64_u32_load_copy(var, copy) var
+# define u64_u32_store_copy(var, copy, val) (var = val)
#else
# define u64_u32_load_copy(var, copy) \
({ \
@@ -580,8 +585,8 @@ do { \
copy = __val; \
} while (0)
#endif
-# define u64_u32_load(var) u64_u32_load_copy(var, var##_copy)
-# define u64_u32_store(var, val) u64_u32_store_copy(var, var##_copy, val)
+# define u64_u32_load(var) u64_u32_load_copy(var, var##_copy)
+# define u64_u32_store(var, val) u64_u32_store_copy(var, var##_copy, val)
/* CFS-related fields in a runqueue */
struct cfs_rq {
@@ -803,6 +808,7 @@ struct dl_rq {
};
#ifdef CONFIG_FAIR_GROUP_SCHED
+
/* An entity is a task if it doesn't "own" a runqueue */
#define entity_is_task(se) (!se->my_q)
@@ -820,16 +826,18 @@ static inline long se_runnable(struct sched_entity *se)
return se->runnable_weight;
}
-#else
+#else /* !CONFIG_FAIR_GROUP_SCHED: */
+
#define entity_is_task(se) 1
-static inline void se_update_runnable(struct sched_entity *se) {}
+static inline void se_update_runnable(struct sched_entity *se) { }
static inline long se_runnable(struct sched_entity *se)
{
return !!se->on_rq;
}
-#endif
+
+#endif /* !CONFIG_FAIR_GROUP_SCHED */
#ifdef CONFIG_SMP
/*
@@ -874,7 +882,7 @@ struct root_domain {
*/
bool overloaded;
- /* Indicate one or more cpus over-utilized (tipping point) */
+ /* Indicate one or more CPUs over-utilized (tipping point) */
bool overutilized;
/*
@@ -988,7 +996,6 @@ struct uclamp_rq {
DECLARE_STATIC_KEY_FALSE(sched_uclamp_used);
#endif /* CONFIG_UCLAMP_TASK */
-struct rq;
struct balance_callback {
struct balance_callback *next;
void (*func)(struct rq *rq);
@@ -1126,6 +1133,7 @@ struct rq {
#ifdef CONFIG_IRQ_TIME_ACCOUNTING
u64 prev_irq_time;
+ u64 psi_irq_time;
#endif
#ifdef CONFIG_PARAVIRT
u64 prev_steal_time;
@@ -1143,7 +1151,7 @@ struct rq {
call_single_data_t hrtick_csd;
#endif
struct hrtimer hrtick_timer;
- ktime_t hrtick_time;
+ ktime_t hrtick_time;
#endif
#ifdef CONFIG_SCHEDSTATS
@@ -1165,7 +1173,7 @@ struct rq {
#endif
#ifdef CONFIG_CPU_IDLE
- /* Must be inspected within a rcu lock section */
+ /* Must be inspected within a RCU lock section */
struct cpuidle_state *idle_state;
#endif
@@ -1227,7 +1235,7 @@ static inline int cpu_of(struct rq *rq)
#endif
}
-#define MDF_PUSH 0x01
+#define MDF_PUSH 0x01
static inline bool is_migration_disabled(struct task_struct *p)
{
@@ -1246,7 +1254,6 @@ DECLARE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues);
#define cpu_curr(cpu) (cpu_rq(cpu)->curr)
#define raw_rq() raw_cpu_ptr(&runqueues)
-struct sched_group;
#ifdef CONFIG_SCHED_CORE
static inline struct cpumask *sched_group_span(struct sched_group *sg);
@@ -1282,9 +1289,10 @@ static inline raw_spinlock_t *__rq_lockp(struct rq *rq)
return &rq->__lock;
}
-bool cfs_prio_less(const struct task_struct *a, const struct task_struct *b,
- bool fi);
-void task_vruntime_update(struct rq *rq, struct task_struct *p, bool in_fi);
+extern bool
+cfs_prio_less(const struct task_struct *a, const struct task_struct *b, bool fi);
+
+extern void task_vruntime_update(struct rq *rq, struct task_struct *p, bool in_fi);
/*
* Helpers to check if the CPU's core cookie matches with the task's cookie
@@ -1352,7 +1360,7 @@ extern void sched_core_dequeue(struct rq *rq, struct task_struct *p, int flags);
extern void sched_core_get(void);
extern void sched_core_put(void);
-#else /* !CONFIG_SCHED_CORE */
+#else /* !CONFIG_SCHED_CORE: */
static inline bool sched_core_enabled(struct rq *rq)
{
@@ -1390,7 +1398,8 @@ static inline bool sched_group_cookie_match(struct rq *rq,
{
return true;
}
-#endif /* CONFIG_SCHED_CORE */
+
+#endif /* !CONFIG_SCHED_CORE */
static inline void lockdep_assert_rq_held(struct rq *rq)
{
@@ -1421,8 +1430,10 @@ static inline void raw_spin_rq_unlock_irq(struct rq *rq)
static inline unsigned long _raw_spin_rq_lock_irqsave(struct rq *rq)
{
unsigned long flags;
+
local_irq_save(flags);
raw_spin_rq_lock(rq);
+
return flags;
}
@@ -1451,6 +1462,7 @@ static inline void update_idle_core(struct rq *rq) { }
#endif
#ifdef CONFIG_FAIR_GROUP_SCHED
+
static inline struct task_struct *task_of(struct sched_entity *se)
{
SCHED_WARN_ON(!entity_is_task(se));
@@ -1474,9 +1486,9 @@ static inline struct cfs_rq *group_cfs_rq(struct sched_entity *grp)
return grp->my_q;
}
-#else
+#else /* !CONFIG_FAIR_GROUP_SCHED: */
-#define task_of(_se) container_of(_se, struct task_struct, se)
+#define task_of(_se) container_of(_se, struct task_struct, se)
static inline struct cfs_rq *task_cfs_rq(const struct task_struct *p)
{
@@ -1496,7 +1508,8 @@ static inline struct cfs_rq *group_cfs_rq(struct sched_entity *grp)
{
return NULL;
}
-#endif
+
+#endif /* !CONFIG_FAIR_GROUP_SCHED */
extern void update_rq_clock(struct rq *rq);
@@ -1622,9 +1635,9 @@ static inline void rq_pin_lock(struct rq *rq, struct rq_flags *rf)
#ifdef CONFIG_SCHED_DEBUG
rq->clock_update_flags &= (RQCF_REQ_SKIP|RQCF_ACT_SKIP);
rf->clock_update_flags = 0;
-#ifdef CONFIG_SMP
+# ifdef CONFIG_SMP
SCHED_WARN_ON(rq->balance_callback && rq->balance_callback != &balance_push_callback);
-#endif
+# endif
#endif
}
@@ -1650,9 +1663,11 @@ static inline void rq_repin_lock(struct rq *rq, struct rq_flags *rf)
#endif
}
+extern
struct rq *__task_rq_lock(struct task_struct *p, struct rq_flags *rf)
__acquires(rq->lock);
+extern
struct rq *task_rq_lock(struct task_struct *p, struct rq_flags *rf)
__acquires(p->pi_lock)
__acquires(rq->lock);
@@ -1679,48 +1694,42 @@ DEFINE_LOCK_GUARD_1(task_rq_lock, struct task_struct,
task_rq_unlock(_T->rq, _T->lock, &_T->rf),
struct rq *rq; struct rq_flags rf)
-static inline void
-rq_lock_irqsave(struct rq *rq, struct rq_flags *rf)
+static inline void rq_lock_irqsave(struct rq *rq, struct rq_flags *rf)
__acquires(rq->lock)
{
raw_spin_rq_lock_irqsave(rq, rf->flags);
rq_pin_lock(rq, rf);
}
-static inline void
-rq_lock_irq(struct rq *rq, struct rq_flags *rf)
+static inline void rq_lock_irq(struct rq *rq, struct rq_flags *rf)
__acquires(rq->lock)
{
raw_spin_rq_lock_irq(rq);
rq_pin_lock(rq, rf);
}
-static inline void
-rq_lock(struct rq *rq, struct rq_flags *rf)
+static inline void rq_lock(struct rq *rq, struct rq_flags *rf)
__acquires(rq->lock)
{
raw_spin_rq_lock(rq);
rq_pin_lock(rq, rf);
}
-static inline void
-rq_unlock_irqrestore(struct rq *rq, struct rq_flags *rf)
+static inline void rq_unlock_irqrestore(struct rq *rq, struct rq_flags *rf)
__releases(rq->lock)
{
rq_unpin_lock(rq, rf);
raw_spin_rq_unlock_irqrestore(rq, rf->flags);
}
-static inline void
-rq_unlock_irq(struct rq *rq, struct rq_flags *rf)
+static inline void rq_unlock_irq(struct rq *rq, struct rq_flags *rf)
__releases(rq->lock)
{
rq_unpin_lock(rq, rf);
raw_spin_rq_unlock_irq(rq);
}
-static inline void
-rq_unlock(struct rq *rq, struct rq_flags *rf)
+static inline void rq_unlock(struct rq *rq, struct rq_flags *rf)
__releases(rq->lock)
{
rq_unpin_lock(rq, rf);
@@ -1742,8 +1751,7 @@ DEFINE_LOCK_GUARD_1(rq_lock_irqsave, struct rq,
rq_unlock_irqrestore(_T->lock, &_T->rf),
struct rq_flags rf)
-static inline struct rq *
-this_rq_lock_irq(struct rq_flags *rf)
+static inline struct rq *this_rq_lock_irq(struct rq_flags *rf)
__acquires(rq->lock)
{
struct rq *rq;
@@ -1751,15 +1759,18 @@ this_rq_lock_irq(struct rq_flags *rf)
local_irq_disable();
rq = this_rq();
rq_lock(rq, rf);
+
return rq;
}
#ifdef CONFIG_NUMA
+
enum numa_topology_type {
NUMA_DIRECT,
NUMA_GLUELESS_MESH,
NUMA_BACKPLANE,
};
+
extern enum numa_topology_type sched_numa_topology_type;
extern int sched_max_numa_distance;
extern bool find_numa_distance(int distance);
@@ -1768,18 +1779,23 @@ extern void sched_update_numa(int cpu, bool online);
extern void sched_domains_numa_masks_set(unsigned int cpu);
extern void sched_domains_numa_masks_clear(unsigned int cpu);
extern int sched_numa_find_closest(const struct cpumask *cpus, int cpu);
-#else
+
+#else /* !CONFIG_NUMA: */
+
static inline void sched_init_numa(int offline_node) { }
static inline void sched_update_numa(int cpu, bool online) { }
static inline void sched_domains_numa_masks_set(unsigned int cpu) { }
static inline void sched_domains_numa_masks_clear(unsigned int cpu) { }
+
static inline int sched_numa_find_closest(const struct cpumask *cpus, int cpu)
{
return nr_cpu_ids;
}
-#endif
+
+#endif /* !CONFIG_NUMA */
#ifdef CONFIG_NUMA_BALANCING
+
/* The regions in numa_faults array from task_struct */
enum numa_faults_stats {
NUMA_MEM = 0,
@@ -1787,17 +1803,21 @@ enum numa_faults_stats {
NUMA_MEMBUF,
NUMA_CPUBUF
};
+
extern void sched_setnuma(struct task_struct *p, int node);
extern int migrate_task_to(struct task_struct *p, int cpu);
extern int migrate_swap(struct task_struct *p, struct task_struct *t,
int cpu, int scpu);
extern void init_numa_balancing(unsigned long clone_flags, struct task_struct *p);
-#else
+
+#else /* !CONFIG_NUMA_BALANCING: */
+
static inline void
init_numa_balancing(unsigned long clone_flags, struct task_struct *p)
{
}
-#endif /* CONFIG_NUMA_BALANCING */
+
+#endif /* !CONFIG_NUMA_BALANCING */
#ifdef CONFIG_SMP
@@ -1822,8 +1842,7 @@ queue_balance_callback(struct rq *rq,
}
#define rcu_dereference_check_sched_domain(p) \
- rcu_dereference_check((p), \
- lockdep_is_held(&sched_domains_mutex))
+ rcu_dereference_check((p), lockdep_is_held(&sched_domains_mutex))
/*
* The domain tree (rq->sd) is protected by RCU's quiescent state transition.
@@ -1894,6 +1913,7 @@ DECLARE_PER_CPU(struct sched_domain_shared __rcu *, sd_llc_shared);
DECLARE_PER_CPU(struct sched_domain __rcu *, sd_numa);
DECLARE_PER_CPU(struct sched_domain __rcu *, sd_asym_packing);
DECLARE_PER_CPU(struct sched_domain __rcu *, sd_asym_cpucapacity);
+
extern struct static_key_false sched_asym_cpucapacity;
extern struct static_key_false sched_cluster_active;
@@ -1957,15 +1977,11 @@ static inline struct cpumask *group_balance_mask(struct sched_group *sg)
extern int group_balance_cpu(struct sched_group *sg);
#ifdef CONFIG_SCHED_DEBUG
-void update_sched_domain_debugfs(void);
-void dirty_sched_domain_sysctl(int cpu);
+extern void update_sched_domain_debugfs(void);
+extern void dirty_sched_domain_sysctl(int cpu);
#else
-static inline void update_sched_domain_debugfs(void)
-{
-}
-static inline void dirty_sched_domain_sysctl(int cpu)
-{
-}
+static inline void update_sched_domain_debugfs(void) { }
+static inline void dirty_sched_domain_sysctl(int cpu) { }
#endif
extern int sched_update_scaling(void);
@@ -1976,6 +1992,7 @@ static inline const struct cpumask *task_user_cpus(struct task_struct *p)
return cpu_possible_mask; /* &init_task.cpus_mask */
return p->user_cpus_ptr;
}
+
#endif /* CONFIG_SMP */
#include "stats.h"
@@ -1998,13 +2015,13 @@ static inline void sched_core_tick(struct rq *rq)
__sched_core_tick(rq);
}
-#else
+#else /* !(CONFIG_SCHED_CORE && CONFIG_SCHEDSTATS): */
-static inline void sched_core_account_forceidle(struct rq *rq) {}
+static inline void sched_core_account_forceidle(struct rq *rq) { }
-static inline void sched_core_tick(struct rq *rq) {}
+static inline void sched_core_tick(struct rq *rq) { }
-#endif /* CONFIG_SCHED_CORE && CONFIG_SCHEDSTATS */
+#endif /* !(CONFIG_SCHED_CORE && CONFIG_SCHEDSTATS) */
#ifdef CONFIG_CGROUP_SCHED
@@ -2046,15 +2063,16 @@ static inline void set_task_rq(struct task_struct *p, unsigned int cpu)
#endif
}
-#else /* CONFIG_CGROUP_SCHED */
+#else /* !CONFIG_CGROUP_SCHED: */
static inline void set_task_rq(struct task_struct *p, unsigned int cpu) { }
+
static inline struct task_group *task_group(struct task_struct *p)
{
return NULL;
}
-#endif /* CONFIG_CGROUP_SCHED */
+#endif /* !CONFIG_CGROUP_SCHED */
static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu)
{
@@ -2099,6 +2117,7 @@ enum {
extern const_debug unsigned int sysctl_sched_features;
#ifdef CONFIG_JUMP_LABEL
+
#define SCHED_FEAT(name, enabled) \
static __always_inline bool static_branch_##name(struct static_key *key) \
{ \
@@ -2111,13 +2130,13 @@ static __always_inline bool static_branch_##name(struct static_key *key) \
extern struct static_key sched_feat_keys[__SCHED_FEAT_NR];
#define sched_feat(x) (static_branch_##x(&sched_feat_keys[__SCHED_FEAT_##x]))
-#else /* !CONFIG_JUMP_LABEL */
+#else /* !CONFIG_JUMP_LABEL: */
#define sched_feat(x) (sysctl_sched_features & (1UL << __SCHED_FEAT_##x))
-#endif /* CONFIG_JUMP_LABEL */
+#endif /* !CONFIG_JUMP_LABEL */
-#else /* !SCHED_DEBUG */
+#else /* !SCHED_DEBUG: */
/*
* Each translation unit has its own copy of sysctl_sched_features to allow
@@ -2133,7 +2152,7 @@ static const_debug __maybe_unused unsigned int sysctl_sched_features =
#define sched_feat(x) !!(sysctl_sched_features & (1UL << __SCHED_FEAT_##x))
-#endif /* SCHED_DEBUG */
+#endif /* !SCHED_DEBUG */
extern struct static_key_false sched_numa_balancing;
extern struct static_key_false sched_schedstats;
@@ -2176,13 +2195,13 @@ static inline int task_on_rq_migrating(struct task_struct *p)
}
/* Wake flags. The first three directly map to some SD flag value */
-#define WF_EXEC 0x02 /* Wakeup after exec; maps to SD_BALANCE_EXEC */
-#define WF_FORK 0x04 /* Wakeup after fork; maps to SD_BALANCE_FORK */
-#define WF_TTWU 0x08 /* Wakeup; maps to SD_BALANCE_WAKE */
+#define WF_EXEC 0x02 /* Wakeup after exec; maps to SD_BALANCE_EXEC */
+#define WF_FORK 0x04 /* Wakeup after fork; maps to SD_BALANCE_FORK */
+#define WF_TTWU 0x08 /* Wakeup; maps to SD_BALANCE_WAKE */
-#define WF_SYNC 0x10 /* Waker goes to sleep after wakeup */
-#define WF_MIGRATED 0x20 /* Internal use, task got migrated */
-#define WF_CURRENT_CPU 0x40 /* Prefer to move the wakee to the current CPU. */
+#define WF_SYNC 0x10 /* Waker goes to sleep after wakeup */
+#define WF_MIGRATED 0x20 /* Internal use, task got migrated */
+#define WF_CURRENT_CPU 0x40 /* Prefer to move the wakee to the current CPU. */
#ifdef CONFIG_SMP
static_assert(WF_EXEC == SD_BALANCE_EXEC);
@@ -2252,9 +2271,9 @@ extern const u32 sched_prio_to_wmult[40];
#define RETRY_TASK ((void *)-1UL)
struct affinity_context {
- const struct cpumask *new_mask;
- struct cpumask *user_mask;
- unsigned int flags;
+ const struct cpumask *new_mask;
+ struct cpumask *user_mask;
+ unsigned int flags;
};
extern s64 update_curr_common(struct rq *rq);
@@ -2402,8 +2421,19 @@ extern void update_group_capacity(struct sched_domain *sd, int cpu);
extern void sched_balance_trigger(struct rq *rq);
+extern int __set_cpus_allowed_ptr(struct task_struct *p, struct affinity_context *ctx);
extern void set_cpus_allowed_common(struct task_struct *p, struct affinity_context *ctx);
+static inline cpumask_t *alloc_user_cpus_ptr(int node)
+{
+ /*
+ * See do_set_cpus_allowed() above for the rcu_head usage.
+ */
+ int size = max_t(int, cpumask_size(), sizeof(struct rcu_head));
+
+ return kmalloc_node(size, GFP_KERNEL, node);
+}
+
static inline struct task_struct *get_push_task(struct rq *rq)
{
struct task_struct *p = rq->curr;
@@ -2425,9 +2455,23 @@ static inline struct task_struct *get_push_task(struct rq *rq)
extern int push_cpu_stop(void *arg);
-#endif
+#else /* !CONFIG_SMP: */
+
+static inline int __set_cpus_allowed_ptr(struct task_struct *p,
+ struct affinity_context *ctx)
+{
+ return set_cpus_allowed_ptr(p, ctx->new_mask);
+}
+
+static inline cpumask_t *alloc_user_cpus_ptr(int node)
+{
+ return NULL;
+}
+
+#endif /* !CONFIG_SMP */
#ifdef CONFIG_CPU_IDLE
+
static inline void idle_set_state(struct rq *rq,
struct cpuidle_state *idle_state)
{
@@ -2440,7 +2484,9 @@ static inline struct cpuidle_state *idle_get_state(struct rq *rq)
return rq->idle_state;
}
-#else
+
+#else /* !CONFIG_CPU_IDLE: */
+
static inline void idle_set_state(struct rq *rq,
struct cpuidle_state *idle_state)
{
@@ -2450,7 +2496,8 @@ static inline struct cpuidle_state *idle_get_state(struct rq *rq)
{
return NULL;
}
-#endif
+
+#endif /* !CONFIG_CPU_IDLE */
extern void schedule_idle(void);
asmlinkage void schedule_user(void);
@@ -2463,7 +2510,7 @@ extern void init_sched_dl_class(void);
extern void init_sched_rt_class(void);
extern void init_sched_fair_class(void);
-extern void reweight_task(struct task_struct *p, int prio);
+extern void reweight_task(struct task_struct *p, const struct load_weight *lw);
extern void resched_curr(struct rq *rq);
extern void resched_cpu(int cpu);
@@ -2479,7 +2526,8 @@ extern void init_dl_entity(struct sched_dl_entity *dl_se);
#define RATIO_SHIFT 8
#define MAX_BW_BITS (64 - BW_SHIFT)
#define MAX_BW ((1ULL << MAX_BW_BITS) - 1)
-unsigned long to_ratio(u64 period, u64 runtime);
+
+extern unsigned long to_ratio(u64 period, u64 runtime);
extern void init_entity_runnable_average(struct sched_entity *se);
extern void post_init_entity_util_avg(struct task_struct *p);
@@ -2505,10 +2553,10 @@ static inline void sched_update_tick_dependency(struct rq *rq)
else
tick_nohz_dep_set_cpu(cpu, TICK_DEP_BIT_SCHED);
}
-#else
+#else /* !CONFIG_NO_HZ_FULL: */
static inline int sched_tick_offload_init(void) { return 0; }
static inline void sched_update_tick_dependency(struct rq *rq) { }
-#endif
+#endif /* !CONFIG_NO_HZ_FULL */
static inline void add_nr_running(struct rq *rq, unsigned count)
{
@@ -2544,9 +2592,9 @@ extern void deactivate_task(struct rq *rq, struct task_struct *p, int flags);
extern void wakeup_preempt(struct rq *rq, struct task_struct *p, int flags);
#ifdef CONFIG_PREEMPT_RT
-#define SCHED_NR_MIGRATE_BREAK 8
+# define SCHED_NR_MIGRATE_BREAK 8
#else
-#define SCHED_NR_MIGRATE_BREAK 32
+# define SCHED_NR_MIGRATE_BREAK 32
#endif
extern const_debug unsigned int sysctl_sched_nr_migrate;
@@ -2595,9 +2643,9 @@ static inline int hrtick_enabled_dl(struct rq *rq)
return hrtick_enabled(rq);
}
-void hrtick_start(struct rq *rq, u64 delay);
+extern void hrtick_start(struct rq *rq, u64 delay);
-#else
+#else /* !CONFIG_SCHED_HRTICK: */
static inline int hrtick_enabled_fair(struct rq *rq)
{
@@ -2614,13 +2662,10 @@ static inline int hrtick_enabled(struct rq *rq)
return 0;
}
-#endif /* CONFIG_SCHED_HRTICK */
+#endif /* !CONFIG_SCHED_HRTICK */
#ifndef arch_scale_freq_tick
-static __always_inline
-void arch_scale_freq_tick(void)
-{
-}
+static __always_inline void arch_scale_freq_tick(void) { }
#endif
#ifndef arch_scale_freq_capacity
@@ -2657,13 +2702,13 @@ static inline void double_rq_clock_clear_update(struct rq *rq1, struct rq *rq2)
#endif
}
#else
-static inline void double_rq_clock_clear_update(struct rq *rq1, struct rq *rq2) {}
+static inline void double_rq_clock_clear_update(struct rq *rq1, struct rq *rq2) { }
#endif
-#define DEFINE_LOCK_GUARD_2(name, type, _lock, _unlock, ...) \
-__DEFINE_UNLOCK_GUARD(name, type, _unlock, type *lock2; __VA_ARGS__) \
-static inline class_##name##_t class_##name##_constructor(type *lock, type *lock2) \
-{ class_##name##_t _t = { .lock = lock, .lock2 = lock2 }, *_T = &_t; \
+#define DEFINE_LOCK_GUARD_2(name, type, _lock, _unlock, ...) \
+__DEFINE_UNLOCK_GUARD(name, type, _unlock, type *lock2; __VA_ARGS__) \
+static inline class_##name##_t class_##name##_constructor(type *lock, type *lock2) \
+{ class_##name##_t _t = { .lock = lock, .lock2 = lock2 }, *_T = &_t; \
_lock; return _t; }
#ifdef CONFIG_SMP
@@ -2717,7 +2762,7 @@ static inline int _double_lock_balance(struct rq *this_rq, struct rq *busiest)
return 1;
}
-#else
+#else /* !CONFIG_PREEMPTION: */
/*
* Unfair double_lock_balance: Optimizes throughput at the expense of
* latency by eliminating extra atomic operations when the locks are
@@ -2748,7 +2793,7 @@ static inline int _double_lock_balance(struct rq *this_rq, struct rq *busiest)
return 1;
}
-#endif /* CONFIG_PREEMPTION */
+#endif /* !CONFIG_PREEMPTION */
/*
* double_lock_balance - lock the busiest runqueue, this_rq is locked already.
@@ -2824,9 +2869,10 @@ static inline void double_rq_unlock(struct rq *rq1, struct rq *rq2)
extern void set_rq_online (struct rq *rq);
extern void set_rq_offline(struct rq *rq);
+
extern bool sched_smp_initialized;
-#else /* CONFIG_SMP */
+#else /* !CONFIG_SMP: */
/*
* double_rq_lock - safely lock two runqueues
@@ -2860,7 +2906,7 @@ static inline void double_rq_unlock(struct rq *rq1, struct rq *rq2)
__release(rq2->lock);
}
-#endif
+#endif /* !CONFIG_SMP */
DEFINE_LOCK_GUARD_2(double_rq_lock, struct rq,
double_rq_lock(_T->lock, _T->lock2),
@@ -2881,16 +2927,15 @@ extern void print_rt_rq(struct seq_file *m, int cpu, struct rt_rq *rt_rq);
extern void print_dl_rq(struct seq_file *m, int cpu, struct dl_rq *dl_rq);
extern void resched_latency_warn(int cpu, u64 latency);
-#ifdef CONFIG_NUMA_BALANCING
-extern void
-show_numa_stats(struct task_struct *p, struct seq_file *m);
+# ifdef CONFIG_NUMA_BALANCING
+extern void show_numa_stats(struct task_struct *p, struct seq_file *m);
extern void
print_numa_stats(struct seq_file *m, int node, unsigned long tsf,
- unsigned long tpf, unsigned long gsf, unsigned long gpf);
-#endif /* CONFIG_NUMA_BALANCING */
-#else
-static inline void resched_latency_warn(int cpu, u64 latency) {}
-#endif /* CONFIG_SCHED_DEBUG */
+ unsigned long tpf, unsigned long gsf, unsigned long gpf);
+# endif /* CONFIG_NUMA_BALANCING */
+#else /* !CONFIG_SCHED_DEBUG: */
+static inline void resched_latency_warn(int cpu, u64 latency) { }
+#endif /* !CONFIG_SCHED_DEBUG */
extern void init_cfs_rq(struct cfs_rq *cfs_rq);
extern void init_rt_rq(struct rt_rq *rt_rq);
@@ -2900,6 +2945,7 @@ extern void cfs_bandwidth_usage_inc(void);
extern void cfs_bandwidth_usage_dec(void);
#ifdef CONFIG_NO_HZ_COMMON
+
#define NOHZ_BALANCE_KICK_BIT 0
#define NOHZ_STATS_KICK_BIT 1
#define NOHZ_NEWILB_KICK_BIT 2
@@ -2914,14 +2960,14 @@ extern void cfs_bandwidth_usage_dec(void);
/* Update nohz.next_balance */
#define NOHZ_NEXT_KICK BIT(NOHZ_NEXT_KICK_BIT)
-#define NOHZ_KICK_MASK (NOHZ_BALANCE_KICK | NOHZ_STATS_KICK | NOHZ_NEXT_KICK)
+#define NOHZ_KICK_MASK (NOHZ_BALANCE_KICK | NOHZ_STATS_KICK | NOHZ_NEXT_KICK)
-#define nohz_flags(cpu) (&cpu_rq(cpu)->nohz_flags)
+#define nohz_flags(cpu) (&cpu_rq(cpu)->nohz_flags)
extern void nohz_balance_exit_idle(struct rq *rq);
-#else
+#else /* !CONFIG_NO_HZ_COMMON: */
static inline void nohz_balance_exit_idle(struct rq *rq) { }
-#endif
+#endif /* !CONFIG_NO_HZ_COMMON */
#if defined(CONFIG_SMP) && defined(CONFIG_NO_HZ_COMMON)
extern void nohz_run_idle_balance(int cpu);
@@ -2930,6 +2976,7 @@ static inline void nohz_run_idle_balance(int cpu) { }
#endif
#ifdef CONFIG_IRQ_TIME_ACCOUNTING
+
struct irqtime {
u64 total;
u64 tick_delta;
@@ -2957,9 +3004,11 @@ static inline u64 irq_time_read(int cpu)
return total;
}
+
#endif /* CONFIG_IRQ_TIME_ACCOUNTING */
#ifdef CONFIG_CPU_FREQ
+
DECLARE_PER_CPU(struct update_util_data __rcu *, cpufreq_update_util_data);
/**
@@ -2993,9 +3042,9 @@ static inline void cpufreq_update_util(struct rq *rq, unsigned int flags)
if (data)
data->func(data, rq_clock(rq), flags);
}
-#else
-static inline void cpufreq_update_util(struct rq *rq, unsigned int flags) {}
-#endif /* CONFIG_CPU_FREQ */
+#else /* !CONFIG_CPU_FREQ: */
+static inline void cpufreq_update_util(struct rq *rq, unsigned int flags) { }
+#endif /* !CONFIG_CPU_FREQ */
#ifdef arch_scale_freq_capacity
# ifndef arch_scale_freq_invariant
@@ -3006,6 +3055,7 @@ static inline void cpufreq_update_util(struct rq *rq, unsigned int flags) {}
#endif
#ifdef CONFIG_SMP
+
unsigned long effective_cpu_util(int cpu, unsigned long util_cfs,
unsigned long *min,
unsigned long *max);
@@ -3048,9 +3098,11 @@ static inline unsigned long cpu_util_rt(struct rq *rq)
{
return READ_ONCE(rq->avg_rt.util_avg);
}
-#endif
+
+#endif /* CONFIG_SMP */
#ifdef CONFIG_UCLAMP_TASK
+
unsigned long uclamp_eff_value(struct task_struct *p, enum uclamp_id clamp_id);
static inline unsigned long uclamp_rq_get(struct rq *rq,
@@ -3097,9 +3149,40 @@ static inline bool uclamp_is_used(void)
{
return static_branch_likely(&sched_uclamp_used);
}
-#else /* CONFIG_UCLAMP_TASK */
-static inline unsigned long uclamp_eff_value(struct task_struct *p,
- enum uclamp_id clamp_id)
+
+#define for_each_clamp_id(clamp_id) \
+ for ((clamp_id) = 0; (clamp_id) < UCLAMP_CNT; (clamp_id)++)
+
+extern unsigned int sysctl_sched_uclamp_util_min_rt_default;
+
+static inline unsigned int uclamp_none(enum uclamp_id clamp_id)
+{
+ if (clamp_id == UCLAMP_MIN)
+ return 0;
+ return SCHED_CAPACITY_SCALE;
+}
+
+/* Integer rounded range for each bucket */
+#define UCLAMP_BUCKET_DELTA DIV_ROUND_CLOSEST(SCHED_CAPACITY_SCALE, UCLAMP_BUCKETS)
+
+static inline unsigned int uclamp_bucket_id(unsigned int clamp_value)
+{
+ return min_t(unsigned int, clamp_value / UCLAMP_BUCKET_DELTA, UCLAMP_BUCKETS - 1);
+}
+
+static inline void
+uclamp_se_set(struct uclamp_se *uc_se, unsigned int value, bool user_defined)
+{
+ uc_se->value = value;
+ uc_se->bucket_id = uclamp_bucket_id(value);
+ uc_se->user_defined = user_defined;
+}
+
+#else /* !CONFIG_UCLAMP_TASK: */
+
+static inline unsigned long
+uclamp_eff_value(struct task_struct *p, enum uclamp_id clamp_id)
{
if (clamp_id == UCLAMP_MIN)
return 0;
@@ -3114,8 +3197,8 @@ static inline bool uclamp_is_used(void)
return false;
}
-static inline unsigned long uclamp_rq_get(struct rq *rq,
- enum uclamp_id clamp_id)
+static inline unsigned long
+uclamp_rq_get(struct rq *rq, enum uclamp_id clamp_id)
{
if (clamp_id == UCLAMP_MIN)
return 0;
@@ -3123,8 +3206,8 @@ static inline unsigned long uclamp_rq_get(struct rq *rq,
return SCHED_CAPACITY_SCALE;
}
-static inline void uclamp_rq_set(struct rq *rq, enum uclamp_id clamp_id,
- unsigned int value)
+static inline void
+uclamp_rq_set(struct rq *rq, enum uclamp_id clamp_id, unsigned int value)
{
}
@@ -3132,9 +3215,11 @@ static inline bool uclamp_rq_is_idle(struct rq *rq)
{
return false;
}
-#endif /* CONFIG_UCLAMP_TASK */
+
+#endif /* !CONFIG_UCLAMP_TASK */
#ifdef CONFIG_HAVE_SCHED_AVG_IRQ
+
static inline unsigned long cpu_util_irq(struct rq *rq)
{
return READ_ONCE(rq->avg_irq.util_avg);
@@ -3149,7 +3234,9 @@ unsigned long scale_irq_capacity(unsigned long util, unsigned long irq, unsigned
return util;
}
-#else
+
+#else /* !CONFIG_HAVE_SCHED_AVG_IRQ: */
+
static inline unsigned long cpu_util_irq(struct rq *rq)
{
return 0;
@@ -3160,7 +3247,8 @@ unsigned long scale_irq_capacity(unsigned long util, unsigned long irq, unsigned
{
return util;
}
-#endif
+
+#endif /* !CONFIG_HAVE_SCHED_AVG_IRQ */
#if defined(CONFIG_ENERGY_MODEL) && defined(CONFIG_CPU_FREQ_GOV_SCHEDUTIL)
@@ -3178,11 +3266,13 @@ extern struct cpufreq_governor schedutil_gov;
#else /* ! (CONFIG_ENERGY_MODEL && CONFIG_CPU_FREQ_GOV_SCHEDUTIL) */
#define perf_domain_span(pd) NULL
+
static inline bool sched_energy_enabled(void) { return false; }
#endif /* CONFIG_ENERGY_MODEL && CONFIG_CPU_FREQ_GOV_SCHEDUTIL */
#ifdef CONFIG_MEMBARRIER
+
/*
* The scheduler provides memory barriers required by membarrier between:
* - prior user-space memory accesses and store to rq->membarrier_state,
@@ -3204,13 +3294,16 @@ static inline void membarrier_switch_mm(struct rq *rq,
WRITE_ONCE(rq->membarrier_state, membarrier_state);
}
-#else
+
+#else /* !CONFIG_MEMBARRIER: */
+
static inline void membarrier_switch_mm(struct rq *rq,
struct mm_struct *prev_mm,
struct mm_struct *next_mm)
{
}
-#endif
+
+#endif /* !CONFIG_MEMBARRIER */
#ifdef CONFIG_SMP
static inline bool is_per_cpu_kthread(struct task_struct *p)
@@ -3262,7 +3355,7 @@ static inline void __mm_cid_put(struct mm_struct *mm, int cid)
* be held to transition to other states.
*
* State transitions synchronized with cmpxchg or try_cmpxchg need to be
- * consistent across cpus, which prevents use of this_cpu_cmpxchg.
+ * consistent across CPUs, which prevents use of this_cpu_cmpxchg.
*/
static inline void mm_cid_put_lazy(struct task_struct *t)
{
@@ -3329,6 +3422,7 @@ static inline int __mm_cid_try_get(struct mm_struct *mm)
}
if (cpumask_test_and_set_cpu(cid, cpumask))
return -1;
+
return cid;
}
@@ -3393,6 +3487,7 @@ unlock:
raw_spin_unlock(&cid_lock);
end:
mm_cid_snapshot_time(rq, mm);
+
return cid;
}
@@ -3415,6 +3510,7 @@ static inline int mm_cid_get(struct rq *rq, struct mm_struct *mm)
}
cid = __mm_cid_get(rq, mm);
__this_cpu_write(pcpu_cid->cid, cid);
+
return cid;
}
@@ -3469,15 +3565,68 @@ static inline void switch_mm_cid(struct rq *rq,
next->last_mm_cid = next->mm_cid = mm_cid_get(rq, next->mm);
}
-#else
+#else /* !CONFIG_SCHED_MM_CID: */
static inline void switch_mm_cid(struct rq *rq, struct task_struct *prev, struct task_struct *next) { }
static inline void sched_mm_cid_migrate_from(struct task_struct *t) { }
static inline void sched_mm_cid_migrate_to(struct rq *dst_rq, struct task_struct *t) { }
static inline void task_tick_mm_cid(struct rq *rq, struct task_struct *curr) { }
static inline void init_sched_mm_cid(struct task_struct *t) { }
-#endif
+#endif /* !CONFIG_SCHED_MM_CID */
extern u64 avg_vruntime(struct cfs_rq *cfs_rq);
extern int entity_eligible(struct cfs_rq *cfs_rq, struct sched_entity *se);
+#ifdef CONFIG_RT_MUTEXES
+
+static inline int __rt_effective_prio(struct task_struct *pi_task, int prio)
+{
+ if (pi_task)
+ prio = min(prio, pi_task->prio);
+
+ return prio;
+}
+
+static inline int rt_effective_prio(struct task_struct *p, int prio)
+{
+ struct task_struct *pi_task = rt_mutex_get_top_task(p);
+
+ return __rt_effective_prio(pi_task, prio);
+}
+
+#else /* !CONFIG_RT_MUTEXES: */
+
+static inline int rt_effective_prio(struct task_struct *p, int prio)
+{
+ return prio;
+}
+
+#endif /* !CONFIG_RT_MUTEXES */
+
+extern int __sched_setscheduler(struct task_struct *p, const struct sched_attr *attr, bool user, bool pi);
+extern int __sched_setaffinity(struct task_struct *p, struct affinity_context *ctx);
+extern void __setscheduler_prio(struct task_struct *p, int prio);
+extern void set_load_weight(struct task_struct *p, bool update_load);
+extern void enqueue_task(struct rq *rq, struct task_struct *p, int flags);
+extern void dequeue_task(struct rq *rq, struct task_struct *p, int flags);
+
+extern void check_class_changed(struct rq *rq, struct task_struct *p,
+ const struct sched_class *prev_class,
+ int oldprio);
+
+#ifdef CONFIG_SMP
+extern struct balance_callback *splice_balance_callbacks(struct rq *rq);
+extern void balance_callbacks(struct rq *rq, struct balance_callback *head);
+#else
+
+static inline struct balance_callback *splice_balance_callbacks(struct rq *rq)
+{
+ return NULL;
+}
+
+static inline void balance_callbacks(struct rq *rq, struct balance_callback *head)
+{
+}
+
+#endif
+
#endif /* _KERNEL_SCHED_SCHED_H */
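
The rt_effective_prio() helpers added to the header above implement priority-inheritance boosting: a task's effective priority is the numerically smaller (i.e. higher) of its own prio and its top PI waiter's prio. A minimal stand-alone illustration (task structure reduced to just a prio field; priority values follow the usual kernel conventions):

#include <stdio.h>

struct task { int prio; };

/* Lower numeric value means higher scheduling priority. */
static int rt_effective_prio_model(const struct task *p, const struct task *top_pi_waiter)
{
	int prio = p->prio;

	if (top_pi_waiter && top_pi_waiter->prio < prio)
		prio = top_pi_waiter->prio;     /* boost to the waiter's priority */

	return prio;
}

int main(void)
{
	struct task owner  = { .prio = 120 };   /* normal task, nice 0 */
	struct task waiter = { .prio = 49 };    /* RT task, rt_priority 50 */

	printf("%d\n", rt_effective_prio_model(&owner, NULL));     /* 120: no boost */
	printf("%d\n", rt_effective_prio_model(&owner, &waiter));  /* 49: PI-boosted */
	return 0;
}
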
diff --git a/kernel/sched/stats.h b/kernel/sched/stats.h
index 38f3698f5e5b..237780aa3c53 100644
--- a/kernel/sched/stats.h
+++ b/kernel/sched/stats.h
@@ -110,8 +110,12 @@ __schedstats_from_se(struct sched_entity *se)
void psi_task_change(struct task_struct *task, int clear, int set);
void psi_task_switch(struct task_struct *prev, struct task_struct *next,
bool sleep);
-void psi_account_irqtime(struct task_struct *task, u32 delta);
-
+#ifdef CONFIG_IRQ_TIME_ACCOUNTING
+void psi_account_irqtime(struct rq *rq, struct task_struct *curr, struct task_struct *prev);
+#else
+static inline void psi_account_irqtime(struct rq *rq, struct task_struct *curr,
+ struct task_struct *prev) {}
+#endif /* CONFIG_IRQ_TIME_ACCOUNTING */
/*
* PSI tracks state that persists across sleeps, such as iowaits and
* memory stalls. As a result, it has to distinguish between sleeps,
@@ -192,7 +196,8 @@ static inline void psi_ttwu_dequeue(struct task_struct *p) {}
static inline void psi_sched_switch(struct task_struct *prev,
struct task_struct *next,
bool sleep) {}
-static inline void psi_account_irqtime(struct task_struct *task, u32 delta) {}
+static inline void psi_account_irqtime(struct rq *rq, struct task_struct *curr,
+ struct task_struct *prev) {}
#endif /* CONFIG_PSI */
#ifdef CONFIG_SCHED_INFO
@@ -219,7 +224,7 @@ static inline void sched_info_dequeue(struct rq *rq, struct task_struct *t)
/*
* Called when a task finally hits the CPU. We can now calculate how
* long it was waiting to run. We also note when it began so that we
- * can keep stats on how long its timeslice is.
+ * can keep stats on how long its time-slice is.
*/
static void sched_info_arrive(struct rq *rq, struct task_struct *t)
{
diff --git a/kernel/sched/syscalls.c b/kernel/sched/syscalls.c
new file mode 100644
index 000000000000..ae1b42775ef9
--- /dev/null
+++ b/kernel/sched/syscalls.c
@@ -0,0 +1,1699 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * kernel/sched/syscalls.c
+ *
+ * Core kernel scheduler syscalls related code
+ *
+ * Copyright (C) 1991-2002 Linus Torvalds
+ * Copyright (C) 1998-2024 Ingo Molnar, Red Hat
+ */
+#include <linux/sched.h>
+#include <linux/cpuset.h>
+#include <linux/sched/debug.h>
+
+#include <uapi/linux/sched/types.h>
+
+#include "sched.h"
+#include "autogroup.h"
+
+static inline int __normal_prio(int policy, int rt_prio, int nice)
+{
+ int prio;
+
+ if (dl_policy(policy))
+ prio = MAX_DL_PRIO - 1;
+ else if (rt_policy(policy))
+ prio = MAX_RT_PRIO - 1 - rt_prio;
+ else
+ prio = NICE_TO_PRIO(nice);
+
+ return prio;
+}
+
+/*
+ * Calculate the expected normal priority: i.e. priority
+ * without taking RT-inheritance into account. Might be
+ * boosted by interactivity modifiers. Changes upon fork,
+ * setprio syscalls, and whenever the interactivity
+ * estimator recalculates.
+ */
+static inline int normal_prio(struct task_struct *p)
+{
+ return __normal_prio(p->policy, p->rt_priority, PRIO_TO_NICE(p->static_prio));
+}
+
+/*
+ * Calculate the current priority, i.e. the priority
+ * taken into account by the scheduler. This value might
+ * be boosted by RT tasks, or might be boosted by
+ * interactivity modifiers. Will be RT if the task got
+ * RT-boosted. If not then it returns p->normal_prio.
+ */
+static int effective_prio(struct task_struct *p)
+{
+ p->normal_prio = normal_prio(p);
+ /*
+ * If we are RT tasks or we were boosted to RT priority,
+ * keep the priority unchanged. Otherwise, update priority
+ * to the normal priority:
+ */
+ if (!rt_prio(p->prio))
+ return p->normal_prio;
+ return p->prio;
+}
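
__normal_prio() above folds the three policy classes into one kernel priority scale: deadline tasks map to MAX_DL_PRIO - 1 (i.e. -1), real-time tasks to MAX_RT_PRIO - 1 - rt_prio, and fair tasks to NICE_TO_PRIO(nice). A hedged, self-contained restatement with the usual constant values (restated locally, not pulled from the kernel headers):

#include <stdio.h>

#define MAX_DL_PRIO   0
#define MAX_RT_PRIO   100
#define NICE_TO_PRIO(nice) (MAX_RT_PRIO + (nice) + 20)    /* nice -20..19 -> 100..139 */

enum policy { POL_NORMAL, POL_FIFO, POL_DEADLINE };

static int normal_prio_model(enum policy policy, int rt_prio, int nice)
{
	if (policy == POL_DEADLINE)
		return MAX_DL_PRIO - 1;             /* -1: above every RT priority */
	if (policy == POL_FIFO)
		return MAX_RT_PRIO - 1 - rt_prio;   /* rt_prio 1..99 -> 98..0 */
	return NICE_TO_PRIO(nice);                  /* nice 0 -> 120 */
}

int main(void)
{
	printf("%d\n", normal_prio_model(POL_DEADLINE, 0, 0));  /* -1 */
	printf("%d\n", normal_prio_model(POL_FIFO, 50, 0));     /* 49 */
	printf("%d\n", normal_prio_model(POL_NORMAL, 0, 0));    /* 120 */
	return 0;
}
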
+
+void set_user_nice(struct task_struct *p, long nice)
+{
+ bool queued, running;
+ struct rq *rq;
+ int old_prio;
+
+ if (task_nice(p) == nice || nice < MIN_NICE || nice > MAX_NICE)
+ return;
+ /*
+ * We have to be careful, if called from sys_setpriority(),
+ * the task might be in the middle of scheduling on another CPU.
+ */
+ CLASS(task_rq_lock, rq_guard)(p);
+ rq = rq_guard.rq;
+
+ update_rq_clock(rq);
+
+ /*
+ * The RT priorities are set via sched_setscheduler(), but we still
+ * allow the 'normal' nice value to be set - but as expected
+ * it won't have any effect on scheduling until the task is
+ * SCHED_DEADLINE, SCHED_FIFO or SCHED_RR:
+ */
+ if (task_has_dl_policy(p) || task_has_rt_policy(p)) {
+ p->static_prio = NICE_TO_PRIO(nice);
+ return;
+ }
+
+ queued = task_on_rq_queued(p);
+ running = task_current(rq, p);
+ if (queued)
+ dequeue_task(rq, p, DEQUEUE_SAVE | DEQUEUE_NOCLOCK);
+ if (running)
+ put_prev_task(rq, p);
+
+ p->static_prio = NICE_TO_PRIO(nice);
+ set_load_weight(p, true);
+ old_prio = p->prio;
+ p->prio = effective_prio(p);
+
+ if (queued)
+ enqueue_task(rq, p, ENQUEUE_RESTORE | ENQUEUE_NOCLOCK);
+ if (running)
+ set_next_task(rq, p);
+
+ /*
+ * If the task increased its priority or is running and
+ * lowered its priority, then reschedule its CPU:
+ */
+ p->sched_class->prio_changed(rq, p, old_prio);
+}
+EXPORT_SYMBOL(set_user_nice);
+
+/*
+ * is_nice_reduction - check if nice value is an actual reduction
+ *
+ * Similar to can_nice() but does not perform a capability check.
+ *
+ * @p: task
+ * @nice: nice value
+ */
+static bool is_nice_reduction(const struct task_struct *p, const int nice)
+{
+ /* Convert nice value [19,-20] to rlimit style value [1,40]: */
+ int nice_rlim = nice_to_rlimit(nice);
+
+ return (nice_rlim <= task_rlimit(p, RLIMIT_NICE));
+}
+
+/*
+ * can_nice - check if a task can reduce its nice value
+ * @p: task
+ * @nice: nice value
+ */
+int can_nice(const struct task_struct *p, const int nice)
+{
+ return is_nice_reduction(p, nice) || capable(CAP_SYS_NICE);
+}
+
+#ifdef __ARCH_WANT_SYS_NICE
+
+/*
+ * sys_nice - change the priority of the current process.
+ * @increment: priority increment
+ *
+ * sys_setpriority is a more generic, but much slower function that
+ * does similar things.
+ */
+SYSCALL_DEFINE1(nice, int, increment)
+{
+ long nice, retval;
+
+ /*
+ * Setpriority might change our priority at the same moment.
+ * We don't have to worry. Conceptually one call occurs first
+ * and we have a single winner.
+ */
+ increment = clamp(increment, -NICE_WIDTH, NICE_WIDTH);
+ nice = task_nice(current) + increment;
+
+ nice = clamp_val(nice, MIN_NICE, MAX_NICE);
+ if (increment < 0 && !can_nice(current, nice))
+ return -EPERM;
+
+ retval = security_task_setnice(current, nice);
+ if (retval)
+ return retval;
+
+ set_user_nice(current, nice);
+ return 0;
+}
+
+#endif
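
sys_nice() above clamps the requested increment to +/-NICE_WIDTH before adding it to the current nice value, clamps the result into [MIN_NICE, MAX_NICE], and only then applies the RLIMIT_NICE/CAP_SYS_NICE permission check for reductions. A worked model of just the clamping arithmetic, permissions ignored (constants restated for illustration):

#include <stdio.h>

#define MIN_NICE   (-20)
#define MAX_NICE     19
#define NICE_WIDTH   40

static long clamp_long(long v, long lo, long hi)
{
	return v < lo ? lo : (v > hi ? hi : v);
}

/* Returns the nice value the task would end up with. */
static long nice_after_increment(long current_nice, int increment)
{
	long inc = clamp_long(increment, -NICE_WIDTH, NICE_WIDTH);

	return clamp_long(current_nice + inc, MIN_NICE, MAX_NICE);
}

int main(void)
{
	printf("%ld\n", nice_after_increment(0, 100));   /* 19: increment and result both clamped */
	printf("%ld\n", nice_after_increment(10, -50));  /* -20: would need privilege in practice */
	printf("%ld\n", nice_after_increment(5, 3));     /* 8 */
	return 0;
}
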
+
+/**
+ * task_prio - return the priority value of a given task.
+ * @p: the task in question.
+ *
+ * Return: The priority value as seen by users in /proc.
+ *
+ * sched policy          return value    kernel prio    user prio/nice
+ *
+ * normal, batch, idle   [0 ... 39]      [100 ... 139]  0/[-20 ... 19]
+ * fifo, rr              [-2 ... -100]   [98 ... 0]     [1 ... 99]
+ * deadline              -101            -1             0
+ */
+int task_prio(const struct task_struct *p)
+{
+ return p->prio - MAX_RT_PRIO;
+}
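
task_prio() above converts the kernel-internal prio to the value user space sees (e.g. in /proc/<pid>/stat) by simply subtracting MAX_RT_PRIO, which yields the ranges in the table. A couple of worked values under the usual constants (restated locally):

#include <stdio.h>

#define MAX_RT_PRIO 100

static int task_prio_model(int kernel_prio)
{
	return kernel_prio - MAX_RT_PRIO;
}

int main(void)
{
	printf("%d\n", task_prio_model(120));  /*   20: SCHED_NORMAL, nice 0     */
	printf("%d\n", task_prio_model(49));   /*  -51: SCHED_FIFO, rt_priority 50 */
	printf("%d\n", task_prio_model(-1));   /* -101: SCHED_DEADLINE           */
	return 0;
}
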
+
+/**
+ * idle_cpu - is a given CPU idle currently?
+ * @cpu: the processor in question.
+ *
+ * Return: 1 if the CPU is currently idle. 0 otherwise.
+ */
+int idle_cpu(int cpu)
+{
+ struct rq *rq = cpu_rq(cpu);
+
+ if (rq->curr != rq->idle)
+ return 0;
+
+ if (rq->nr_running)
+ return 0;
+
+#ifdef CONFIG_SMP
+ if (rq->ttwu_pending)
+ return 0;
+#endif
+
+ return 1;
+}
+
+/**
+ * available_idle_cpu - is a given CPU idle for enqueuing work.
+ * @cpu: the CPU in question.
+ *
+ * Return: 1 if the CPU is currently idle. 0 otherwise.
+ */
+int available_idle_cpu(int cpu)
+{
+ if (!idle_cpu(cpu))
+ return 0;
+
+ if (vcpu_is_preempted(cpu))
+ return 0;
+
+ return 1;
+}
+
+/**
+ * idle_task - return the idle task for a given CPU.
+ * @cpu: the processor in question.
+ *
+ * Return: The idle task for the CPU @cpu.
+ */
+struct task_struct *idle_task(int cpu)
+{
+ return cpu_rq(cpu)->idle;
+}
+
+#ifdef CONFIG_SCHED_CORE
+int sched_core_idle_cpu(int cpu)
+{
+ struct rq *rq = cpu_rq(cpu);
+
+ if (sched_core_enabled(rq) && rq->curr == rq->idle)
+ return 1;
+
+ return idle_cpu(cpu);
+}
+
+#endif
+
+#ifdef CONFIG_SMP
+/*
+ * This function computes an effective utilization for the given CPU, to be
+ * used for frequency selection given the linear relation: f = u * f_max.
+ *
+ * The scheduler tracks the following metrics:
+ *
+ * cpu_util_{cfs,rt,dl,irq}()
+ * cpu_bw_dl()
+ *
+ * Where the cfs,rt and dl util numbers are tracked with the same metric and
+ * synchronized windows and are thus directly comparable.
+ *
+ * The cfs,rt,dl utilization are the running times measured with rq->clock_task
+ * which excludes things like IRQ and steal-time. These latter are then accrued
+ * in the IRQ utilization.
+ *
+ * The DL bandwidth number OTOH is not a measured metric but a value computed
+ * based on the task model parameters and gives the minimal utilization
+ * required to meet deadlines.
+ */
+unsigned long effective_cpu_util(int cpu, unsigned long util_cfs,
+ unsigned long *min,
+ unsigned long *max)
+{
+ unsigned long util, irq, scale;
+ struct rq *rq = cpu_rq(cpu);
+
+ scale = arch_scale_cpu_capacity(cpu);
+
+ /*
+ * Early check to see if IRQ/steal time saturates the CPU, can be
+ * because of inaccuracies in how we track these -- see
+ * update_irq_load_avg().
+ */
+ irq = cpu_util_irq(rq);
+ if (unlikely(irq >= scale)) {
+ if (min)
+ *min = scale;
+ if (max)
+ *max = scale;
+ return scale;
+ }
+
+ if (min) {
+ /*
+ * The minimum utilization returns the highest level between:
+ * - the computed DL bandwidth needed with the IRQ pressure which
+ * steals time from the deadline task.
+ * - The minimum performance requirement for CFS and/or RT.
+ */
+ *min = max(irq + cpu_bw_dl(rq), uclamp_rq_get(rq, UCLAMP_MIN));
+
+ /*
+ * When an RT task is runnable and uclamp is not used, we must
+ * ensure that the task will run at maximum compute capacity.
+ */
+ if (!uclamp_is_used() && rt_rq_is_runnable(&rq->rt))
+ *min = max(*min, scale);
+ }
+
+ /*
+ * Because the time spent on RT/DL tasks is visible as 'lost' time to
+ * CFS tasks and we use the same metric to track the effective
+ * utilization (PELT windows are synchronized) we can directly add them
+ * to obtain the CPU's actual utilization.
+ */
+ util = util_cfs + cpu_util_rt(rq);
+ util += cpu_util_dl(rq);
+
+ /*
+ * The maximum hint is a soft bandwidth requirement, which can be lower
+ * than the actual utilization because of uclamp_max requirements.
+ */
+ if (max)
+ *max = min(scale, uclamp_rq_get(rq, UCLAMP_MAX));
+
+ if (util >= scale)
+ return scale;
+
+ /*
+ * There is still idle time; further improve the number by using the
+ * IRQ metric. Because IRQ/steal time is hidden from the task clock we
+ * need to scale the task numbers:
+ *
+ *              max - irq
+ *   U' = irq + --------- * U
+ *                 max
+ */
+ util = scale_irq_capacity(util, irq, scale);
+ util += irq;
+
+ return min(scale, util);
+}
+
+unsigned long sched_cpu_util(int cpu)
+{
+ return effective_cpu_util(cpu, cpu_util_cfs(cpu), NULL, NULL);
+}
+#endif /* CONFIG_SMP */
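A minimal user-space sketch of the IRQ scaling step documented above, U' = irq + (max - irq) / max * U. The capacity and utilization numbers are made up for illustration and do not come from a real system; scale_with_irq() only mirrors the arithmetic of scale_irq_capacity().

#include <stdio.h>

/*
 * Shrink the task utilization by the share of capacity left over after
 * IRQ/steal time, then add the IRQ utilization back on top.
 */
static unsigned long scale_with_irq(unsigned long util, unsigned long irq,
				    unsigned long max)
{
	util *= (max - irq);
	util /= max;
	return util + irq;
}

int main(void)
{
	unsigned long scale = 1024;	/* stand-in for arch_scale_cpu_capacity() */
	unsigned long util  = 512;	/* CFS + RT + DL running utilization */
	unsigned long irq   = 128;	/* IRQ/steal-time utilization */

	printf("effective utilization: %lu of %lu\n",
	       scale_with_irq(util, irq, scale), scale);
	return 0;
}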
+
+/**
+ * find_process_by_pid - find a process with a matching PID value.
+ * @pid: the pid in question.
+ *
+ * The task of @pid, if found. %NULL otherwise.
+ */
+static struct task_struct *find_process_by_pid(pid_t pid)
+{
+ return pid ? find_task_by_vpid(pid) : current;
+}
+
+static struct task_struct *find_get_task(pid_t pid)
+{
+ struct task_struct *p;
+ guard(rcu)();
+
+ p = find_process_by_pid(pid);
+ if (likely(p))
+ get_task_struct(p);
+
+ return p;
+}
+
+DEFINE_CLASS(find_get_task, struct task_struct *, if (_T) put_task_struct(_T),
+ find_get_task(pid), pid_t pid)
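A hedged sketch of how the cleanup class above is meant to be used; example_op() is an illustrative name, not an existing kernel function. The reference taken by find_get_task() is dropped automatically when p goes out of scope, so every early return stays leak-free.

static int example_op(pid_t pid)
{
	CLASS(find_get_task, p)(pid);
	if (!p)
		return -ESRCH;

	/* put_task_struct() runs automatically on every return path. */
	return security_task_getscheduler(p);
}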
+
+/*
+ * sched_setparam() passes in -1 for its policy, to let the functions
+ * it calls know not to change it.
+ */
+#define SETPARAM_POLICY -1
+
+static void __setscheduler_params(struct task_struct *p,
+ const struct sched_attr *attr)
+{
+ int policy = attr->sched_policy;
+
+ if (policy == SETPARAM_POLICY)
+ policy = p->policy;
+
+ p->policy = policy;
+
+ if (dl_policy(policy))
+ __setparam_dl(p, attr);
+ else if (fair_policy(policy))
+ p->static_prio = NICE_TO_PRIO(attr->sched_nice);
+
+ /*
+ * __sched_setscheduler() ensures attr->sched_priority == 0 when
+ * !rt_policy. Always setting this ensures that things like
+ * getparam()/getattr() don't report silly values for !rt tasks.
+ */
+ p->rt_priority = attr->sched_priority;
+ p->normal_prio = normal_prio(p);
+ set_load_weight(p, true);
+}
+
+/*
+ * Check the target process has a UID that matches the current process's:
+ */
+static bool check_same_owner(struct task_struct *p)
+{
+ const struct cred *cred = current_cred(), *pcred;
+ guard(rcu)();
+
+ pcred = __task_cred(p);
+ return (uid_eq(cred->euid, pcred->euid) ||
+ uid_eq(cred->euid, pcred->uid));
+}
+
+#ifdef CONFIG_UCLAMP_TASK
+
+static int uclamp_validate(struct task_struct *p,
+ const struct sched_attr *attr)
+{
+ int util_min = p->uclamp_req[UCLAMP_MIN].value;
+ int util_max = p->uclamp_req[UCLAMP_MAX].value;
+
+ if (attr->sched_flags & SCHED_FLAG_UTIL_CLAMP_MIN) {
+ util_min = attr->sched_util_min;
+
+ if (util_min + 1 > SCHED_CAPACITY_SCALE + 1)
+ return -EINVAL;
+ }
+
+ if (attr->sched_flags & SCHED_FLAG_UTIL_CLAMP_MAX) {
+ util_max = attr->sched_util_max;
+
+ if (util_max + 1 > SCHED_CAPACITY_SCALE + 1)
+ return -EINVAL;
+ }
+
+ if (util_min != -1 && util_max != -1 && util_min > util_max)
+ return -EINVAL;
+
+ /*
+ * We have valid uclamp attributes; make sure uclamp is enabled.
+ *
+ * We need to do that here, because enabling static branches is a
+ * blocking operation which obviously cannot be done while holding
+ * scheduler locks.
+ */
+ static_branch_enable(&sched_uclamp_used);
+
+ return 0;
+}
+
+static bool uclamp_reset(const struct sched_attr *attr,
+ enum uclamp_id clamp_id,
+ struct uclamp_se *uc_se)
+{
+ /* Reset on sched class change for a non user-defined clamp value. */
+ if (likely(!(attr->sched_flags & SCHED_FLAG_UTIL_CLAMP)) &&
+ !uc_se->user_defined)
+ return true;
+
+ /* Reset on sched_util_{min,max} == -1. */
+ if (clamp_id == UCLAMP_MIN &&
+ attr->sched_flags & SCHED_FLAG_UTIL_CLAMP_MIN &&
+ attr->sched_util_min == -1) {
+ return true;
+ }
+
+ if (clamp_id == UCLAMP_MAX &&
+ attr->sched_flags & SCHED_FLAG_UTIL_CLAMP_MAX &&
+ attr->sched_util_max == -1) {
+ return true;
+ }
+
+ return false;
+}
+
+static void __setscheduler_uclamp(struct task_struct *p,
+ const struct sched_attr *attr)
+{
+ enum uclamp_id clamp_id;
+
+ for_each_clamp_id(clamp_id) {
+ struct uclamp_se *uc_se = &p->uclamp_req[clamp_id];
+ unsigned int value;
+
+ if (!uclamp_reset(attr, clamp_id, uc_se))
+ continue;
+
+ /*
+ * RT tasks by default have a 100% boost value that can be modified
+ * at runtime.
+ */
+ if (unlikely(rt_task(p) && clamp_id == UCLAMP_MIN))
+ value = sysctl_sched_uclamp_util_min_rt_default;
+ else
+ value = uclamp_none(clamp_id);
+
+ uclamp_se_set(uc_se, value, false);
+
+ }
+
+ if (likely(!(attr->sched_flags & SCHED_FLAG_UTIL_CLAMP)))
+ return;
+
+ if (attr->sched_flags & SCHED_FLAG_UTIL_CLAMP_MIN &&
+ attr->sched_util_min != -1) {
+ uclamp_se_set(&p->uclamp_req[UCLAMP_MIN],
+ attr->sched_util_min, true);
+ }
+
+ if (attr->sched_flags & SCHED_FLAG_UTIL_CLAMP_MAX &&
+ attr->sched_util_max != -1) {
+ uclamp_se_set(&p->uclamp_req[UCLAMP_MAX],
+ attr->sched_util_max, true);
+ }
+}
+
+#else /* !CONFIG_UCLAMP_TASK: */
+
+static inline int uclamp_validate(struct task_struct *p,
+ const struct sched_attr *attr)
+{
+ return -EOPNOTSUPP;
+}
+static void __setscheduler_uclamp(struct task_struct *p,
+ const struct sched_attr *attr) { }
+#endif
+
+/*
+ * Allow unprivileged RT tasks to decrease priority.
+ * Only issue a capable test if needed and only once to avoid an audit
+ * event on permitted non-privileged operations:
+ */
+static int user_check_sched_setscheduler(struct task_struct *p,
+ const struct sched_attr *attr,
+ int policy, int reset_on_fork)
+{
+ if (fair_policy(policy)) {
+ if (attr->sched_nice < task_nice(p) &&
+ !is_nice_reduction(p, attr->sched_nice))
+ goto req_priv;
+ }
+
+ if (rt_policy(policy)) {
+ unsigned long rlim_rtprio = task_rlimit(p, RLIMIT_RTPRIO);
+
+ /* Can't set/change the rt policy: */
+ if (policy != p->policy && !rlim_rtprio)
+ goto req_priv;
+
+ /* Can't increase priority: */
+ if (attr->sched_priority > p->rt_priority &&
+ attr->sched_priority > rlim_rtprio)
+ goto req_priv;
+ }
+
+ /*
+ * Can't set/change SCHED_DEADLINE policy at all for now
+ * (safest behavior); in the future we would like to allow
+ * unprivileged DL tasks to increase their relative deadline
+ * or reduce their runtime (both ways reducing utilization)
+ */
+ if (dl_policy(policy))
+ goto req_priv;
+
+ /*
+ * Treat SCHED_IDLE as nice 20. Only allow a switch to
+ * SCHED_NORMAL if the RLIMIT_NICE would normally permit it.
+ */
+ if (task_has_idle_policy(p) && !idle_policy(policy)) {
+ if (!is_nice_reduction(p, task_nice(p)))
+ goto req_priv;
+ }
+
+ /* Can't change other user's priorities: */
+ if (!check_same_owner(p))
+ goto req_priv;
+
+ /* Normal users shall not reset the sched_reset_on_fork flag: */
+ if (p->sched_reset_on_fork && !reset_on_fork)
+ goto req_priv;
+
+ return 0;
+
+req_priv:
+ if (!capable(CAP_SYS_NICE))
+ return -EPERM;
+
+ return 0;
+}
+
+int __sched_setscheduler(struct task_struct *p,
+ const struct sched_attr *attr,
+ bool user, bool pi)
+{
+ int oldpolicy = -1, policy = attr->sched_policy;
+ int retval, oldprio, newprio, queued, running;
+ const struct sched_class *prev_class;
+ struct balance_callback *head;
+ struct rq_flags rf;
+ int reset_on_fork;
+ int queue_flags = DEQUEUE_SAVE | DEQUEUE_MOVE | DEQUEUE_NOCLOCK;
+ struct rq *rq;
+ bool cpuset_locked = false;
+
+ /* The pi code expects interrupts enabled */
+ BUG_ON(pi && in_interrupt());
+recheck:
+ /* Double check policy once rq lock held: */
+ if (policy < 0) {
+ reset_on_fork = p->sched_reset_on_fork;
+ policy = oldpolicy = p->policy;
+ } else {
+ reset_on_fork = !!(attr->sched_flags & SCHED_FLAG_RESET_ON_FORK);
+
+ if (!valid_policy(policy))
+ return -EINVAL;
+ }
+
+ if (attr->sched_flags & ~(SCHED_FLAG_ALL | SCHED_FLAG_SUGOV))
+ return -EINVAL;
+
+ /*
+ * Valid priorities for SCHED_FIFO and SCHED_RR are
+ * 1..MAX_RT_PRIO-1, valid priority for SCHED_NORMAL,
+ * SCHED_BATCH and SCHED_IDLE is 0.
+ */
+ if (attr->sched_priority > MAX_RT_PRIO-1)
+ return -EINVAL;
+ if ((dl_policy(policy) && !__checkparam_dl(attr)) ||
+ (rt_policy(policy) != (attr->sched_priority != 0)))
+ return -EINVAL;
+
+ if (user) {
+ retval = user_check_sched_setscheduler(p, attr, policy, reset_on_fork);
+ if (retval)
+ return retval;
+
+ if (attr->sched_flags & SCHED_FLAG_SUGOV)
+ return -EINVAL;
+
+ retval = security_task_setscheduler(p);
+ if (retval)
+ return retval;
+ }
+
+ /* Update task specific "requested" clamps */
+ if (attr->sched_flags & SCHED_FLAG_UTIL_CLAMP) {
+ retval = uclamp_validate(p, attr);
+ if (retval)
+ return retval;
+ }
+
+ /*
+ * SCHED_DEADLINE bandwidth accounting relies on stable cpusets
+ * information.
+ */
+ if (dl_policy(policy) || dl_policy(p->policy)) {
+ cpuset_locked = true;
+ cpuset_lock();
+ }
+
+ /*
+ * Make sure no PI-waiters arrive (or leave) while we are
+ * changing the priority of the task:
+ *
+ * To be able to change p->policy safely, the appropriate
+ * runqueue lock must be held.
+ */
+ rq = task_rq_lock(p, &rf);
+ update_rq_clock(rq);
+
+ /*
+ * Changing the policy of the stop threads is a very bad idea:
+ */
+ if (p == rq->stop) {
+ retval = -EINVAL;
+ goto unlock;
+ }
+
+ /*
+ * If not changing anything there's no need to proceed further,
+ * but store a possible modification of reset_on_fork.
+ */
+ if (unlikely(policy == p->policy)) {
+ if (fair_policy(policy) && attr->sched_nice != task_nice(p))
+ goto change;
+ if (rt_policy(policy) && attr->sched_priority != p->rt_priority)
+ goto change;
+ if (dl_policy(policy) && dl_param_changed(p, attr))
+ goto change;
+ if (attr->sched_flags & SCHED_FLAG_UTIL_CLAMP)
+ goto change;
+
+ p->sched_reset_on_fork = reset_on_fork;
+ retval = 0;
+ goto unlock;
+ }
+change:
+
+ if (user) {
+#ifdef CONFIG_RT_GROUP_SCHED
+ /*
+ * Do not allow real-time tasks into groups that have no runtime
+ * assigned.
+ */
+ if (rt_bandwidth_enabled() && rt_policy(policy) &&
+ task_group(p)->rt_bandwidth.rt_runtime == 0 &&
+ !task_group_is_autogroup(task_group(p))) {
+ retval = -EPERM;
+ goto unlock;
+ }
+#endif
+#ifdef CONFIG_SMP
+ if (dl_bandwidth_enabled() && dl_policy(policy) &&
+ !(attr->sched_flags & SCHED_FLAG_SUGOV)) {
+ cpumask_t *span = rq->rd->span;
+
+ /*
+ * Don't allow tasks with an affinity mask smaller than
+ * the entire root_domain to become SCHED_DEADLINE. We
+ * will also fail if there's no bandwidth available.
+ */
+ if (!cpumask_subset(span, p->cpus_ptr) ||
+ rq->rd->dl_bw.bw == 0) {
+ retval = -EPERM;
+ goto unlock;
+ }
+ }
+#endif
+ }
+
+ /* Re-check policy now with rq lock held: */
+ if (unlikely(oldpolicy != -1 && oldpolicy != p->policy)) {
+ policy = oldpolicy = -1;
+ task_rq_unlock(rq, p, &rf);
+ if (cpuset_locked)
+ cpuset_unlock();
+ goto recheck;
+ }
+
+ /*
+ * If setscheduling to SCHED_DEADLINE (or changing the parameters
+ * of a SCHED_DEADLINE task) we need to check if enough bandwidth
+ * is available.
+ */
+ if ((dl_policy(policy) || dl_task(p)) && sched_dl_overflow(p, policy, attr)) {
+ retval = -EBUSY;
+ goto unlock;
+ }
+
+ p->sched_reset_on_fork = reset_on_fork;
+ oldprio = p->prio;
+
+ newprio = __normal_prio(policy, attr->sched_priority, attr->sched_nice);
+ if (pi) {
+ /*
+ * Take priority boosted tasks into account. If the new
+ * effective priority is unchanged, we just store the new
+ * normal parameters and do not touch the scheduler class and
+ * the runqueue. This will be done when the task deboosts
+ * itself.
+ */
+ newprio = rt_effective_prio(p, newprio);
+ if (newprio == oldprio)
+ queue_flags &= ~DEQUEUE_MOVE;
+ }
+
+ queued = task_on_rq_queued(p);
+ running = task_current(rq, p);
+ if (queued)
+ dequeue_task(rq, p, queue_flags);
+ if (running)
+ put_prev_task(rq, p);
+
+ prev_class = p->sched_class;
+
+ if (!(attr->sched_flags & SCHED_FLAG_KEEP_PARAMS)) {
+ __setscheduler_params(p, attr);
+ __setscheduler_prio(p, newprio);
+ }
+ __setscheduler_uclamp(p, attr);
+
+ if (queued) {
+ /*
+ * We enqueue to tail when the priority of a task is
+ * increased (user space view).
+ */
+ if (oldprio < p->prio)
+ queue_flags |= ENQUEUE_HEAD;
+
+ enqueue_task(rq, p, queue_flags);
+ }
+ if (running)
+ set_next_task(rq, p);
+
+ check_class_changed(rq, p, prev_class, oldprio);
+
+ /* Prevent the rq from going away on us: */
+ preempt_disable();
+ head = splice_balance_callbacks(rq);
+ task_rq_unlock(rq, p, &rf);
+
+ if (pi) {
+ if (cpuset_locked)
+ cpuset_unlock();
+ rt_mutex_adjust_pi(p);
+ }
+
+ /* Run balance callbacks after we've adjusted the PI chain: */
+ balance_callbacks(rq, head);
+ preempt_enable();
+
+ return 0;
+
+unlock:
+ task_rq_unlock(rq, p, &rf);
+ if (cpuset_locked)
+ cpuset_unlock();
+ return retval;
+}
+
+static int _sched_setscheduler(struct task_struct *p, int policy,
+ const struct sched_param *param, bool check)
+{
+ struct sched_attr attr = {
+ .sched_policy = policy,
+ .sched_priority = param->sched_priority,
+ .sched_nice = PRIO_TO_NICE(p->static_prio),
+ };
+
+ /* Fixup the legacy SCHED_RESET_ON_FORK hack. */
+ if ((policy != SETPARAM_POLICY) && (policy & SCHED_RESET_ON_FORK)) {
+ attr.sched_flags |= SCHED_FLAG_RESET_ON_FORK;
+ policy &= ~SCHED_RESET_ON_FORK;
+ attr.sched_policy = policy;
+ }
+
+ return __sched_setscheduler(p, &attr, check, true);
+}
+/**
+ * sched_setscheduler - change the scheduling policy and/or RT priority of a thread.
+ * @p: the task in question.
+ * @policy: new policy.
+ * @param: structure containing the new RT priority.
+ *
+ * Use sched_set_fifo(), read its comment.
+ *
+ * Return: 0 on success. An error code otherwise.
+ *
+ * NOTE that the task may be already dead.
+ */
+int sched_setscheduler(struct task_struct *p, int policy,
+ const struct sched_param *param)
+{
+ return _sched_setscheduler(p, policy, param, true);
+}
+
+int sched_setattr(struct task_struct *p, const struct sched_attr *attr)
+{
+ return __sched_setscheduler(p, attr, true, true);
+}
+
+int sched_setattr_nocheck(struct task_struct *p, const struct sched_attr *attr)
+{
+ return __sched_setscheduler(p, attr, false, true);
+}
+EXPORT_SYMBOL_GPL(sched_setattr_nocheck);
+
+/**
+ * sched_setscheduler_nocheck - change the scheduling policy and/or RT priority of a thread from kernel-space.
+ * @p: the task in question.
+ * @policy: new policy.
+ * @param: structure containing the new RT priority.
+ *
+ * Just like sched_setscheduler, only don't bother checking if the
+ * current context has permission. For example, this is needed in
+ * stop_machine(): we create temporary high priority worker threads,
+ * but our caller might not have that capability.
+ *
+ * Return: 0 on success. An error code otherwise.
+ */
+int sched_setscheduler_nocheck(struct task_struct *p, int policy,
+ const struct sched_param *param)
+{
+ return _sched_setscheduler(p, policy, param, false);
+}
+
+/*
+ * SCHED_FIFO is a broken scheduler model; that is, it is fundamentally
+ * incapable of resource management, which is the one thing an OS really should
+ * be doing.
+ *
+ * This is of course the reason it is limited to privileged users only.
+ *
+ * Worse still; it is fundamentally impossible to compose static priority
+ * workloads. You cannot take two correctly working static prio workloads
+ * and smash them together and still expect them to work.
+ *
+ * For this reason 'all' FIFO tasks the kernel creates are basically at:
+ *
+ * MAX_RT_PRIO / 2
+ *
+ * The administrator _MUST_ configure the system, the kernel simply doesn't
+ * know enough information to make a sensible choice.
+ */
+void sched_set_fifo(struct task_struct *p)
+{
+ struct sched_param sp = { .sched_priority = MAX_RT_PRIO / 2 };
+ WARN_ON_ONCE(sched_setscheduler_nocheck(p, SCHED_FIFO, &sp) != 0);
+}
+EXPORT_SYMBOL_GPL(sched_set_fifo);
+
+/*
+ * For when you don't much care about FIFO, but want to be above SCHED_NORMAL.
+ */
+void sched_set_fifo_low(struct task_struct *p)
+{
+ struct sched_param sp = { .sched_priority = 1 };
+ WARN_ON_ONCE(sched_setscheduler_nocheck(p, SCHED_FIFO, &sp) != 0);
+}
+EXPORT_SYMBOL_GPL(sched_set_fifo_low);
+
+void sched_set_normal(struct task_struct *p, int nice)
+{
+ struct sched_attr attr = {
+ .sched_policy = SCHED_NORMAL,
+ .sched_nice = nice,
+ };
+ WARN_ON_ONCE(sched_setattr_nocheck(p, &attr) != 0);
+}
+EXPORT_SYMBOL_GPL(sched_set_normal);
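A hedged sketch of the intended in-kernel usage described in the comment above: a driver spawns a kthread and calls sched_set_fifo() instead of inventing its own priority, leaving the final tuning to the administrator. The worker function and module wrapper are illustrative names only, not existing kernel code.

#include <linux/delay.h>
#include <linux/kthread.h>
#include <linux/module.h>
#include <linux/sched.h>

static struct task_struct *worker;

static int my_worker_fn(void *data)
{
	/* Placeholder work loop; sleeps until asked to stop. */
	while (!kthread_should_stop())
		msleep_interruptible(1000);
	return 0;
}

static int __init my_init(void)
{
	worker = kthread_run(my_worker_fn, NULL, "my-rt-worker");
	if (IS_ERR(worker))
		return PTR_ERR(worker);

	/* Mid-range FIFO priority; no magic per-driver numbers. */
	sched_set_fifo(worker);
	return 0;
}

static void __exit my_exit(void)
{
	kthread_stop(worker);
}

module_init(my_init);
module_exit(my_exit);
MODULE_LICENSE("GPL");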
+
+static int
+do_sched_setscheduler(pid_t pid, int policy, struct sched_param __user *param)
+{
+ struct sched_param lparam;
+
+ if (!param || pid < 0)
+ return -EINVAL;
+ if (copy_from_user(&lparam, param, sizeof(struct sched_param)))
+ return -EFAULT;
+
+ CLASS(find_get_task, p)(pid);
+ if (!p)
+ return -ESRCH;
+
+ return sched_setscheduler(p, policy, &lparam);
+}
+
+/*
+ * Mimics kernel/events/core.c perf_copy_attr().
+ */
+static int sched_copy_attr(struct sched_attr __user *uattr, struct sched_attr *attr)
+{
+ u32 size;
+ int ret;
+
+ /* Zero the full structure, so that a short copy will be nice: */
+ memset(attr, 0, sizeof(*attr));
+
+ ret = get_user(size, &uattr->size);
+ if (ret)
+ return ret;
+
+ /* ABI compatibility quirk: */
+ if (!size)
+ size = SCHED_ATTR_SIZE_VER0;
+ if (size < SCHED_ATTR_SIZE_VER0 || size > PAGE_SIZE)
+ goto err_size;
+
+ ret = copy_struct_from_user(attr, sizeof(*attr), uattr, size);
+ if (ret) {
+ if (ret == -E2BIG)
+ goto err_size;
+ return ret;
+ }
+
+ if ((attr->sched_flags & SCHED_FLAG_UTIL_CLAMP) &&
+ size < SCHED_ATTR_SIZE_VER1)
+ return -EINVAL;
+
+ /*
+ * XXX: Do we want to be lenient like existing syscalls; or do we want
+ * to be strict and return an error on out-of-bounds values?
+ */
+ attr->sched_nice = clamp(attr->sched_nice, MIN_NICE, MAX_NICE);
+
+ return 0;
+
+err_size:
+ put_user(sizeof(*attr), &uattr->size);
+ return -E2BIG;
+}
+
+static void get_params(struct task_struct *p, struct sched_attr *attr)
+{
+ if (task_has_dl_policy(p))
+ __getparam_dl(p, attr);
+ else if (task_has_rt_policy(p))
+ attr->sched_priority = p->rt_priority;
+ else
+ attr->sched_nice = task_nice(p);
+}
+
+/**
+ * sys_sched_setscheduler - set/change the scheduler policy and RT priority
+ * @pid: the pid in question.
+ * @policy: new policy.
+ * @param: structure containing the new RT priority.
+ *
+ * Return: 0 on success. An error code otherwise.
+ */
+SYSCALL_DEFINE3(sched_setscheduler, pid_t, pid, int, policy, struct sched_param __user *, param)
+{
+ if (policy < 0)
+ return -EINVAL;
+
+ return do_sched_setscheduler(pid, policy, param);
+}
+
+/**
+ * sys_sched_setparam - set/change the RT priority of a thread
+ * @pid: the pid in question.
+ * @param: structure containing the new RT priority.
+ *
+ * Return: 0 on success. An error code otherwise.
+ */
+SYSCALL_DEFINE2(sched_setparam, pid_t, pid, struct sched_param __user *, param)
+{
+ return do_sched_setscheduler(pid, SETPARAM_POLICY, param);
+}
+
+/**
+ * sys_sched_setattr - same as above, but with extended sched_attr
+ * @pid: the pid in question.
+ * @uattr: structure containing the extended parameters.
+ * @flags: for future extension.
+ */
+SYSCALL_DEFINE3(sched_setattr, pid_t, pid, struct sched_attr __user *, uattr,
+ unsigned int, flags)
+{
+ struct sched_attr attr;
+ int retval;
+
+ if (!uattr || pid < 0 || flags)
+ return -EINVAL;
+
+ retval = sched_copy_attr(uattr, &attr);
+ if (retval)
+ return retval;
+
+ if ((int)attr.sched_policy < 0)
+ return -EINVAL;
+ if (attr.sched_flags & SCHED_FLAG_KEEP_POLICY)
+ attr.sched_policy = SETPARAM_POLICY;
+
+ CLASS(find_get_task, p)(pid);
+ if (!p)
+ return -ESRCH;
+
+ if (attr.sched_flags & SCHED_FLAG_KEEP_PARAMS)
+ get_params(p, &attr);
+
+ return sched_setattr(p, &attr);
+}
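A hedged user-space sketch of invoking this syscall directly (glibc does not provide a wrapper at the time of writing). The struct below is a local mirror of the uapi sched_attr layout, the flag values are copied from the uapi header, and the clamp values are arbitrary examples; on kernels without CONFIG_UCLAMP_TASK the call fails with -EOPNOTSUPP.

#define _GNU_SOURCE
#include <stdint.h>
#include <stdio.h>
#include <string.h>
#include <sys/syscall.h>
#include <unistd.h>

/* Local mirror of the uapi struct sched_attr (VER1 layout). */
struct sched_attr {
	uint32_t size;
	uint32_t sched_policy;
	uint64_t sched_flags;
	int32_t  sched_nice;
	uint32_t sched_priority;
	uint64_t sched_runtime;
	uint64_t sched_deadline;
	uint64_t sched_period;
	uint32_t sched_util_min;
	uint32_t sched_util_max;
};

#define SCHED_FLAG_KEEP_POLICY		0x08
#define SCHED_FLAG_KEEP_PARAMS		0x10
#define SCHED_FLAG_UTIL_CLAMP_MIN	0x20
#define SCHED_FLAG_UTIL_CLAMP_MAX	0x40

int main(void)
{
	struct sched_attr attr;

	memset(&attr, 0, sizeof(attr));
	attr.size = sizeof(attr);
	/* Keep the current policy and params, only update the clamps. */
	attr.sched_flags = SCHED_FLAG_KEEP_POLICY | SCHED_FLAG_KEEP_PARAMS |
			   SCHED_FLAG_UTIL_CLAMP_MIN | SCHED_FLAG_UTIL_CLAMP_MAX;
	attr.sched_util_min = 0;
	attr.sched_util_max = 512;	/* cap this task at roughly half capacity */

	if (syscall(SYS_sched_setattr, 0, &attr, 0))
		perror("sched_setattr");
	return 0;
}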
+
+/**
+ * sys_sched_getscheduler - get the policy (scheduling class) of a thread
+ * @pid: the pid in question.
+ *
+ * Return: On success, the policy of the thread. Otherwise, a negative error
+ * code.
+ */
+SYSCALL_DEFINE1(sched_getscheduler, pid_t, pid)
+{
+ struct task_struct *p;
+ int retval;
+
+ if (pid < 0)
+ return -EINVAL;
+
+ guard(rcu)();
+ p = find_process_by_pid(pid);
+ if (!p)
+ return -ESRCH;
+
+ retval = security_task_getscheduler(p);
+ if (!retval) {
+ retval = p->policy;
+ if (p->sched_reset_on_fork)
+ retval |= SCHED_RESET_ON_FORK;
+ }
+ return retval;
+}
+
+/**
+ * sys_sched_getparam - get the RT priority of a thread
+ * @pid: the pid in question.
+ * @param: structure containing the RT priority.
+ *
+ * Return: On success, 0 and the RT priority is in @param. Otherwise, an error
+ * code.
+ */
+SYSCALL_DEFINE2(sched_getparam, pid_t, pid, struct sched_param __user *, param)
+{
+ struct sched_param lp = { .sched_priority = 0 };
+ struct task_struct *p;
+ int retval;
+
+ if (!param || pid < 0)
+ return -EINVAL;
+
+ scoped_guard (rcu) {
+ p = find_process_by_pid(pid);
+ if (!p)
+ return -ESRCH;
+
+ retval = security_task_getscheduler(p);
+ if (retval)
+ return retval;
+
+ if (task_has_rt_policy(p))
+ lp.sched_priority = p->rt_priority;
+ }
+
+ /*
+ * This one might sleep, we cannot do it with a spinlock held ...
+ */
+ return copy_to_user(param, &lp, sizeof(*param)) ? -EFAULT : 0;
+}
+
+/*
+ * Copy the kernel-sized attribute structure (which might be larger
+ * than what user-space knows about) to user-space.
+ *
+ * Note that all cases are valid: user-space buffer can be larger or
+ * smaller than the kernel-space buffer. The usual case is that both
+ * have the same size.
+ */
+static int
+sched_attr_copy_to_user(struct sched_attr __user *uattr,
+ struct sched_attr *kattr,
+ unsigned int usize)
+{
+ unsigned int ksize = sizeof(*kattr);
+
+ if (!access_ok(uattr, usize))
+ return -EFAULT;
+
+ /*
+ * sched_getattr() ABI forwards and backwards compatibility:
+ *
+ * If usize == ksize then we just copy everything to user-space and all is good.
+ *
+ * If usize < ksize then we only copy as much as user-space has space for,
+ * this keeps ABI compatibility as well. We skip the rest.
+ *
+ * If usize > ksize then user-space is using a newer version of the ABI,
+ * parts of which the kernel doesn't know about. Just ignore them - tooling can
+ * detect the kernel's knowledge of attributes from the attr->size value
+ * which is set to ksize in this case.
+ */
+ kattr->size = min(usize, ksize);
+
+ if (copy_to_user(uattr, kattr, kattr->size))
+ return -EFAULT;
+
+ return 0;
+}
+
+/**
+ * sys_sched_getattr - similar to sched_getparam, but with sched_attr
+ * @pid: the pid in question.
+ * @uattr: structure containing the extended parameters.
+ * @usize: sizeof(attr) for fwd/bwd comp.
+ * @flags: for future extension.
+ */
+SYSCALL_DEFINE4(sched_getattr, pid_t, pid, struct sched_attr __user *, uattr,
+ unsigned int, usize, unsigned int, flags)
+{
+ struct sched_attr kattr = { };
+ struct task_struct *p;
+ int retval;
+
+ if (!uattr || pid < 0 || usize > PAGE_SIZE ||
+ usize < SCHED_ATTR_SIZE_VER0 || flags)
+ return -EINVAL;
+
+ scoped_guard (rcu) {
+ p = find_process_by_pid(pid);
+ if (!p)
+ return -ESRCH;
+
+ retval = security_task_getscheduler(p);
+ if (retval)
+ return retval;
+
+ kattr.sched_policy = p->policy;
+ if (p->sched_reset_on_fork)
+ kattr.sched_flags |= SCHED_FLAG_RESET_ON_FORK;
+ get_params(p, &kattr);
+ kattr.sched_flags &= SCHED_FLAG_ALL;
+
+#ifdef CONFIG_UCLAMP_TASK
+ /*
+ * This could race with another potential updater, but this is fine
+ * because it'll correctly read the old or the new value. We don't need
+ * to guarantee who wins the race as long as it doesn't return garbage.
+ */
+ kattr.sched_util_min = p->uclamp_req[UCLAMP_MIN].value;
+ kattr.sched_util_max = p->uclamp_req[UCLAMP_MAX].value;
+#endif
+ }
+
+ return sched_attr_copy_to_user(uattr, &kattr, usize);
+}
+
+#ifdef CONFIG_SMP
+int dl_task_check_affinity(struct task_struct *p, const struct cpumask *mask)
+{
+ /*
+ * If the task isn't a deadline task or admission control is
+ * disabled then we don't care about affinity changes.
+ */
+ if (!task_has_dl_policy(p) || !dl_bandwidth_enabled())
+ return 0;
+
+ /*
+ * Since bandwidth control happens on a root_domain basis,
+ * if admission test is enabled, we only admit -deadline
+ * tasks allowed to run on all the CPUs in the task's
+ * root_domain.
+ */
+ guard(rcu)();
+ if (!cpumask_subset(task_rq(p)->rd->span, mask))
+ return -EBUSY;
+
+ return 0;
+}
+#endif /* CONFIG_SMP */
+
+int __sched_setaffinity(struct task_struct *p, struct affinity_context *ctx)
+{
+ int retval;
+ cpumask_var_t cpus_allowed, new_mask;
+
+ if (!alloc_cpumask_var(&cpus_allowed, GFP_KERNEL))
+ return -ENOMEM;
+
+ if (!alloc_cpumask_var(&new_mask, GFP_KERNEL)) {
+ retval = -ENOMEM;
+ goto out_free_cpus_allowed;
+ }
+
+ cpuset_cpus_allowed(p, cpus_allowed);
+ cpumask_and(new_mask, ctx->new_mask, cpus_allowed);
+
+ ctx->new_mask = new_mask;
+ ctx->flags |= SCA_CHECK;
+
+ retval = dl_task_check_affinity(p, new_mask);
+ if (retval)
+ goto out_free_new_mask;
+
+ retval = __set_cpus_allowed_ptr(p, ctx);
+ if (retval)
+ goto out_free_new_mask;
+
+ cpuset_cpus_allowed(p, cpus_allowed);
+ if (!cpumask_subset(new_mask, cpus_allowed)) {
+ /*
+ * We must have raced with a concurrent cpuset update.
+ * Just reset the cpumask to the cpuset's cpus_allowed.
+ */
+ cpumask_copy(new_mask, cpus_allowed);
+
+ /*
+ * If SCA_USER is set, a 2nd call to __set_cpus_allowed_ptr()
+ * will restore the previous user_cpus_ptr value.
+ *
+ * In the unlikely event a previous user_cpus_ptr exists,
+ * we need to further restrict the mask to what is allowed
+ * by that old user_cpus_ptr.
+ */
+ if (unlikely((ctx->flags & SCA_USER) && ctx->user_mask)) {
+ bool empty = !cpumask_and(new_mask, new_mask,
+ ctx->user_mask);
+
+ if (WARN_ON_ONCE(empty))
+ cpumask_copy(new_mask, cpus_allowed);
+ }
+ __set_cpus_allowed_ptr(p, ctx);
+ retval = -EINVAL;
+ }
+
+out_free_new_mask:
+ free_cpumask_var(new_mask);
+out_free_cpus_allowed:
+ free_cpumask_var(cpus_allowed);
+ return retval;
+}
+
+long sched_setaffinity(pid_t pid, const struct cpumask *in_mask)
+{
+ struct affinity_context ac;
+ struct cpumask *user_mask;
+ int retval;
+
+ CLASS(find_get_task, p)(pid);
+ if (!p)
+ return -ESRCH;
+
+ if (p->flags & PF_NO_SETAFFINITY)
+ return -EINVAL;
+
+ if (!check_same_owner(p)) {
+ guard(rcu)();
+ if (!ns_capable(__task_cred(p)->user_ns, CAP_SYS_NICE))
+ return -EPERM;
+ }
+
+ retval = security_task_setscheduler(p);
+ if (retval)
+ return retval;
+
+ /*
+ * With non-SMP configs, user_cpus_ptr/user_mask isn't used and
+ * alloc_user_cpus_ptr() returns NULL.
+ */
+ user_mask = alloc_user_cpus_ptr(NUMA_NO_NODE);
+ if (user_mask) {
+ cpumask_copy(user_mask, in_mask);
+ } else if (IS_ENABLED(CONFIG_SMP)) {
+ return -ENOMEM;
+ }
+
+ ac = (struct affinity_context){
+ .new_mask = in_mask,
+ .user_mask = user_mask,
+ .flags = SCA_USER,
+ };
+
+ retval = __sched_setaffinity(p, &ac);
+ kfree(ac.user_mask);
+
+ return retval;
+}
+
+static int get_user_cpu_mask(unsigned long __user *user_mask_ptr, unsigned len,
+ struct cpumask *new_mask)
+{
+ if (len < cpumask_size())
+ cpumask_clear(new_mask);
+ else if (len > cpumask_size())
+ len = cpumask_size();
+
+ return copy_from_user(new_mask, user_mask_ptr, len) ? -EFAULT : 0;
+}
+
+/**
+ * sys_sched_setaffinity - set the CPU affinity of a process
+ * @pid: pid of the process
+ * @len: length in bytes of the bitmask pointed to by user_mask_ptr
+ * @user_mask_ptr: user-space pointer to the new CPU mask
+ *
+ * Return: 0 on success. An error code otherwise.
+ */
+SYSCALL_DEFINE3(sched_setaffinity, pid_t, pid, unsigned int, len,
+ unsigned long __user *, user_mask_ptr)
+{
+ cpumask_var_t new_mask;
+ int retval;
+
+ if (!alloc_cpumask_var(&new_mask, GFP_KERNEL))
+ return -ENOMEM;
+
+ retval = get_user_cpu_mask(user_mask_ptr, len, new_mask);
+ if (retval == 0)
+ retval = sched_setaffinity(pid, new_mask);
+ free_cpumask_var(new_mask);
+ return retval;
+}
+
+long sched_getaffinity(pid_t pid, struct cpumask *mask)
+{
+ struct task_struct *p;
+ int retval;
+
+ guard(rcu)();
+ p = find_process_by_pid(pid);
+ if (!p)
+ return -ESRCH;
+
+ retval = security_task_getscheduler(p);
+ if (retval)
+ return retval;
+
+ guard(raw_spinlock_irqsave)(&p->pi_lock);
+ cpumask_and(mask, &p->cpus_mask, cpu_active_mask);
+
+ return 0;
+}
+
+/**
+ * sys_sched_getaffinity - get the CPU affinity of a process
+ * @pid: pid of the process
+ * @len: length in bytes of the bitmask pointed to by user_mask_ptr
+ * @user_mask_ptr: user-space pointer to hold the current CPU mask
+ *
+ * Return: size of CPU mask copied to user_mask_ptr on success. An
+ * error code otherwise.
+ */
+SYSCALL_DEFINE3(sched_getaffinity, pid_t, pid, unsigned int, len,
+ unsigned long __user *, user_mask_ptr)
+{
+ int ret;
+ cpumask_var_t mask;
+
+ if ((len * BITS_PER_BYTE) < nr_cpu_ids)
+ return -EINVAL;
+ if (len & (sizeof(unsigned long)-1))
+ return -EINVAL;
+
+ if (!zalloc_cpumask_var(&mask, GFP_KERNEL))
+ return -ENOMEM;
+
+ ret = sched_getaffinity(pid, mask);
+ if (ret == 0) {
+ unsigned int retlen = min(len, cpumask_size());
+
+ if (copy_to_user(user_mask_ptr, cpumask_bits(mask), retlen))
+ ret = -EFAULT;
+ else
+ ret = retlen;
+ }
+ free_cpumask_var(mask);
+
+ return ret;
+}
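A hedged user-space sketch of both affinity syscalls via their glibc wrappers; note that the wrapper hides the "size of CPU mask copied" return value described above and reports plain 0 on success. Pinning to CPU 0 is an arbitrary example.

#define _GNU_SOURCE
#include <sched.h>
#include <stdio.h>

int main(void)
{
	cpu_set_t set;

	CPU_ZERO(&set);
	CPU_SET(0, &set);	/* pin the calling thread to CPU 0 */
	if (sched_setaffinity(0, sizeof(set), &set))
		perror("sched_setaffinity");

	CPU_ZERO(&set);
	if (sched_getaffinity(0, sizeof(set), &set) == 0)
		printf("allowed to run on CPU 0? %s\n",
		       CPU_ISSET(0, &set) ? "yes" : "no");
	return 0;
}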
+
+static void do_sched_yield(void)
+{
+ struct rq_flags rf;
+ struct rq *rq;
+
+ rq = this_rq_lock_irq(&rf);
+
+ schedstat_inc(rq->yld_count);
+ current->sched_class->yield_task(rq);
+
+ preempt_disable();
+ rq_unlock_irq(rq, &rf);
+ sched_preempt_enable_no_resched();
+
+ schedule();
+}
+
+/**
+ * sys_sched_yield - yield the current processor to other threads.
+ *
+ * This function yields the current CPU to other tasks. If there are no
+ * other threads running on this CPU then this function will return.
+ *
+ * Return: 0.
+ */
+SYSCALL_DEFINE0(sched_yield)
+{
+ do_sched_yield();
+ return 0;
+}
+
+/**
+ * yield - yield the current processor to other threads.
+ *
+ * Do not ever use this function, there's a 99% chance you're doing it wrong.
+ *
+ * The scheduler is at all times free to pick the calling task as the most
+ * eligible task to run, if removing the yield() call from your code breaks
+ * it, it's already broken.
+ *
+ * Typical broken usage is:
+ *
+ * while (!event)
+ * yield();
+ *
+ * where one assumes that yield() will let 'the other' process run and
+ * make event true. If the current task is a SCHED_FIFO task that will never
+ * happen. Never use yield() as a progress guarantee!!
+ *
+ * If you want to use yield() to wait for something, use wait_event().
+ * If you want to use yield() to be 'nice' for others, use cond_resched().
+ * If you still want to use yield(), do not!
+ */
+void __sched yield(void)
+{
+ set_current_state(TASK_RUNNING);
+ do_sched_yield();
+}
+EXPORT_SYMBOL(yield);
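A hedged sketch of the pattern the comment above recommends instead of a yield() loop: the waiter sleeps on a waitqueue until the condition is true and the producer wakes it. The "event_ready" flag, the waitqueue, and the two functions are illustrative names only.

#include <linux/types.h>
#include <linux/wait.h>

static DECLARE_WAIT_QUEUE_HEAD(my_wq);
static bool event_ready;

static void consumer(void)
{
	/* Sleeps until the condition becomes true; no busy yielding. */
	wait_event(my_wq, event_ready);
}

static void producer(void)
{
	event_ready = true;
	wake_up(&my_wq);	/* wait_event() re-checks the condition */
}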
+
+/**
+ * yield_to - yield the current processor to another thread in
+ * your thread group, or accelerate that thread toward the
+ * processor it's on.
+ * @p: target task
+ * @preempt: whether task preemption is allowed or not
+ *
+ * It's the caller's job to ensure that the target task struct
+ * can't go away on us before we can do any checks.
+ *
+ * Return:
+ * true (>0) if we indeed boosted the target task.
+ * false (0) if we failed to boost the target.
+ * -ESRCH if there's no task to yield to.
+ */
+int __sched yield_to(struct task_struct *p, bool preempt)
+{
+ struct task_struct *curr = current;
+ struct rq *rq, *p_rq;
+ int yielded = 0;
+
+ scoped_guard (irqsave) {
+ rq = this_rq();
+
+again:
+ p_rq = task_rq(p);
+ /*
+ * If we're the only runnable task on the rq and target rq also
+ * has only one task, there's absolutely no point in yielding.
+ */
+ if (rq->nr_running == 1 && p_rq->nr_running == 1)
+ return -ESRCH;
+
+ guard(double_rq_lock)(rq, p_rq);
+ if (task_rq(p) != p_rq)
+ goto again;
+
+ if (!curr->sched_class->yield_to_task)
+ return 0;
+
+ if (curr->sched_class != p->sched_class)
+ return 0;
+
+ if (task_on_cpu(p_rq, p) || !task_is_running(p))
+ return 0;
+
+ yielded = curr->sched_class->yield_to_task(rq, p);
+ if (yielded) {
+ schedstat_inc(rq->yld_count);
+ /*
+ * Make p's CPU reschedule; pick_next_entity
+ * takes care of fairness.
+ */
+ if (preempt && rq != p_rq)
+ resched_curr(p_rq);
+ }
+ }
+
+ if (yielded)
+ schedule();
+
+ return yielded;
+}
+EXPORT_SYMBOL_GPL(yield_to);
+
+/**
+ * sys_sched_get_priority_max - return maximum RT priority.
+ * @policy: scheduling class.
+ *
+ * Return: On success, this syscall returns the maximum
+ * rt_priority that can be used by a given scheduling class.
+ * On failure, a negative error code is returned.
+ */
+SYSCALL_DEFINE1(sched_get_priority_max, int, policy)
+{
+ int ret = -EINVAL;
+
+ switch (policy) {
+ case SCHED_FIFO:
+ case SCHED_RR:
+ ret = MAX_RT_PRIO-1;
+ break;
+ case SCHED_DEADLINE:
+ case SCHED_NORMAL:
+ case SCHED_BATCH:
+ case SCHED_IDLE:
+ ret = 0;
+ break;
+ }
+ return ret;
+}
+
+/**
+ * sys_sched_get_priority_min - return minimum RT priority.
+ * @policy: scheduling class.
+ *
+ * Return: On success, this syscall returns the minimum
+ * rt_priority that can be used by a given scheduling class.
+ * On failure, a negative error code is returned.
+ */
+SYSCALL_DEFINE1(sched_get_priority_min, int, policy)
+{
+ int ret = -EINVAL;
+
+ switch (policy) {
+ case SCHED_FIFO:
+ case SCHED_RR:
+ ret = 1;
+ break;
+ case SCHED_DEADLINE:
+ case SCHED_NORMAL:
+ case SCHED_BATCH:
+ case SCHED_IDLE:
+ ret = 0;
+ }
+ return ret;
+}
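A hedged user-space sketch tying the two range queries above to an actual policy change: query the SCHED_FIFO range, then request the lowest RT priority. The call normally needs CAP_SYS_NICE or a suitable RLIMIT_RTPRIO, as the permission checks earlier in this file enforce.

#include <sched.h>
#include <stdio.h>

int main(void)
{
	struct sched_param sp;
	int max = sched_get_priority_max(SCHED_FIFO);
	int min = sched_get_priority_min(SCHED_FIFO);

	printf("SCHED_FIFO priority range: %d..%d\n", min, max);

	sp.sched_priority = min;	/* lowest RT priority */
	if (sched_setscheduler(0, SCHED_FIFO, &sp))
		perror("sched_setscheduler");	/* usually needs CAP_SYS_NICE */
	return 0;
}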
+
+static int sched_rr_get_interval(pid_t pid, struct timespec64 *t)
+{
+ unsigned int time_slice = 0;
+ int retval;
+
+ if (pid < 0)
+ return -EINVAL;
+
+ scoped_guard (rcu) {
+ struct task_struct *p = find_process_by_pid(pid);
+ if (!p)
+ return -ESRCH;
+
+ retval = security_task_getscheduler(p);
+ if (retval)
+ return retval;
+
+ scoped_guard (task_rq_lock, p) {
+ struct rq *rq = scope.rq;
+ if (p->sched_class->get_rr_interval)
+ time_slice = p->sched_class->get_rr_interval(rq, p);
+ }
+ }
+
+ jiffies_to_timespec64(time_slice, t);
+ return 0;
+}
+
+/**
+ * sys_sched_rr_get_interval - return the default time-slice of a process.
+ * @pid: pid of the process.
+ * @interval: userspace pointer to the time-slice value.
+ *
+ * this syscall writes the default time-slice value of a given process
+ * into the user-space timespec buffer. A value of '0' means infinity.
+ *
+ * Return: On success, 0 and the time-slice is in @interval. Otherwise,
+ * an error code.
+ */
+SYSCALL_DEFINE2(sched_rr_get_interval, pid_t, pid,
+ struct __kernel_timespec __user *, interval)
+{
+ struct timespec64 t;
+ int retval = sched_rr_get_interval(pid, &t);
+
+ if (retval == 0)
+ retval = put_timespec64(&t, interval);
+
+ return retval;
+}
+
+#ifdef CONFIG_COMPAT_32BIT_TIME
+SYSCALL_DEFINE2(sched_rr_get_interval_time32, pid_t, pid,
+ struct old_timespec32 __user *, interval)
+{
+ struct timespec64 t;
+ int retval = sched_rr_get_interval(pid, &t);
+
+ if (retval == 0)
+ retval = put_old_timespec32(&t, interval);
+ return retval;
+}
+#endif
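A hedged user-space sketch of querying the round-robin timeslice through the glibc wrapper; per the kerneldoc above, a zero result means "infinity", which is what non-SCHED_RR tasks report.

#include <sched.h>
#include <stdio.h>
#include <time.h>

int main(void)
{
	struct timespec ts;

	if (sched_rr_get_interval(0, &ts) == 0)
		printf("RR timeslice: %ld.%09ld s (0 means infinity, e.g. not SCHED_RR)\n",
		       (long)ts.tv_sec, ts.tv_nsec);
	else
		perror("sched_rr_get_interval");
	return 0;
}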
diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c
index a6994a1fcc90..76504b776d03 100644
--- a/kernel/sched/topology.c
+++ b/kernel/sched/topology.c
@@ -285,7 +285,7 @@ void rebuild_sched_domains_energy(void)
}
#ifdef CONFIG_PROC_SYSCTL
-static int sched_energy_aware_handler(struct ctl_table *table, int write,
+static int sched_energy_aware_handler(const struct ctl_table *table, int write,
void *buffer, size_t *lenp, loff_t *ppos)
{
int ret, state;
@@ -501,7 +501,7 @@ void rq_attach_root(struct rq *rq, struct root_domain *rd)
cpumask_clear_cpu(rq->cpu, old_rd->span);
/*
- * If we dont want to free the old_rd yet then
+ * If we don't want to free the old_rd yet then
* set old_rd to NULL to skip the freeing later
* in this function:
*/
@@ -1176,7 +1176,7 @@ fail:
* uniquely identify each group (for a given domain):
*
* - The first is the balance_cpu (see should_we_balance() and the
- * load-balance blub in fair.c); for each group we only want 1 CPU to
+ * load-balance blurb in fair.c); for each group we only want 1 CPU to
* continue balancing at a higher domain.
*
* - The second is the sched_group_capacity; we want all identical groups
@@ -1388,7 +1388,7 @@ static inline void asym_cpu_capacity_update_data(int cpu)
/*
* Search if capacity already exits. If not, track which the entry
- * where we should insert to keep the list ordered descendingly.
+ * where we should insert to keep the list ordered descending.
*/
list_for_each_entry(entry, &asym_cap_list, link) {
if (capacity == entry->capacity)
@@ -1853,7 +1853,7 @@ void sched_init_numa(int offline_node)
struct cpumask ***masks;
/*
- * O(nr_nodes^2) deduplicating selection sort -- in order to find the
+ * O(nr_nodes^2) de-duplicating selection sort -- in order to find the
* unique distances in the node_distance() table.
*/
distance_map = bitmap_alloc(NR_DISTANCE_VALUES, GFP_KERNEL);
@@ -2750,7 +2750,7 @@ match2:
}
#if defined(CONFIG_ENERGY_MODEL) && defined(CONFIG_CPU_FREQ_GOV_SCHEDUTIL)
- /* Build perf. domains: */
+ /* Build perf domains: */
for (i = 0; i < ndoms_new; i++) {
for (j = 0; j < n && !sched_energy_update; j++) {
if (cpumask_equal(doms_new[i], doms_cur[j]) &&
@@ -2759,7 +2759,7 @@ match2:
goto match3;
}
}
- /* No match - add perf. domains for a new rd */
+ /* No match - add perf domains for a new rd */
has_eas |= build_perf_domains(doms_new[i]);
match3:
;
diff --git a/kernel/sched/wait_bit.c b/kernel/sched/wait_bit.c
index 0b1cd985dc27..134d7112ef71 100644
--- a/kernel/sched/wait_bit.c
+++ b/kernel/sched/wait_bit.c
@@ -33,7 +33,7 @@ int wake_bit_function(struct wait_queue_entry *wq_entry, unsigned mode, int sync
EXPORT_SYMBOL(wake_bit_function);
/*
- * To allow interruptible waiting and asynchronous (i.e. nonblocking)
+ * To allow interruptible waiting and asynchronous (i.e. non-blocking)
* waiting, the actions of __wait_on_bit() and __wait_on_bit_lock() are
* permitted return codes. Nonzero return codes halt waiting and return.
*/
@@ -133,7 +133,7 @@ EXPORT_SYMBOL(__wake_up_bit);
* @bit: the bit of the word being waited on
*
* There is a standard hashed waitqueue table for generic use. This
- * is the part of the hashtable's accessor API that wakes up waiters
+ * is the part of the hash-table's accessor API that wakes up waiters
* on a bit. For instance, if one were to have waiters on a bitflag,
* one would call wake_up_bit() after clearing the bit.
*
diff --git a/kernel/seccomp.c b/kernel/seccomp.c
index e30b60b57614..385d48293a5f 100644
--- a/kernel/seccomp.c
+++ b/kernel/seccomp.c
@@ -502,6 +502,9 @@ static inline pid_t seccomp_can_sync_threads(void)
/* Skip current, since it is initiating the sync. */
if (thread == caller)
continue;
+ /* Skip exited threads. */
+ if (thread->flags & PF_EXITING)
+ continue;
if (thread->seccomp.mode == SECCOMP_MODE_DISABLED ||
(thread->seccomp.mode == SECCOMP_MODE_FILTER &&
@@ -563,18 +566,21 @@ static void __seccomp_filter_release(struct seccomp_filter *orig)
* @tsk: task the filter should be released from.
*
* This function should only be called when the task is exiting as
- * it detaches it from its filter tree. As such, READ_ONCE() and
- * barriers are not needed here, as would normally be needed.
+ * it detaches it from its filter tree. PF_EXITING has to be set
+ * for the task.
*/
void seccomp_filter_release(struct task_struct *tsk)
{
- struct seccomp_filter *orig = tsk->seccomp.filter;
+ struct seccomp_filter *orig;
- /* We are effectively holding the siglock by not having any sighand. */
- WARN_ON(tsk->sighand != NULL);
+ if (WARN_ON((tsk->flags & PF_EXITING) == 0))
+ return;
+ spin_lock_irq(&tsk->sighand->siglock);
+ orig = tsk->seccomp.filter;
/* Detach task from its filter tree. */
tsk->seccomp.filter = NULL;
+ spin_unlock_irq(&tsk->sighand->siglock);
__seccomp_filter_release(orig);
}
@@ -602,6 +608,13 @@ static inline void seccomp_sync_threads(unsigned long flags)
if (thread == caller)
continue;
+ /*
+ * Skip exited threads. seccomp_filter_release could have
+ * been already called for this task.
+ */
+ if (thread->flags & PF_EXITING)
+ continue;
+
/* Get a task reference for the new leaf node. */
get_seccomp_filter(caller);
@@ -1466,7 +1479,7 @@ static int recv_wake_function(wait_queue_entry_t *wait, unsigned int mode, int s
void *key)
{
/* Avoid a wakeup if event not interesting for us. */
- if (key && !(key_to_poll(key) & (EPOLLIN | EPOLLERR)))
+ if (key && !(key_to_poll(key) & (EPOLLIN | EPOLLERR | EPOLLHUP)))
return 0;
return autoremove_wake_function(wait, mode, sync, key);
}
@@ -1476,6 +1489,9 @@ static int recv_wait_event(struct seccomp_filter *filter)
DEFINE_WAIT_FUNC(wait, recv_wake_function);
int ret;
+ if (refcount_read(&filter->users) == 0)
+ return 0;
+
if (atomic_dec_if_positive(&filter->notif->requests) >= 0)
return 0;
@@ -1484,6 +1500,8 @@ static int recv_wait_event(struct seccomp_filter *filter)
if (atomic_dec_if_positive(&filter->notif->requests) >= 0)
break;
+ if (refcount_read(&filter->users) == 0)
+ break;
if (ret)
return ret;
@@ -2413,7 +2431,7 @@ static void audit_actions_logged(u32 actions_logged, u32 old_actions_logged,
return audit_seccomp_actions_logged(new, old, !ret);
}
-static int seccomp_actions_logged_handler(struct ctl_table *ro_table, int write,
+static int seccomp_actions_logged_handler(const struct ctl_table *ro_table, int write,
void *buffer, size_t *lenp,
loff_t *ppos)
{
diff --git a/kernel/signal.c b/kernel/signal.c
index 1f9dd41c04be..60c737e423a1 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -2600,6 +2600,14 @@ static void do_freezer_trap(void)
spin_unlock_irq(&current->sighand->siglock);
cgroup_enter_frozen();
schedule();
+
+ /*
+ * We could've been woken by task_work, run it to clear
+ * TIF_NOTIFY_SIGNAL. The caller will retry if necessary.
+ */
+ clear_notify_signal();
+ if (unlikely(task_work_pending(current)))
+ task_work_run();
}
static int ptrace_signal(int signr, kernel_siginfo_t *info, enum pid_type type)
diff --git a/kernel/smp.c b/kernel/smp.c
index f085ebcdf9e7..aaffecdad319 100644
--- a/kernel/smp.c
+++ b/kernel/smp.c
@@ -25,6 +25,7 @@
#include <linux/nmi.h>
#include <linux/sched/debug.h>
#include <linux/jump_label.h>
+#include <linux/string_choices.h>
#include <trace/events/ipi.h>
#define CREATE_TRACE_POINTS
@@ -982,8 +983,7 @@ void __init smp_init(void)
num_nodes = num_online_nodes();
num_cpus = num_online_cpus();
pr_info("Brought up %d node%s, %d CPU%s\n",
- num_nodes, (num_nodes > 1 ? "s" : ""),
- num_cpus, (num_cpus > 1 ? "s" : ""));
+ num_nodes, str_plural(num_nodes), num_cpus, str_plural(num_cpus));
/* Any cleanup work */
smp_cpus_done(setup_max_cpus);
@@ -1119,6 +1119,7 @@ int smp_call_on_cpu(unsigned int cpu, int (*func)(void *), void *par, bool phys)
queue_work_on(cpu, system_wq, &sscs.work);
wait_for_completion(&sscs.done);
+ destroy_work_on_stack(&sscs.work);
return sscs.ret;
}
diff --git a/kernel/stackleak.c b/kernel/stackleak.c
index 0f9712584913..39fd620a7db6 100644
--- a/kernel/stackleak.c
+++ b/kernel/stackleak.c
@@ -21,7 +21,7 @@
static DEFINE_STATIC_KEY_FALSE(stack_erasing_bypass);
#ifdef CONFIG_SYSCTL
-static int stack_erasing_sysctl(struct ctl_table *table, int write,
+static int stack_erasing_sysctl(const struct ctl_table *table, int write,
void __user *buffer, size_t *lenp, loff_t *ppos)
{
int ret = 0;
diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c
index b696b85ac63e..c00a86931f8c 100644
--- a/kernel/sys_ni.c
+++ b/kernel/sys_ni.c
@@ -76,8 +76,6 @@ COND_SYSCALL(timerfd_gettime32);
COND_SYSCALL(acct);
COND_SYSCALL(capget);
COND_SYSCALL(capset);
-/* __ARCH_WANT_SYS_CLONE3 */
-COND_SYSCALL(clone3);
COND_SYSCALL(futex);
COND_SYSCALL(futex_time32);
COND_SYSCALL(set_robust_list);
@@ -392,3 +390,5 @@ COND_SYSCALL(setuid16);
/* restartable sequence */
COND_SYSCALL(rseq);
+
+COND_SYSCALL(uretprobe);
diff --git a/kernel/sysctl-test.c b/kernel/sysctl-test.c
index 6ef887c19c48..3ac98bb7fb82 100644
--- a/kernel/sysctl-test.c
+++ b/kernel/sysctl-test.c
@@ -367,6 +367,54 @@ static void sysctl_test_api_dointvec_write_single_greater_int_max(
KUNIT_EXPECT_EQ(test, 0, *((int *)table.data));
}
+/*
+ * Test that registering an invalid extra value is not allowed.
+ */
+static void sysctl_test_register_sysctl_sz_invalid_extra_value(
+ struct kunit *test)
+{
+ unsigned char data = 0;
+ struct ctl_table table_foo[] = {
+ {
+ .procname = "foo",
+ .data = &data,
+ .maxlen = sizeof(u8),
+ .mode = 0644,
+ .proc_handler = proc_dou8vec_minmax,
+ .extra1 = SYSCTL_FOUR,
+ .extra2 = SYSCTL_ONE_THOUSAND,
+ },
+ };
+
+ struct ctl_table table_bar[] = {
+ {
+ .procname = "bar",
+ .data = &data,
+ .maxlen = sizeof(u8),
+ .mode = 0644,
+ .proc_handler = proc_dou8vec_minmax,
+ .extra1 = SYSCTL_NEG_ONE,
+ .extra2 = SYSCTL_ONE_HUNDRED,
+ },
+ };
+
+ struct ctl_table table_qux[] = {
+ {
+ .procname = "qux",
+ .data = &data,
+ .maxlen = sizeof(u8),
+ .mode = 0644,
+ .proc_handler = proc_dou8vec_minmax,
+ .extra1 = SYSCTL_ZERO,
+ .extra2 = SYSCTL_TWO_HUNDRED,
+ },
+ };
+
+ KUNIT_EXPECT_NULL(test, register_sysctl("foo", table_foo));
+ KUNIT_EXPECT_NULL(test, register_sysctl("foo", table_bar));
+ KUNIT_EXPECT_NOT_NULL(test, register_sysctl("foo", table_qux));
+}
+
static struct kunit_case sysctl_test_cases[] = {
KUNIT_CASE(sysctl_test_api_dointvec_null_tbl_data),
KUNIT_CASE(sysctl_test_api_dointvec_table_maxlen_unset),
@@ -378,6 +426,7 @@ static struct kunit_case sysctl_test_cases[] = {
KUNIT_CASE(sysctl_test_dointvec_write_happy_single_negative),
KUNIT_CASE(sysctl_test_api_dointvec_write_single_less_int_min),
KUNIT_CASE(sysctl_test_api_dointvec_write_single_greater_int_max),
+ KUNIT_CASE(sysctl_test_register_sysctl_sz_invalid_extra_value),
{}
};
@@ -388,4 +437,5 @@ static struct kunit_suite sysctl_test_suite = {
kunit_test_suites(&sysctl_test_suite);
+MODULE_DESCRIPTION("KUnit test of proc sysctl");
MODULE_LICENSE("GPL v2");
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index e0b917328cf9..79e6cb1d5c48 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -205,7 +205,7 @@ static int _proc_do_string(char *data, int maxlen, int write,
return 0;
}
-static void warn_sysctl_write(struct ctl_table *table)
+static void warn_sysctl_write(const struct ctl_table *table)
{
pr_warn_once("%s wrote to %s when file position was not 0!\n"
"This will not be supported in the future. To silence this\n"
@@ -223,7 +223,7 @@ static void warn_sysctl_write(struct ctl_table *table)
* handlers can ignore the return value.
*/
static bool proc_first_pos_non_zero_ignore(loff_t *ppos,
- struct ctl_table *table)
+ const struct ctl_table *table)
{
if (!*ppos)
return false;
@@ -256,7 +256,7 @@ static bool proc_first_pos_non_zero_ignore(loff_t *ppos,
*
* Returns 0 on success.
*/
-int proc_dostring(struct ctl_table *table, int write,
+int proc_dostring(const struct ctl_table *table, int write,
void *buffer, size_t *lenp, loff_t *ppos)
{
if (write)
@@ -468,7 +468,7 @@ static int do_proc_douintvec_conv(unsigned long *lvalp,
static const char proc_wspace_sep[] = { ' ', '\t', '\n' };
-static int __do_proc_dointvec(void *tbl_data, struct ctl_table *table,
+static int __do_proc_dointvec(void *tbl_data, const struct ctl_table *table,
int write, void *buffer,
size_t *lenp, loff_t *ppos,
int (*conv)(bool *negp, unsigned long *lvalp, int *valp,
@@ -541,7 +541,7 @@ out:
return err;
}
-static int do_proc_dointvec(struct ctl_table *table, int write,
+static int do_proc_dointvec(const struct ctl_table *table, int write,
void *buffer, size_t *lenp, loff_t *ppos,
int (*conv)(bool *negp, unsigned long *lvalp, int *valp,
int write, void *data),
@@ -552,7 +552,7 @@ static int do_proc_dointvec(struct ctl_table *table, int write,
}
static int do_proc_douintvec_w(unsigned int *tbl_data,
- struct ctl_table *table,
+ const struct ctl_table *table,
void *buffer,
size_t *lenp, loff_t *ppos,
int (*conv)(unsigned long *lvalp,
@@ -639,7 +639,7 @@ out:
return err;
}
-static int __do_proc_douintvec(void *tbl_data, struct ctl_table *table,
+static int __do_proc_douintvec(void *tbl_data, const struct ctl_table *table,
int write, void *buffer,
size_t *lenp, loff_t *ppos,
int (*conv)(unsigned long *lvalp,
@@ -675,7 +675,7 @@ static int __do_proc_douintvec(void *tbl_data, struct ctl_table *table,
return do_proc_douintvec_r(i, buffer, lenp, ppos, conv, data);
}
-int do_proc_douintvec(struct ctl_table *table, int write,
+int do_proc_douintvec(const struct ctl_table *table, int write,
void *buffer, size_t *lenp, loff_t *ppos,
int (*conv)(unsigned long *lvalp,
unsigned int *valp,
@@ -702,7 +702,7 @@ int do_proc_douintvec(struct ctl_table *table, int write,
*
* Returns 0 on success.
*/
-int proc_dobool(struct ctl_table *table, int write, void *buffer,
+int proc_dobool(const struct ctl_table *table, int write, void *buffer,
size_t *lenp, loff_t *ppos)
{
struct ctl_table tmp;
@@ -739,7 +739,7 @@ int proc_dobool(struct ctl_table *table, int write, void *buffer,
*
* Returns 0 on success.
*/
-int proc_dointvec(struct ctl_table *table, int write, void *buffer,
+int proc_dointvec(const struct ctl_table *table, int write, void *buffer,
size_t *lenp, loff_t *ppos)
{
return do_proc_dointvec(table, write, buffer, lenp, ppos, NULL, NULL);
@@ -758,7 +758,7 @@ int proc_dointvec(struct ctl_table *table, int write, void *buffer,
*
* Returns 0 on success.
*/
-int proc_douintvec(struct ctl_table *table, int write, void *buffer,
+int proc_douintvec(const struct ctl_table *table, int write, void *buffer,
size_t *lenp, loff_t *ppos)
{
return do_proc_douintvec(table, write, buffer, lenp, ppos,
@@ -769,7 +769,7 @@ int proc_douintvec(struct ctl_table *table, int write, void *buffer,
* Taint values can only be increased
* This means we can safely use a temporary.
*/
-static int proc_taint(struct ctl_table *table, int write,
+static int proc_taint(const struct ctl_table *table, int write,
void *buffer, size_t *lenp, loff_t *ppos)
{
struct ctl_table t;
@@ -864,7 +864,7 @@ static int do_proc_dointvec_minmax_conv(bool *negp, unsigned long *lvalp,
*
* Returns 0 on success or -EINVAL on write when the range check fails.
*/
-int proc_dointvec_minmax(struct ctl_table *table, int write,
+int proc_dointvec_minmax(const struct ctl_table *table, int write,
void *buffer, size_t *lenp, loff_t *ppos)
{
struct do_proc_dointvec_minmax_conv_param param = {
@@ -933,7 +933,7 @@ static int do_proc_douintvec_minmax_conv(unsigned long *lvalp,
*
* Returns 0 on success or -ERANGE on write when the range check fails.
*/
-int proc_douintvec_minmax(struct ctl_table *table, int write,
+int proc_douintvec_minmax(const struct ctl_table *table, int write,
void *buffer, size_t *lenp, loff_t *ppos)
{
struct do_proc_douintvec_minmax_conv_param param = {
@@ -961,7 +961,7 @@ int proc_douintvec_minmax(struct ctl_table *table, int write,
*
* Returns 0 on success or an error on write when the range check fails.
*/
-int proc_dou8vec_minmax(struct ctl_table *table, int write,
+int proc_dou8vec_minmax(const struct ctl_table *table, int write,
void *buffer, size_t *lenp, loff_t *ppos)
{
struct ctl_table tmp;
@@ -977,16 +977,10 @@ int proc_dou8vec_minmax(struct ctl_table *table, int write,
if (table->maxlen != sizeof(u8))
return -EINVAL;
- if (table->extra1) {
+ if (table->extra1)
min = *(unsigned int *) table->extra1;
- if (min > 255U)
- return -EINVAL;
- }
- if (table->extra2) {
+ if (table->extra2)
max = *(unsigned int *) table->extra2;
- if (max > 255U)
- return -EINVAL;
- }
tmp = *table;
@@ -1004,7 +998,7 @@ int proc_dou8vec_minmax(struct ctl_table *table, int write,
EXPORT_SYMBOL_GPL(proc_dou8vec_minmax);
#ifdef CONFIG_MAGIC_SYSRQ
-static int sysrq_sysctl_handler(struct ctl_table *table, int write,
+static int sysrq_sysctl_handler(const struct ctl_table *table, int write,
void *buffer, size_t *lenp, loff_t *ppos)
{
int tmp, ret;
@@ -1023,8 +1017,9 @@ static int sysrq_sysctl_handler(struct ctl_table *table, int write,
}
#endif
-static int __do_proc_doulongvec_minmax(void *data, struct ctl_table *table,
- int write, void *buffer, size_t *lenp, loff_t *ppos,
+static int __do_proc_doulongvec_minmax(void *data,
+ const struct ctl_table *table, int write,
+ void *buffer, size_t *lenp, loff_t *ppos,
unsigned long convmul, unsigned long convdiv)
{
unsigned long *i, *min, *max;
@@ -1096,7 +1091,7 @@ out:
return err;
}
-static int do_proc_doulongvec_minmax(struct ctl_table *table, int write,
+static int do_proc_doulongvec_minmax(const struct ctl_table *table, int write,
void *buffer, size_t *lenp, loff_t *ppos, unsigned long convmul,
unsigned long convdiv)
{
@@ -1120,7 +1115,7 @@ static int do_proc_doulongvec_minmax(struct ctl_table *table, int write,
*
* Returns 0 on success.
*/
-int proc_doulongvec_minmax(struct ctl_table *table, int write,
+int proc_doulongvec_minmax(const struct ctl_table *table, int write,
void *buffer, size_t *lenp, loff_t *ppos)
{
return do_proc_doulongvec_minmax(table, write, buffer, lenp, ppos, 1l, 1l);
@@ -1143,7 +1138,7 @@ int proc_doulongvec_minmax(struct ctl_table *table, int write,
*
* Returns 0 on success.
*/
-int proc_doulongvec_ms_jiffies_minmax(struct ctl_table *table, int write,
+int proc_doulongvec_ms_jiffies_minmax(const struct ctl_table *table, int write,
void *buffer, size_t *lenp, loff_t *ppos)
{
return do_proc_doulongvec_minmax(table, write, buffer,
@@ -1264,14 +1259,14 @@ static int do_proc_dointvec_ms_jiffies_minmax_conv(bool *negp, unsigned long *lv
*
* Returns 0 on success.
*/
-int proc_dointvec_jiffies(struct ctl_table *table, int write,
+int proc_dointvec_jiffies(const struct ctl_table *table, int write,
void *buffer, size_t *lenp, loff_t *ppos)
{
return do_proc_dointvec(table,write,buffer,lenp,ppos,
do_proc_dointvec_jiffies_conv,NULL);
}
-int proc_dointvec_ms_jiffies_minmax(struct ctl_table *table, int write,
+int proc_dointvec_ms_jiffies_minmax(const struct ctl_table *table, int write,
void *buffer, size_t *lenp, loff_t *ppos)
{
struct do_proc_dointvec_minmax_conv_param param = {
@@ -1297,7 +1292,7 @@ int proc_dointvec_ms_jiffies_minmax(struct ctl_table *table, int write,
*
* Returns 0 on success.
*/
-int proc_dointvec_userhz_jiffies(struct ctl_table *table, int write,
+int proc_dointvec_userhz_jiffies(const struct ctl_table *table, int write,
void *buffer, size_t *lenp, loff_t *ppos)
{
return do_proc_dointvec(table, write, buffer, lenp, ppos,
@@ -1320,14 +1315,14 @@ int proc_dointvec_userhz_jiffies(struct ctl_table *table, int write,
*
* Returns 0 on success.
*/
-int proc_dointvec_ms_jiffies(struct ctl_table *table, int write, void *buffer,
+int proc_dointvec_ms_jiffies(const struct ctl_table *table, int write, void *buffer,
size_t *lenp, loff_t *ppos)
{
return do_proc_dointvec(table, write, buffer, lenp, ppos,
do_proc_dointvec_ms_jiffies_conv, NULL);
}
-static int proc_do_cad_pid(struct ctl_table *table, int write, void *buffer,
+static int proc_do_cad_pid(const struct ctl_table *table, int write, void *buffer,
size_t *lenp, loff_t *ppos)
{
struct pid *new_pid;
@@ -1366,7 +1361,7 @@ static int proc_do_cad_pid(struct ctl_table *table, int write, void *buffer,
*
* Returns 0 on success.
*/
-int proc_do_large_bitmap(struct ctl_table *table, int write,
+int proc_do_large_bitmap(const struct ctl_table *table, int write,
void *buffer, size_t *lenp, loff_t *ppos)
{
int err = 0;
@@ -1498,85 +1493,85 @@ int proc_do_large_bitmap(struct ctl_table *table, int write,
#else /* CONFIG_PROC_SYSCTL */
-int proc_dostring(struct ctl_table *table, int write,
+int proc_dostring(const struct ctl_table *table, int write,
void *buffer, size_t *lenp, loff_t *ppos)
{
return -ENOSYS;
}
-int proc_dobool(struct ctl_table *table, int write,
+int proc_dobool(const struct ctl_table *table, int write,
void *buffer, size_t *lenp, loff_t *ppos)
{
return -ENOSYS;
}
-int proc_dointvec(struct ctl_table *table, int write,
+int proc_dointvec(const struct ctl_table *table, int write,
void *buffer, size_t *lenp, loff_t *ppos)
{
return -ENOSYS;
}
-int proc_douintvec(struct ctl_table *table, int write,
+int proc_douintvec(const struct ctl_table *table, int write,
void *buffer, size_t *lenp, loff_t *ppos)
{
return -ENOSYS;
}
-int proc_dointvec_minmax(struct ctl_table *table, int write,
+int proc_dointvec_minmax(const struct ctl_table *table, int write,
void *buffer, size_t *lenp, loff_t *ppos)
{
return -ENOSYS;
}
-int proc_douintvec_minmax(struct ctl_table *table, int write,
+int proc_douintvec_minmax(const struct ctl_table *table, int write,
void *buffer, size_t *lenp, loff_t *ppos)
{
return -ENOSYS;
}
-int proc_dou8vec_minmax(struct ctl_table *table, int write,
+int proc_dou8vec_minmax(const struct ctl_table *table, int write,
void *buffer, size_t *lenp, loff_t *ppos)
{
return -ENOSYS;
}
-int proc_dointvec_jiffies(struct ctl_table *table, int write,
+int proc_dointvec_jiffies(const struct ctl_table *table, int write,
void *buffer, size_t *lenp, loff_t *ppos)
{
return -ENOSYS;
}
-int proc_dointvec_ms_jiffies_minmax(struct ctl_table *table, int write,
+int proc_dointvec_ms_jiffies_minmax(const struct ctl_table *table, int write,
void *buffer, size_t *lenp, loff_t *ppos)
{
return -ENOSYS;
}
-int proc_dointvec_userhz_jiffies(struct ctl_table *table, int write,
+int proc_dointvec_userhz_jiffies(const struct ctl_table *table, int write,
void *buffer, size_t *lenp, loff_t *ppos)
{
return -ENOSYS;
}
-int proc_dointvec_ms_jiffies(struct ctl_table *table, int write,
+int proc_dointvec_ms_jiffies(const struct ctl_table *table, int write,
void *buffer, size_t *lenp, loff_t *ppos)
{
return -ENOSYS;
}
-int proc_doulongvec_minmax(struct ctl_table *table, int write,
+int proc_doulongvec_minmax(const struct ctl_table *table, int write,
void *buffer, size_t *lenp, loff_t *ppos)
{
return -ENOSYS;
}
-int proc_doulongvec_ms_jiffies_minmax(struct ctl_table *table, int write,
+int proc_doulongvec_ms_jiffies_minmax(const struct ctl_table *table, int write,
void *buffer, size_t *lenp, loff_t *ppos)
{
return -ENOSYS;
}
-int proc_do_large_bitmap(struct ctl_table *table, int write,
+int proc_do_large_bitmap(const struct ctl_table *table, int write,
void *buffer, size_t *lenp, loff_t *ppos)
{
return -ENOSYS;
@@ -1585,7 +1580,7 @@ int proc_do_large_bitmap(struct ctl_table *table, int write,
#endif /* CONFIG_PROC_SYSCTL */
#if defined(CONFIG_SYSCTL)
-int proc_do_static_key(struct ctl_table *table, int write,
+int proc_do_static_key(const struct ctl_table *table, int write,
void *buffer, size_t *lenp, loff_t *ppos)
{
struct static_key *key = (struct static_key *)table->data;
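A minimal sketch of a custom handler written against the constified prototype used throughout this file, assuming the proc_handler typedef has been converted along with these definitions; the handler name and its use of proc_dointvec_minmax() below are illustrative, not part of this patch:

#include <linux/sysctl.h>

static int example_dointvec_minmax(const struct ctl_table *table, int write,
				   void *buffer, size_t *lenp, loff_t *ppos)
{
	/*
	 * @table itself is read-only here; the value it references via
	 * ->data may still be updated by the generic helper on a write.
	 */
	return proc_dointvec_minmax(table, write, buffer, lenp, ppos);
}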
diff --git a/kernel/task_work.c b/kernel/task_work.c
index 95a7e1b7f1da..5c2daa7ad3f9 100644
--- a/kernel/task_work.c
+++ b/kernel/task_work.c
@@ -1,10 +1,18 @@
// SPDX-License-Identifier: GPL-2.0
+#include <linux/irq_work.h>
#include <linux/spinlock.h>
#include <linux/task_work.h>
#include <linux/resume_user_mode.h>
static struct callback_head work_exited; /* all we need is ->next == NULL */
+static void task_work_set_notify_irq(struct irq_work *entry)
+{
+ test_and_set_tsk_thread_flag(current, TIF_NOTIFY_RESUME);
+}
+static DEFINE_PER_CPU(struct irq_work, irq_work_NMI_resume) =
+ IRQ_WORK_INIT_HARD(task_work_set_notify_irq);
+
/**
* task_work_add - ask the @task to execute @work->func()
* @task: the task which should run the callback
@@ -12,7 +20,7 @@ static struct callback_head work_exited; /* all we need is ->next == NULL */
* @notify: how to notify the targeted task
*
* Queue @work for task_work_run() below and notify the @task if @notify
- * is @TWA_RESUME, @TWA_SIGNAL, or @TWA_SIGNAL_NO_IPI.
+ * is @TWA_RESUME, @TWA_SIGNAL, @TWA_SIGNAL_NO_IPI or @TWA_NMI_CURRENT.
*
 * @TWA_SIGNAL works like signals, in that it will interrupt the targeted
* task and run the task_work, regardless of whether the task is currently
@@ -24,6 +32,8 @@ static struct callback_head work_exited; /* all we need is ->next == NULL */
* kernel anyway.
* @TWA_RESUME work is run only when the task exits the kernel and returns to
* user mode, or before entering guest mode.
+ * @TWA_NMI_CURRENT works like @TWA_RESUME, except it can only be used for the
+ * current @task and if the current context is NMI.
*
* Fails if the @task is exiting/exited and thus it can't process this @work.
* Otherwise @work->func() will be called when the @task goes through one of
@@ -44,8 +54,13 @@ int task_work_add(struct task_struct *task, struct callback_head *work,
{
struct callback_head *head;
- /* record the work call stack in order to print it in KASAN reports */
- kasan_record_aux_stack(work);
+ if (notify == TWA_NMI_CURRENT) {
+ if (WARN_ON_ONCE(task != current))
+ return -EINVAL;
+ } else {
+ /* record the work call stack in order to print it in KASAN reports */
+ kasan_record_aux_stack(work);
+ }
head = READ_ONCE(task->task_works);
do {
@@ -66,6 +81,9 @@ int task_work_add(struct task_struct *task, struct callback_head *work,
case TWA_SIGNAL_NO_IPI:
__set_notify_signal(task);
break;
+ case TWA_NMI_CURRENT:
+ irq_work_queue(this_cpu_ptr(&irq_work_NMI_resume));
+ break;
default:
WARN_ON_ONCE(1);
break;
@@ -120,9 +138,9 @@ static bool task_work_func_match(struct callback_head *cb, void *data)
}
/**
- * task_work_cancel - cancel a pending work added by task_work_add()
- * @task: the task which should execute the work
- * @func: identifies the work to remove
+ * task_work_cancel_func - cancel a pending work matching a function added by task_work_add()
+ * @task: the task which should execute the func's work
+ * @func: identifies the func to match with a work to remove
*
* Find the last queued pending work with ->func == @func and remove
* it from queue.
@@ -131,11 +149,35 @@ static bool task_work_func_match(struct callback_head *cb, void *data)
* The found work or NULL if not found.
*/
struct callback_head *
-task_work_cancel(struct task_struct *task, task_work_func_t func)
+task_work_cancel_func(struct task_struct *task, task_work_func_t func)
{
return task_work_cancel_match(task, task_work_func_match, func);
}
+static bool task_work_match(struct callback_head *cb, void *data)
+{
+ return cb == data;
+}
+
+/**
+ * task_work_cancel - cancel a pending work added by task_work_add()
+ * @task: the task which should execute the work
+ * @cb: the callback to remove if queued
+ *
+ * Remove a callback from a task's queue if queued.
+ *
+ * RETURNS:
+ * True if the callback was queued and got cancelled, false otherwise.
+ */
+bool task_work_cancel(struct task_struct *task, struct callback_head *cb)
+{
+ struct callback_head *ret;
+
+ ret = task_work_cancel_match(task, task_work_match, cb);
+
+ return ret == cb;
+}
+
/**
* task_work_run - execute the works added by task_work_add()
*
@@ -168,7 +210,7 @@ void task_work_run(void)
if (!work)
break;
/*
- * Synchronize with task_work_cancel(). It can not remove
+ * Synchronize with task_work_cancel_match(). It can not remove
* the first entry == work, cmpxchg(task_works) must fail.
* But it can remove another entry from the ->next list.
*/
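A hedged sketch of how the two additions above might be used together: queueing a TWA_NMI_CURRENT work from an NMI handler and later cancelling it by callback pointer with the reworked task_work_cancel(), while task_work_cancel_func() keeps the old match-by-function behaviour. The handler and callback names are illustrative only, not taken from this patch:

#include <linux/printk.h>
#include <linux/sched.h>
#include <linux/task_work.h>

static void example_work_func(struct callback_head *cb)
{
	/* Runs on the return-to-user path of the task that queued it. */
}

static struct callback_head example_work;

static void example_nmi_handler(void)
{
	init_task_work(&example_work, example_work_func);
	/* TWA_NMI_CURRENT is only valid for @current and only from NMI context. */
	if (task_work_add(current, &example_work, TWA_NMI_CURRENT))
		return;		/* task is exiting, work was not queued */
}

static void example_teardown(struct task_struct *task)
{
	/* Cancel by callback pointer rather than by function. */
	if (!task_work_cancel(task, &example_work))
		pr_debug("work already ran or was never queued\n");
}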
diff --git a/kernel/time/clocksource-wdtest.c b/kernel/time/clocksource-wdtest.c
index d06185e054ea..62e73444ffe4 100644
--- a/kernel/time/clocksource-wdtest.c
+++ b/kernel/time/clocksource-wdtest.c
@@ -22,6 +22,7 @@
#include "tick-internal.h"
MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("Clocksource watchdog unit test");
MODULE_AUTHOR("Paul E. McKenney <paulmck@kernel.org>");
static int holdoff = IS_BUILTIN(CONFIG_TEST_CLOCKSOURCE_WATCHDOG) ? 10 : 0;
diff --git a/kernel/time/test_udelay.c b/kernel/time/test_udelay.c
index 20d5df631570..783f2297111b 100644
--- a/kernel/time/test_udelay.c
+++ b/kernel/time/test_udelay.c
@@ -155,5 +155,6 @@ static void __exit udelay_test_exit(void)
module_exit(udelay_test_exit);
+MODULE_DESCRIPTION("udelay test module");
MODULE_AUTHOR("David Riley <davidriley@chromium.org>");
MODULE_LICENSE("GPL");
diff --git a/kernel/time/tick-broadcast.c b/kernel/time/tick-broadcast.c
index 771d1e040303..b4843099a8da 100644
--- a/kernel/time/tick-broadcast.c
+++ b/kernel/time/tick-broadcast.c
@@ -1141,6 +1141,7 @@ void tick_broadcast_switch_to_oneshot(void)
#ifdef CONFIG_HOTPLUG_CPU
void hotplug_cpu__broadcast_tick_pull(int deadcpu)
{
+ struct tick_device *td = this_cpu_ptr(&tick_cpu_device);
struct clock_event_device *bc;
unsigned long flags;
@@ -1148,6 +1149,28 @@ void hotplug_cpu__broadcast_tick_pull(int deadcpu)
bc = tick_broadcast_device.evtdev;
if (bc && broadcast_needs_cpu(bc, deadcpu)) {
+ /*
+ * If the broadcast force bit of the current CPU is set,
+ * then the current CPU has not yet reprogrammed the local
+ * timer device to avoid a ping-pong race. See
+ * ___tick_broadcast_oneshot_control().
+ *
+ * If the broadcast device is hrtimer based then
+ * programming the broadcast event below does not have any
+ * effect because the local clockevent device is not
+ * running and not programmed because the broadcast event
+ * is not earlier than the pending event of the local clock
+ * event device. As a consequence all CPUs waiting for a
+ * broadcast event are stuck forever.
+ *
+ * Detect this condition and reprogram the cpu local timer
+ * device to avoid the starvation.
+ */
+ if (tick_check_broadcast_expired()) {
+ cpumask_clear_cpu(smp_processor_id(), tick_broadcast_force_mask);
+ tick_program_event(td->evtdev->next_event, 1);
+ }
+
/* This moves the broadcast assignment to this CPU: */
clockevents_program_event(bc, bc->next_event, 1);
}
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index 71a792cd8936..753a184c7090 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -1026,10 +1026,10 @@ static void tick_nohz_stop_tick(struct tick_sched *ts, int cpu)
if (expires == KTIME_MAX || ts->next_tick == hrtimer_get_expires(&ts->sched_timer))
return;
- WARN_ON_ONCE(1);
- printk_once("basemono: %llu ts->next_tick: %llu dev->next_event: %llu timer->active: %d timer->expires: %llu\n",
- basemono, ts->next_tick, dev->next_event,
- hrtimer_active(&ts->sched_timer), hrtimer_get_expires(&ts->sched_timer));
+ WARN_ONCE(1, "basemono: %llu ts->next_tick: %llu dev->next_event: %llu "
+ "timer->active: %d timer->expires: %llu\n", basemono, ts->next_tick,
+ dev->next_event, hrtimer_active(&ts->sched_timer),
+ hrtimer_get_expires(&ts->sched_timer));
}
/*
@@ -1385,20 +1385,6 @@ unsigned long tick_nohz_get_idle_calls_cpu(int cpu)
return ts->idle_calls;
}
-/**
- * tick_nohz_get_idle_calls - return the current idle calls counter value
- *
- * Called from the schedutil frequency scaling governor in scheduler context.
- *
- * Return: the current idle calls counter value for the current CPU
- */
-unsigned long tick_nohz_get_idle_calls(void)
-{
- struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched);
-
- return ts->idle_calls;
-}
-
static void tick_nohz_account_idle_time(struct tick_sched *ts,
ktime_t now)
{
diff --git a/kernel/time/time_test.c b/kernel/time/time_test.c
index 3e5d422dd15c..2889763165e5 100644
--- a/kernel/time/time_test.c
+++ b/kernel/time/time_test.c
@@ -96,4 +96,5 @@ static struct kunit_suite time_test_suite = {
};
kunit_test_suite(time_test_suite);
+MODULE_DESCRIPTION("time unit test suite");
MODULE_LICENSE("GPL");
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
index 4e18db1819f8..2fa87dcfeda9 100644
--- a/kernel/time/timekeeping.c
+++ b/kernel/time/timekeeping.c
@@ -1195,6 +1195,108 @@ static bool timestamp_in_interval(u64 start, u64 end, u64 ts)
return false;
}
+static bool convert_clock(u64 *val, u32 numerator, u32 denominator)
+{
+ u64 rem, res;
+
+ if (!numerator || !denominator)
+ return false;
+
+ res = div64_u64_rem(*val, denominator, &rem) * numerator;
+ *val = res + div_u64(rem * numerator, denominator);
+ return true;
+}
+
+static bool convert_base_to_cs(struct system_counterval_t *scv)
+{
+ struct clocksource *cs = tk_core.timekeeper.tkr_mono.clock;
+ struct clocksource_base *base;
+ u32 num, den;
+
+ /* The timestamp was taken from the time keeper clock source */
+ if (cs->id == scv->cs_id)
+ return true;
+
+ /*
+ * Check whether cs_id matches the base clock. Prevent the compiler from
+ * re-evaluating @base as the clocksource might change concurrently.
+ */
+ base = READ_ONCE(cs->base);
+ if (!base || base->id != scv->cs_id)
+ return false;
+
+ num = scv->use_nsecs ? cs->freq_khz : base->numerator;
+ den = scv->use_nsecs ? USEC_PER_SEC : base->denominator;
+
+ if (!convert_clock(&scv->cycles, num, den))
+ return false;
+
+ scv->cycles += base->offset;
+ return true;
+}
+
+static bool convert_cs_to_base(u64 *cycles, enum clocksource_ids base_id)
+{
+ struct clocksource *cs = tk_core.timekeeper.tkr_mono.clock;
+ struct clocksource_base *base;
+
+ /*
+ * Check whether base_id matches the base clock. Prevent the compiler from
+ * re-evaluating @base as the clocksource might change concurrently.
+ */
+ base = READ_ONCE(cs->base);
+ if (!base || base->id != base_id)
+ return false;
+
+ *cycles -= base->offset;
+ if (!convert_clock(cycles, base->denominator, base->numerator))
+ return false;
+ return true;
+}
+
+static bool convert_ns_to_cs(u64 *delta)
+{
+ struct tk_read_base *tkr = &tk_core.timekeeper.tkr_mono;
+
+ if (BITS_TO_BYTES(fls64(*delta) + tkr->shift) >= sizeof(*delta))
+ return false;
+
+ *delta = div_u64((*delta << tkr->shift) - tkr->xtime_nsec, tkr->mult);
+ return true;
+}
+
+/**
+ * ktime_real_to_base_clock() - Convert CLOCK_REALTIME timestamp to a base clock timestamp
+ * @treal: CLOCK_REALTIME timestamp to convert
+ * @base_id: base clocksource id
+ * @cycles: pointer to store the converted base clock timestamp
+ *
+ * Converts a supplied, future realtime clock value to the corresponding base clock value.
+ *
+ * Return: true if the conversion is successful, false otherwise.
+ */
+bool ktime_real_to_base_clock(ktime_t treal, enum clocksource_ids base_id, u64 *cycles)
+{
+ struct timekeeper *tk = &tk_core.timekeeper;
+ unsigned int seq;
+ u64 delta;
+
+ do {
+ seq = read_seqcount_begin(&tk_core.seq);
+ if ((u64)treal < tk->tkr_mono.base_real)
+ return false;
+ delta = (u64)treal - tk->tkr_mono.base_real;
+ if (!convert_ns_to_cs(&delta))
+ return false;
+ *cycles = tk->tkr_mono.cycle_last + delta;
+ if (!convert_cs_to_base(cycles, base_id))
+ return false;
+ } while (read_seqcount_retry(&tk_core.seq, seq));
+
+ return true;
+}
+EXPORT_SYMBOL_GPL(ktime_real_to_base_clock);
+
/**
* get_device_system_crosststamp - Synchronously capture system/device timestamp
* @get_time_fn: Callback to get simultaneous device time and
@@ -1241,7 +1343,7 @@ int get_device_system_crosststamp(int (*get_time_fn)
* installed timekeeper clocksource
*/
if (system_counterval.cs_id == CSID_GENERIC ||
- tk->tkr_mono.clock->id != system_counterval.cs_id)
+ !convert_base_to_cs(&system_counterval))
return -ENODEV;
cycles = system_counterval.cycles;
@@ -1307,6 +1409,30 @@ int get_device_system_crosststamp(int (*get_time_fn)
EXPORT_SYMBOL_GPL(get_device_system_crosststamp);
/**
+ * timekeeping_clocksource_has_base - Check whether the current clocksource
+ * is based on a given base clock
+ * @id: base clocksource ID
+ *
+ * Note: The return value is a snapshot which can become invalid right
+ * after the function returns.
+ *
+ * Return: true if the timekeeper clocksource has a base clock with @id,
+ * false otherwise
+ */
+bool timekeeping_clocksource_has_base(enum clocksource_ids id)
+{
+ /*
+ * This is a snapshot, so no point in using the sequence
+ * count. Just prevent the compiler from re-evaluating @base as the
+ * clocksource might change concurrently.
+ */
+ struct clocksource_base *base = READ_ONCE(tk_core.timekeeper.tkr_mono.clock->base);
+
+ return base ? base->id == id : false;
+}
+EXPORT_SYMBOL_GPL(timekeeping_clocksource_has_base);
+
+/**
* do_settimeofday64 - Sets the time of day.
* @ts: pointer to the timespec64 variable containing the new time
*
@@ -2421,6 +2547,7 @@ EXPORT_SYMBOL_GPL(random_get_entropy_fallback);
/**
* do_adjtimex() - Accessor function to NTP __do_adjtimex function
+ * @txc: Pointer to kernel_timex structure containing NTP parameters
*/
int do_adjtimex(struct __kernel_timex *txc)
{
@@ -2489,6 +2616,8 @@ int do_adjtimex(struct __kernel_timex *txc)
#ifdef CONFIG_NTP_PPS
/**
* hardpps() - Accessor function to NTP __hardpps function
+ * @phase_ts: Pointer to timespec64 structure representing phase timestamp
+ * @raw_ts: Pointer to timespec64 structure representing raw timestamp
*/
void hardpps(const struct timespec64 *phase_ts, const struct timespec64 *raw_ts)
{
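convert_clock() above scales a counter value as (val / den) * num + ((val % den) * num) / den, which keeps the intermediate product from overflowing 64 bits for large cycle counts. A hedged sketch of how a driver might consume the two new exports; the function is hypothetical and CSID_X86_ART is just one plausible base clocksource id:

#include <linux/clocksource_ids.h>
#include <linux/timekeeping.h>

static int example_arm_hw_timer(ktime_t expiry_real, u64 *art_cycles)
{
	/* Bail out early if the current clocksource has no ART base. */
	if (!timekeeping_clocksource_has_base(CSID_X86_ART))
		return -EOPNOTSUPP;

	/* Convert a future CLOCK_REALTIME expiry to base clock cycles. */
	if (!ktime_real_to_base_clock(expiry_real, CSID_X86_ART, art_cycles))
		return -ERANGE;	/* expiry in the past or conversion failed */

	/* The caller would program *art_cycles into the device here. */
	return 0;
}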
diff --git a/kernel/time/timer.c b/kernel/time/timer.c
index 48288dd4a102..64b0d8a0aa0f 100644
--- a/kernel/time/timer.c
+++ b/kernel/time/timer.c
@@ -289,7 +289,7 @@ static void timers_update_migration(void)
}
#ifdef CONFIG_SYSCTL
-static int timer_migration_handler(struct ctl_table *table, int write,
+static int timer_migration_handler(const struct ctl_table *table, int write,
void *buffer, size_t *lenp, loff_t *ppos)
{
int ret;
diff --git a/kernel/torture.c b/kernel/torture.c
index c72ab2d251f4..dede150aef01 100644
--- a/kernel/torture.c
+++ b/kernel/torture.c
@@ -40,6 +40,7 @@
#include <linux/sched/rt.h>
#include "rcu/rcu.h"
+MODULE_DESCRIPTION("Common functions for in-kernel torture tests");
MODULE_LICENSE("GPL");
MODULE_AUTHOR("Paul E. McKenney <paulmck@linux.ibm.com>");
diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
index d1daeab1bbc1..cd098846e251 100644
--- a/kernel/trace/bpf_trace.c
+++ b/kernel/trace/bpf_trace.c
@@ -1369,8 +1369,8 @@ __bpf_kfunc void bpf_key_put(struct bpf_key *bkey)
#ifdef CONFIG_SYSTEM_DATA_VERIFICATION
/**
* bpf_verify_pkcs7_signature - verify a PKCS#7 signature
- * @data_ptr: data to verify
- * @sig_ptr: signature of the data
+ * @data_p: data to verify
+ * @sig_p: signature of the data
* @trusted_keyring: keyring with keys trusted for signature verification
*
* Verify the PKCS#7 signature *sig_ptr* against the supplied *data_ptr*
@@ -1378,10 +1378,12 @@ __bpf_kfunc void bpf_key_put(struct bpf_key *bkey)
*
* Return: 0 on success, a negative value on error.
*/
-__bpf_kfunc int bpf_verify_pkcs7_signature(struct bpf_dynptr_kern *data_ptr,
- struct bpf_dynptr_kern *sig_ptr,
+__bpf_kfunc int bpf_verify_pkcs7_signature(struct bpf_dynptr *data_p,
+ struct bpf_dynptr *sig_p,
struct bpf_key *trusted_keyring)
{
+ struct bpf_dynptr_kern *data_ptr = (struct bpf_dynptr_kern *)data_p;
+ struct bpf_dynptr_kern *sig_ptr = (struct bpf_dynptr_kern *)sig_p;
const void *data, *sig;
u32 data_len, sig_len;
int ret;
@@ -1444,7 +1446,7 @@ __bpf_kfunc_start_defs();
* bpf_get_file_xattr - get xattr of a file
* @file: file to get xattr from
* @name__str: name of the xattr
- * @value_ptr: output buffer of the xattr value
+ * @value_p: output buffer of the xattr value
*
* Get xattr *name__str* of *file* and store the output in *value_ptr*.
*
@@ -1453,8 +1455,9 @@ __bpf_kfunc_start_defs();
* Return: 0 on success, a negative value on error.
*/
__bpf_kfunc int bpf_get_file_xattr(struct file *file, const char *name__str,
- struct bpf_dynptr_kern *value_ptr)
+ struct bpf_dynptr *value_p)
{
+ struct bpf_dynptr_kern *value_ptr = (struct bpf_dynptr_kern *)value_p;
struct dentry *dentry;
u32 value_len;
void *value;
diff --git a/kernel/trace/fgraph.c b/kernel/trace/fgraph.c
index a130b2d898f7..fc205ad167a9 100644
--- a/kernel/trace/fgraph.c
+++ b/kernel/trace/fgraph.c
@@ -7,9 +7,11 @@
*
* Highly modified by Steven Rostedt (VMware).
*/
+#include <linux/bits.h>
#include <linux/jump_label.h>
#include <linux/suspend.h>
#include <linux/ftrace.h>
+#include <linux/static_call.h>
#include <linux/slab.h>
#include <trace/events/sched.h>
@@ -17,17 +19,447 @@
#include "ftrace_internal.h"
#include "trace.h"
-#ifdef CONFIG_DYNAMIC_FTRACE
-#define ASSIGN_OPS_HASH(opsname, val) \
- .func_hash = val, \
- .local_hash.regex_lock = __MUTEX_INITIALIZER(opsname.local_hash.regex_lock),
-#else
-#define ASSIGN_OPS_HASH(opsname, val)
-#endif
+/*
+ * FGRAPH_FRAME_SIZE: Size in bytes of the meta data on the shadow stack
+ * FGRAPH_FRAME_OFFSET: Size in long words of the meta data frame
+ */
+#define FGRAPH_FRAME_SIZE sizeof(struct ftrace_ret_stack)
+#define FGRAPH_FRAME_OFFSET DIV_ROUND_UP(FGRAPH_FRAME_SIZE, sizeof(long))
+
+/*
+ * On entry to a function (via function_graph_enter()), a new fgraph frame
+ * (ftrace_ret_stack) is pushed onto the stack as well as a word that
+ * holds a bitmask and a type (called "bitmap"). The bitmap is defined as:
+ *
+ * bits: 0 - 9 offset in words from the previous ftrace_ret_stack
+ *
+ * bits: 10 - 11 Type of storage
+ * 0 - reserved
+ * 1 - bitmap of fgraph_array index
+ * 2 - reserved data
+ *
+ * For type with "bitmap of fgraph_array index" (FGRAPH_TYPE_BITMAP):
+ * bits: 12 - 27 The bitmap of fgraph_ops fgraph_array index
+ * That is, it's a bitmask of 0-15 (16 bits)
+ * where, if a corresponding ops in the fgraph_array[]
+ * expects a callback on the return of the function,
+ * its corresponding bit will be set.
+ *
+ *
+ * The top of the ret_stack (when not empty) will always have a reference
+ * word that points to the last fgraph frame that was saved.
+ *
+ * For reserved data:
+ * bits: 12 - 17 The size in words that is stored
+ * bits: 18 - 23 The index of fgraph_array, which shows who is stored
+ *
+ * That is, at the end of function_graph_enter, if the first and fourth
+ * fgraph_ops on the fgraph_array[] (index 0 and 3) need their retfunc called
+ * on the return of the function being traced, and the fourth fgraph_ops
+ * stored two words of data, this is what will be on the task's shadow
+ * ret_stack: (the stack grows upward)
+ *
+ * ret_stack[SHADOW_STACK_OFFSET]
+ * | SHADOW_STACK_TASK_VARS(ret_stack)[15] |
+ * ...
+ * | SHADOW_STACK_TASK_VARS(ret_stack)[0] |
+ * ret_stack[SHADOW_STACK_MAX_OFFSET]
+ * ...
+ * | | <- task->curr_ret_stack
+ * +--------------------------------------------+
+ * | (3 << 12) | (3 << 10) | FGRAPH_FRAME_OFFSET|
+ * | *or put another way* |
+ * | (3 << FGRAPH_DATA_INDEX_SHIFT)| \ | This is for fgraph_ops[3].
+ * | ((2 - 1) << FGRAPH_DATA_SHIFT)| \ | The data size is 2 words.
+ * | (FGRAPH_TYPE_DATA << FGRAPH_TYPE_SHIFT)| \ |
+ * | (offset2:FGRAPH_FRAME_OFFSET+3) | <- the offset2 is from here
+ * +--------------------------------------------+ ( It is 4 words from the ret_stack)
+ * | STORED DATA WORD 2 |
+ * | STORED DATA WORD 1 |
+ * +--------------------------------------------+
+ * | (9 << 12) | (1 << 10) | FGRAPH_FRAME_OFFSET|
+ * | *or put another way* |
+ * | (BIT(3)|BIT(0)) << FGRAPH_INDEX_SHIFT | \ |
+ * | FGRAPH_TYPE_BITMAP << FGRAPH_TYPE_SHIFT| \ |
+ * | (offset1:FGRAPH_FRAME_OFFSET) | <- the offset1 is from here
+ * +--------------------------------------------+
+ * | struct ftrace_ret_stack |
+ * | (stores the saved ret pointer) | <- the offset points here
+ * +--------------------------------------------+
+ * | (X) | (N) | ( N words away from
+ * | | previous ret_stack)
+ * ...
+ * ret_stack[0]
+ *
+ * If a backtrace is required and the real return pointer needs to be
+ * fetched, the unwinder looks at the task's curr_ret_stack offset; if it
+ * is greater than zero (reserved, or right before popped), the value is
+ * masked by FGRAPH_FRAME_OFFSET_MASK to get the offset of the
+ * ftrace_ret_stack structure stored on the shadow stack.
+ */
+
+/*
+ * The following is for the top word on the stack:
+ *
+ * FGRAPH_FRAME_OFFSET (0-9) holds the offset delta to the fgraph frame
+ * FGRAPH_TYPE (10-11) holds the type of word this is.
+ * (RESERVED or BITMAP)
+ */
+#define FGRAPH_FRAME_OFFSET_BITS 10
+#define FGRAPH_FRAME_OFFSET_MASK GENMASK(FGRAPH_FRAME_OFFSET_BITS - 1, 0)
+
+#define FGRAPH_TYPE_BITS 2
+#define FGRAPH_TYPE_MASK GENMASK(FGRAPH_TYPE_BITS - 1, 0)
+#define FGRAPH_TYPE_SHIFT FGRAPH_FRAME_OFFSET_BITS
+
+enum {
+ FGRAPH_TYPE_RESERVED = 0,
+ FGRAPH_TYPE_BITMAP = 1,
+ FGRAPH_TYPE_DATA = 2,
+};
+
+/*
+ * For BITMAP type:
+ * FGRAPH_INDEX (12-27) bits hold the bitmap of gops indexes that want the return callback called
+ */
+#define FGRAPH_INDEX_BITS 16
+#define FGRAPH_INDEX_MASK GENMASK(FGRAPH_INDEX_BITS - 1, 0)
+#define FGRAPH_INDEX_SHIFT (FGRAPH_TYPE_SHIFT + FGRAPH_TYPE_BITS)
+
+/*
+ * For DATA type:
+ * FGRAPH_DATA (12-17) bits hold the size of data (in words)
+ * FGRAPH_INDEX (18-23) bits hold the index for which gops->idx the data is for
+ *
+ * Note:
+ * data_size == 0 means 1 word, and 31 (=2^5 - 1) means 32 words.
+ */
+#define FGRAPH_DATA_BITS 5
+#define FGRAPH_DATA_MASK GENMASK(FGRAPH_DATA_BITS - 1, 0)
+#define FGRAPH_DATA_SHIFT (FGRAPH_TYPE_SHIFT + FGRAPH_TYPE_BITS)
+#define FGRAPH_MAX_DATA_SIZE (sizeof(long) * (1 << FGRAPH_DATA_BITS))
+
+#define FGRAPH_DATA_INDEX_BITS 4
+#define FGRAPH_DATA_INDEX_MASK GENMASK(FGRAPH_DATA_INDEX_BITS - 1, 0)
+#define FGRAPH_DATA_INDEX_SHIFT (FGRAPH_DATA_SHIFT + FGRAPH_DATA_BITS)
+
+#define FGRAPH_MAX_INDEX \
+ ((FGRAPH_INDEX_SIZE << FGRAPH_DATA_BITS) + FGRAPH_RET_INDEX)
+
+#define FGRAPH_ARRAY_SIZE FGRAPH_INDEX_BITS
+
+/*
+ * SHADOW_STACK_SIZE: The size in bytes of the entire shadow stack
+ * SHADOW_STACK_OFFSET: The size in long words of the shadow stack
+ * SHADOW_STACK_MAX_OFFSET: The max offset of the stack for a new frame to be added
+ */
+#define SHADOW_STACK_SIZE (PAGE_SIZE)
+#define SHADOW_STACK_OFFSET (SHADOW_STACK_SIZE / sizeof(long))
+/* Leave on a buffer at the end */
+#define SHADOW_STACK_MAX_OFFSET \
+ (SHADOW_STACK_OFFSET - (FGRAPH_FRAME_OFFSET + 1 + FGRAPH_ARRAY_SIZE))
+
+/* RET_STACK(): Return the frame from a given @offset from task @t */
+#define RET_STACK(t, offset) ((struct ftrace_ret_stack *)(&(t)->ret_stack[offset]))
+
+/*
+ * Each fgraph_ops has a reserved unsigned long at the end (top) of the
+ * ret_stack to store task specific state.
+ */
+#define SHADOW_STACK_TASK_VARS(ret_stack) \
+ ((unsigned long *)(&(ret_stack)[SHADOW_STACK_OFFSET - FGRAPH_ARRAY_SIZE]))
DEFINE_STATIC_KEY_FALSE(kill_ftrace_graph);
int ftrace_graph_active;
+static struct fgraph_ops *fgraph_array[FGRAPH_ARRAY_SIZE];
+static unsigned long fgraph_array_bitmask;
+
+/* LRU index table for fgraph_array */
+static int fgraph_lru_table[FGRAPH_ARRAY_SIZE];
+static int fgraph_lru_next;
+static int fgraph_lru_last;
+
+/* Initialize fgraph_lru_table with unused index */
+static void fgraph_lru_init(void)
+{
+ int i;
+
+ for (i = 0; i < FGRAPH_ARRAY_SIZE; i++)
+ fgraph_lru_table[i] = i;
+}
+
+/* Release the used index to the LRU table */
+static int fgraph_lru_release_index(int idx)
+{
+ if (idx < 0 || idx >= FGRAPH_ARRAY_SIZE ||
+ WARN_ON_ONCE(fgraph_lru_table[fgraph_lru_last] != -1))
+ return -1;
+
+ fgraph_lru_table[fgraph_lru_last] = idx;
+ fgraph_lru_last = (fgraph_lru_last + 1) % FGRAPH_ARRAY_SIZE;
+
+ clear_bit(idx, &fgraph_array_bitmask);
+ return 0;
+}
+
+/* Allocate a new index from LRU table */
+static int fgraph_lru_alloc_index(void)
+{
+ int idx = fgraph_lru_table[fgraph_lru_next];
+
+ /* No id is available */
+ if (idx == -1)
+ return -1;
+
+ fgraph_lru_table[fgraph_lru_next] = -1;
+ fgraph_lru_next = (fgraph_lru_next + 1) % FGRAPH_ARRAY_SIZE;
+
+ set_bit(idx, &fgraph_array_bitmask);
+ return idx;
+}
+
+/* Get the offset to the fgraph frame from a ret_stack value */
+static inline int __get_offset(unsigned long val)
+{
+ return val & FGRAPH_FRAME_OFFSET_MASK;
+}
+
+/* Get the type of word from a ret_stack value */
+static inline int __get_type(unsigned long val)
+{
+ return (val >> FGRAPH_TYPE_SHIFT) & FGRAPH_TYPE_MASK;
+}
+
+/* Get the data_index for a DATA type ret_stack word */
+static inline int __get_data_index(unsigned long val)
+{
+ return (val >> FGRAPH_DATA_INDEX_SHIFT) & FGRAPH_DATA_INDEX_MASK;
+}
+
+/* Get the data_size for a DATA type ret_stack word */
+static inline int __get_data_size(unsigned long val)
+{
+ return ((val >> FGRAPH_DATA_SHIFT) & FGRAPH_DATA_MASK) + 1;
+}
+
+/* Get the word from the ret_stack at @offset */
+static inline unsigned long get_fgraph_entry(struct task_struct *t, int offset)
+{
+ return t->ret_stack[offset];
+}
+
+/* Get the FRAME_OFFSET from the word from the @offset on ret_stack */
+static inline int get_frame_offset(struct task_struct *t, int offset)
+{
+ return __get_offset(t->ret_stack[offset]);
+}
+
+/* For BITMAP type: get the bitmask from the @offset at ret_stack */
+static inline unsigned long
+get_bitmap_bits(struct task_struct *t, int offset)
+{
+ return (t->ret_stack[offset] >> FGRAPH_INDEX_SHIFT) & FGRAPH_INDEX_MASK;
+}
+
+/* Write the bitmap to the ret_stack at @offset (does index, offset and bitmask) */
+static inline void
+set_bitmap(struct task_struct *t, int offset, unsigned long bitmap)
+{
+ t->ret_stack[offset] = (bitmap << FGRAPH_INDEX_SHIFT) |
+ (FGRAPH_TYPE_BITMAP << FGRAPH_TYPE_SHIFT) | FGRAPH_FRAME_OFFSET;
+}
+
+/* For DATA type: get the data saved under the ret_stack word at @offset */
+static inline void *get_data_type_data(struct task_struct *t, int offset)
+{
+ unsigned long val = t->ret_stack[offset];
+
+ if (__get_type(val) != FGRAPH_TYPE_DATA)
+ return NULL;
+ offset -= __get_data_size(val);
+ return (void *)&t->ret_stack[offset];
+}
+
+/* Create the ret_stack word for a DATA type */
+static inline unsigned long make_data_type_val(int idx, int size, int offset)
+{
+ return (idx << FGRAPH_DATA_INDEX_SHIFT) |
+ ((size - 1) << FGRAPH_DATA_SHIFT) |
+ (FGRAPH_TYPE_DATA << FGRAPH_TYPE_SHIFT) | offset;
+}
+
+/* ftrace_graph_entry set to this to tell some archs to run function graph */
+static int entry_run(struct ftrace_graph_ent *trace, struct fgraph_ops *ops)
+{
+ return 0;
+}
+
+/* ftrace_graph_return set to this to tell some archs to run function graph */
+static void return_run(struct ftrace_graph_ret *trace, struct fgraph_ops *ops)
+{
+}
+
+static void ret_stack_set_task_var(struct task_struct *t, int idx, long val)
+{
+ unsigned long *gvals = SHADOW_STACK_TASK_VARS(t->ret_stack);
+
+ gvals[idx] = val;
+}
+
+static unsigned long *
+ret_stack_get_task_var(struct task_struct *t, int idx)
+{
+ unsigned long *gvals = SHADOW_STACK_TASK_VARS(t->ret_stack);
+
+ return &gvals[idx];
+}
+
+static void ret_stack_init_task_vars(unsigned long *ret_stack)
+{
+ unsigned long *gvals = SHADOW_STACK_TASK_VARS(ret_stack);
+
+ memset(gvals, 0, sizeof(*gvals) * FGRAPH_ARRAY_SIZE);
+}
+
+/**
+ * fgraph_reserve_data - Reserve storage on the task's ret_stack
+ * @idx: The index of fgraph_array
+ * @size_bytes: The size in bytes to reserve
+ *
+ * Reserves space of up to FGRAPH_MAX_DATA_SIZE bytes on the
+ * task's ret_stack shadow stack, for a given fgraph_ops during
+ * the entryfunc() call. If entryfunc() returns zero, the storage
+ * is discarded. An entryfunc() can only call this once per iteration.
+ * The fgraph_ops retfunc() can retrieve this stored data with
+ * fgraph_retrieve_data().
+ *
+ * Returns: On success, a pointer to the data on the stack.
+ * Otherwise, NULL if there's not enough space left on the
+ * ret_stack for the data, or if fgraph_reserve_data() was called
+ * more than once for a single entryfunc() call.
+ */
+void *fgraph_reserve_data(int idx, int size_bytes)
+{
+ unsigned long val;
+ void *data;
+ int curr_ret_stack = current->curr_ret_stack;
+ int data_size;
+
+ if (size_bytes > FGRAPH_MAX_DATA_SIZE)
+ return NULL;
+
+ /* Convert the data size to number of longs. */
+ data_size = (size_bytes + sizeof(long) - 1) >> (sizeof(long) == 4 ? 2 : 3);
+
+ val = get_fgraph_entry(current, curr_ret_stack - 1);
+ data = &current->ret_stack[curr_ret_stack];
+
+ curr_ret_stack += data_size + 1;
+ if (unlikely(curr_ret_stack >= SHADOW_STACK_MAX_OFFSET))
+ return NULL;
+
+ val = make_data_type_val(idx, data_size, __get_offset(val) + data_size + 1);
+
+ /* Set the last word to be reserved */
+ current->ret_stack[curr_ret_stack - 1] = val;
+
+ /* Make sure interrupts see this */
+ barrier();
+ current->curr_ret_stack = curr_ret_stack;
+ /* Again sync with interrupts, and reset reserve */
+ current->ret_stack[curr_ret_stack - 1] = val;
+
+ return data;
+}
+
+/**
+ * fgraph_retrieve_data - Retrieve stored data from fgraph_reserve_data()
+ * @idx: the index of fgraph_array (fgraph_ops::idx)
+ * @size_bytes: pointer to retrieved data size.
+ *
+ * This is to be called by a fgraph_ops retfunc(), to retrieve data that
+ * was stored by the fgraph_ops entryfunc() on the function entry.
+ * That is, this will retrieve the data that was reserved on the
+ * entry of the function that corresponds to the exit of the function
+ * that the fgraph_ops retfunc() is called on.
+ *
+ * Returns: The stored data from fgraph_reserve_data() called by the
+ * matching entryfunc() for the retfunc() this is called from.
+ * Or NULL if there was nothing stored.
+ */
+void *fgraph_retrieve_data(int idx, int *size_bytes)
+{
+ int offset = current->curr_ret_stack - 1;
+ unsigned long val;
+
+ val = get_fgraph_entry(current, offset);
+ while (__get_type(val) == FGRAPH_TYPE_DATA) {
+ if (__get_data_index(val) == idx)
+ goto found;
+ offset -= __get_data_size(val) + 1;
+ val = get_fgraph_entry(current, offset);
+ }
+ return NULL;
+found:
+ if (size_bytes)
+ *size_bytes = __get_data_size(val) * sizeof(long);
+ return get_data_type_data(current, offset);
+}
+
+/**
+ * fgraph_get_task_var - retrieve a task specific state variable
+ * @gops: The fgraph_ops that owns the task specific variable
+ *
+ * Every registered fgraph_ops has a task state variable
+ * reserved on the task's ret_stack. This function returns the
+ * address to that variable.
+ *
+ * Returns the address of the fgraph_ops @gops task specific
+ * unsigned long variable.
+ */
+unsigned long *fgraph_get_task_var(struct fgraph_ops *gops)
+{
+ return ret_stack_get_task_var(current, gops->idx);
+}
+
+/*
+ * @offset: The offset into @t->ret_stack to find the ret_stack entry
+ * @frame_offset: Where to place the offset into @t->ret_stack of that entry
+ *
+ * Returns a pointer to the previous ret_stack below @offset or NULL
+ * when it reaches the bottom of the stack.
+ *
+ * Calling this with:
+ *
+ * offset = task->curr_ret_stack;
+ * do {
+ * ret_stack = get_ret_stack(task, offset, &offset);
+ * } while (ret_stack);
+ *
+ * Will iterate through all the ret_stack entries from curr_ret_stack
+ * down to the first one.
+ */
+static inline struct ftrace_ret_stack *
+get_ret_stack(struct task_struct *t, int offset, int *frame_offset)
+{
+ int offs;
+
+ BUILD_BUG_ON(FGRAPH_FRAME_SIZE % sizeof(long));
+
+ if (unlikely(offset <= 0))
+ return NULL;
+
+ offs = get_frame_offset(t, --offset);
+ if (WARN_ON_ONCE(offs <= 0 || offs > offset))
+ return NULL;
+
+ offset -= offs;
+
+ *frame_offset = offset;
+ return RET_STACK(t, offset);
+}
+
/* Both enabled by default (can be cleared by function_graph tracer flags) */
static bool fgraph_sleep_time = true;
@@ -51,6 +483,27 @@ int __weak ftrace_disable_ftrace_graph_caller(void)
}
#endif
+int ftrace_graph_entry_stub(struct ftrace_graph_ent *trace,
+ struct fgraph_ops *gops)
+{
+ return 0;
+}
+
+static void ftrace_graph_ret_stub(struct ftrace_graph_ret *trace,
+ struct fgraph_ops *gops)
+{
+}
+
+static struct fgraph_ops fgraph_stub = {
+ .entryfunc = ftrace_graph_entry_stub,
+ .retfunc = ftrace_graph_ret_stub,
+};
+
+static struct fgraph_ops *fgraph_direct_gops = &fgraph_stub;
+DEFINE_STATIC_CALL(fgraph_func, ftrace_graph_entry_stub);
+DEFINE_STATIC_CALL(fgraph_retfunc, ftrace_graph_ret_stub);
+static DEFINE_STATIC_KEY_TRUE(fgraph_do_direct);
+
/**
* ftrace_graph_stop - set to permanently disable function graph tracing
*
@@ -67,10 +520,13 @@ void ftrace_graph_stop(void)
/* Add a function return address to the trace stack on thread info.*/
static int
ftrace_push_return_trace(unsigned long ret, unsigned long func,
- unsigned long frame_pointer, unsigned long *retp)
+ unsigned long frame_pointer, unsigned long *retp,
+ int fgraph_idx)
{
+ struct ftrace_ret_stack *ret_stack;
unsigned long long calltime;
- int index;
+ unsigned long val;
+ int offset;
if (unlikely(ftrace_graph_is_dead()))
return -EBUSY;
@@ -78,32 +534,67 @@ ftrace_push_return_trace(unsigned long ret, unsigned long func,
if (!current->ret_stack)
return -EBUSY;
+ BUILD_BUG_ON(SHADOW_STACK_SIZE % sizeof(long));
+
+ /* Set val to "reserved" with the delta to the new fgraph frame */
+ val = (FGRAPH_TYPE_RESERVED << FGRAPH_TYPE_SHIFT) | FGRAPH_FRAME_OFFSET;
+
/*
* We must make sure the ret_stack is tested before we read
* anything else.
*/
smp_rmb();
- /* The return trace stack is full */
- if (current->curr_ret_stack == FTRACE_RETFUNC_DEPTH - 1) {
+ /*
+ * Check if there's room on the shadow stack to fit an fgraph frame
+ * and a bitmap word.
+ */
+ if (current->curr_ret_stack + FGRAPH_FRAME_OFFSET + 1 >= SHADOW_STACK_MAX_OFFSET) {
atomic_inc(&current->trace_overrun);
return -EBUSY;
}
calltime = trace_clock_local();
- index = ++current->curr_ret_stack;
+ offset = READ_ONCE(current->curr_ret_stack);
+ ret_stack = RET_STACK(current, offset);
+ offset += FGRAPH_FRAME_OFFSET;
+
+ /* ret offset = FGRAPH_FRAME_OFFSET ; type = reserved */
+ current->ret_stack[offset] = val;
+ ret_stack->ret = ret;
+ /*
+ * The unwinders expect curr_ret_stack to point to either zero
+ * or an offset where the next ret_stack can be found. Even though the
+ * ret stack might be bogus, we want to write the ret and the
+ * offset to find the ret_stack before we increment the stack pointer.
+ * If an interrupt comes in now before we increment the curr_ret_stack
+ * it may blow away what we wrote. But that's fine, because the
+ * offset will still be correct (even though the 'ret' won't be).
+ * What we worry about is the offset being correct after we increment
+ * the curr_ret_stack and before we update that offset, because if an
+ * interrupt comes in and does an unwind stack dump, it will need
+ * at least a correct offset!
+ */
+ barrier();
+ WRITE_ONCE(current->curr_ret_stack, offset + 1);
+ /*
+ * This next barrier is to ensure that an interrupt coming in
+ * will not corrupt what we are about to write.
+ */
barrier();
- current->ret_stack[index].ret = ret;
- current->ret_stack[index].func = func;
- current->ret_stack[index].calltime = calltime;
+
+ /* Still keep it reserved even if an interrupt came in */
+ current->ret_stack[offset] = val;
+
+ ret_stack->ret = ret;
+ ret_stack->func = func;
+ ret_stack->calltime = calltime;
#ifdef HAVE_FUNCTION_GRAPH_FP_TEST
- current->ret_stack[index].fp = frame_pointer;
-#endif
-#ifdef HAVE_FUNCTION_GRAPH_RET_ADDR_PTR
- current->ret_stack[index].retp = retp;
+ ret_stack->fp = frame_pointer;
#endif
- return 0;
+ ret_stack->retp = retp;
+ return offset;
}
/*
@@ -120,44 +611,85 @@ ftrace_push_return_trace(unsigned long ret, unsigned long func,
# define MCOUNT_INSN_SIZE 0
#endif
+/* If the caller does not use ftrace, call this function. */
int function_graph_enter(unsigned long ret, unsigned long func,
unsigned long frame_pointer, unsigned long *retp)
{
struct ftrace_graph_ent trace;
+ unsigned long bitmap = 0;
+ int offset;
+ int i;
trace.func = func;
trace.depth = ++current->curr_ret_depth;
- if (ftrace_push_return_trace(ret, func, frame_pointer, retp))
+ offset = ftrace_push_return_trace(ret, func, frame_pointer, retp, 0);
+ if (offset < 0)
goto out;
- /* Only trace if the calling function expects to */
- if (!ftrace_graph_entry(&trace))
+#ifdef CONFIG_HAVE_STATIC_CALL
+ if (static_branch_likely(&fgraph_do_direct)) {
+ int save_curr_ret_stack = current->curr_ret_stack;
+
+ if (static_call(fgraph_func)(&trace, fgraph_direct_gops))
+ bitmap |= BIT(fgraph_direct_gops->idx);
+ else
+ /* Clear out any saved storage */
+ current->curr_ret_stack = save_curr_ret_stack;
+ } else
+#endif
+ {
+ for_each_set_bit(i, &fgraph_array_bitmask,
+ sizeof(fgraph_array_bitmask) * BITS_PER_BYTE) {
+ struct fgraph_ops *gops = READ_ONCE(fgraph_array[i]);
+ int save_curr_ret_stack;
+
+ if (gops == &fgraph_stub)
+ continue;
+
+ save_curr_ret_stack = current->curr_ret_stack;
+ if (ftrace_ops_test(&gops->ops, func, NULL) &&
+ gops->entryfunc(&trace, gops))
+ bitmap |= BIT(i);
+ else
+ /* Clear out any saved storage */
+ current->curr_ret_stack = save_curr_ret_stack;
+ }
+ }
+
+ if (!bitmap)
goto out_ret;
+ /*
+ * Since this function uses fgraph_idx = 0 as a tail-call checking
+ * flag, set that bit always.
+ */
+ set_bitmap(current, offset, bitmap | BIT(0));
+
return 0;
out_ret:
- current->curr_ret_stack--;
+ current->curr_ret_stack -= FGRAPH_FRAME_OFFSET + 1;
out:
current->curr_ret_depth--;
return -EBUSY;
}
/* Retrieve a function return address to the trace stack on thread info.*/
-static void
+static struct ftrace_ret_stack *
ftrace_pop_return_trace(struct ftrace_graph_ret *trace, unsigned long *ret,
- unsigned long frame_pointer)
+ unsigned long frame_pointer, int *offset)
{
- int index;
+ struct ftrace_ret_stack *ret_stack;
- index = current->curr_ret_stack;
+ ret_stack = get_ret_stack(current, current->curr_ret_stack, offset);
- if (unlikely(index < 0 || index >= FTRACE_RETFUNC_DEPTH)) {
+ if (unlikely(!ret_stack)) {
ftrace_graph_stop();
- WARN_ON(1);
+ WARN(1, "Bad function graph ret_stack pointer: %d",
+ current->curr_ret_stack);
/* Might as well panic, otherwise we have no where to go */
*ret = (unsigned long)panic;
- return;
+ return NULL;
}
#ifdef HAVE_FUNCTION_GRAPH_FP_TEST
@@ -175,30 +707,33 @@ ftrace_pop_return_trace(struct ftrace_graph_ret *trace, unsigned long *ret,
* Note, -mfentry does not use frame pointers, and this test
* is not needed if CC_USING_FENTRY is set.
*/
- if (unlikely(current->ret_stack[index].fp != frame_pointer)) {
+ if (unlikely(ret_stack->fp != frame_pointer)) {
ftrace_graph_stop();
WARN(1, "Bad frame pointer: expected %lx, received %lx\n"
" from func %ps return to %lx\n",
- current->ret_stack[index].fp,
+ ret_stack->fp,
frame_pointer,
- (void *)current->ret_stack[index].func,
- current->ret_stack[index].ret);
+ (void *)ret_stack->func,
+ ret_stack->ret);
*ret = (unsigned long)panic;
- return;
+ return NULL;
}
#endif
- *ret = current->ret_stack[index].ret;
- trace->func = current->ret_stack[index].func;
- trace->calltime = current->ret_stack[index].calltime;
+ *offset += FGRAPH_FRAME_OFFSET;
+ *ret = ret_stack->ret;
+ trace->func = ret_stack->func;
+ trace->calltime = ret_stack->calltime;
trace->overrun = atomic_read(&current->trace_overrun);
- trace->depth = current->curr_ret_depth--;
+ trace->depth = current->curr_ret_depth;
/*
* We still want to trace interrupts coming in if
* max_depth is set to 1. Make sure the decrement is
* seen before ftrace_graph_return.
*/
barrier();
+
+ return ret_stack;
}
/*
@@ -236,30 +771,55 @@ struct fgraph_ret_regs;
static unsigned long __ftrace_return_to_handler(struct fgraph_ret_regs *ret_regs,
unsigned long frame_pointer)
{
+ struct ftrace_ret_stack *ret_stack;
struct ftrace_graph_ret trace;
+ unsigned long bitmap;
unsigned long ret;
+ int offset;
+ int i;
+
+ ret_stack = ftrace_pop_return_trace(&trace, &ret, frame_pointer, &offset);
+
+ if (unlikely(!ret_stack)) {
+ ftrace_graph_stop();
+ WARN_ON(1);
+ /* Might as well panic. What else to do? */
+ return (unsigned long)panic;
+ }
- ftrace_pop_return_trace(&trace, &ret, frame_pointer);
+ trace.rettime = trace_clock_local();
#ifdef CONFIG_FUNCTION_GRAPH_RETVAL
trace.retval = fgraph_ret_regs_return_value(ret_regs);
#endif
- trace.rettime = trace_clock_local();
- ftrace_graph_return(&trace);
+
+ bitmap = get_bitmap_bits(current, offset);
+
+#ifdef CONFIG_HAVE_STATIC_CALL
+ if (static_branch_likely(&fgraph_do_direct)) {
+ if (test_bit(fgraph_direct_gops->idx, &bitmap))
+ static_call(fgraph_retfunc)(&trace, fgraph_direct_gops);
+ } else
+#endif
+ {
+ for_each_set_bit(i, &bitmap, sizeof(bitmap) * BITS_PER_BYTE) {
+ struct fgraph_ops *gops = fgraph_array[i];
+
+ if (gops == &fgraph_stub)
+ continue;
+
+ gops->retfunc(&trace, gops);
+ }
+ }
+
/*
* The ftrace_graph_return() may still access the current
* ret_stack structure, we need to make sure the update of
* curr_ret_stack is after that.
*/
barrier();
- current->curr_ret_stack--;
-
- if (unlikely(!ret)) {
- ftrace_graph_stop();
- WARN_ON(1);
- /* Might as well panic. What else to do? */
- ret = (unsigned long)panic;
- }
+ current->curr_ret_stack = offset - FGRAPH_FRAME_OFFSET;
+ current->curr_ret_depth--;
return ret;
}
@@ -282,7 +842,7 @@ unsigned long ftrace_return_to_handler(unsigned long frame_pointer)
/**
* ftrace_graph_get_ret_stack - return the entry of the shadow stack
- * @task: The task to read the shadow stack from
+ * @task: The task to read the shadow stack from.
* @idx: Index down the shadow stack
*
* Return the ret_struct on the shadow stack of the @task at the
@@ -294,104 +854,116 @@ unsigned long ftrace_return_to_handler(unsigned long frame_pointer)
struct ftrace_ret_stack *
ftrace_graph_get_ret_stack(struct task_struct *task, int idx)
{
- idx = task->curr_ret_stack - idx;
+ struct ftrace_ret_stack *ret_stack = NULL;
+ int offset = task->curr_ret_stack;
- if (idx >= 0 && idx <= task->curr_ret_stack)
- return &task->ret_stack[idx];
+ if (offset < 0)
+ return NULL;
- return NULL;
+ do {
+ ret_stack = get_ret_stack(task, offset, &offset);
+ } while (ret_stack && --idx >= 0);
+
+ return ret_stack;
}
/**
- * ftrace_graph_ret_addr - convert a potentially modified stack return address
- * to its original value
+ * ftrace_graph_ret_addr - return the original value of the return address
+ * @task: The task the unwinder is being executed on
+ * @idx: An initialized pointer to the next stack index to use
+ * @ret: The current return address (likely pointing to return_handler)
+ * @retp: The address on the stack of the current return location
*
* This function can be called by stack unwinding code to convert a found stack
- * return address ('ret') to its original value, in case the function graph
+ * return address (@ret) to its original value, in case the function graph
* tracer has modified it to be 'return_to_handler'. If the address hasn't
- * been modified, the unchanged value of 'ret' is returned.
+ * been modified, the unchanged value of @ret is returned.
*
- * 'idx' is a state variable which should be initialized by the caller to zero
- * before the first call.
+ * @idx holds the last index used to know where to start from. It should be
+ * initialized to zero for the first iteration, which means it will start
+ * at the top of the shadow stack. If the location is found, this pointer
+ * will be assigned that location so that if called again, it will continue
+ * where it left off.
*
- * 'retp' is a pointer to the return address on the stack. It's ignored if
- * the arch doesn't have HAVE_FUNCTION_GRAPH_RET_ADDR_PTR defined.
+ * @retp is a pointer to the return address on the stack.
*/
-#ifdef HAVE_FUNCTION_GRAPH_RET_ADDR_PTR
unsigned long ftrace_graph_ret_addr(struct task_struct *task, int *idx,
unsigned long ret, unsigned long *retp)
{
- int index = task->curr_ret_stack;
- int i;
+ struct ftrace_ret_stack *ret_stack;
+ unsigned long return_handler = (unsigned long)dereference_kernel_function_descriptor(return_to_handler);
+ int i = task->curr_ret_stack;
- if (ret != (unsigned long)dereference_kernel_function_descriptor(return_to_handler))
+ if (ret != return_handler)
return ret;
- if (index < 0)
+ if (!idx)
return ret;
- for (i = 0; i <= index; i++)
- if (task->ret_stack[i].retp == retp)
- return task->ret_stack[i].ret;
+ i = *idx ? : task->curr_ret_stack;
+ while (i > 0) {
+ ret_stack = get_ret_stack(current, i, &i);
+ if (!ret_stack)
+ break;
+ /*
+ * For a tail-call, there would be 2 or more ftrace_ret_stacks on
+ * the ret_stack, all of which record "return_to_handler" as the return
+ * address except for the last one.
+ * But on the real stack, there should be 1 entry because a tail-call
+ * reuses the return address on the stack and jumps to the next function.
+ * Thus we will continue searching for the real return address.
+ */
+ if (ret_stack->retp == retp &&
+ ret_stack->ret != return_handler) {
+ *idx = i;
+ return ret_stack->ret;
+ }
+ }
return ret;
}
-#else /* !HAVE_FUNCTION_GRAPH_RET_ADDR_PTR */
-unsigned long ftrace_graph_ret_addr(struct task_struct *task, int *idx,
- unsigned long ret, unsigned long *retp)
-{
- int task_idx;
-
- if (ret != (unsigned long)dereference_kernel_function_descriptor(return_to_handler))
- return ret;
-
- task_idx = task->curr_ret_stack;
-
- if (!task->ret_stack || task_idx < *idx)
- return ret;
-
- task_idx -= *idx;
- (*idx)++;
-
- return task->ret_stack[task_idx].ret;
-}
-#endif /* HAVE_FUNCTION_GRAPH_RET_ADDR_PTR */
static struct ftrace_ops graph_ops = {
.func = ftrace_graph_func,
- .flags = FTRACE_OPS_FL_INITIALIZED |
- FTRACE_OPS_FL_PID |
- FTRACE_OPS_GRAPH_STUB,
+ .flags = FTRACE_OPS_GRAPH_STUB,
#ifdef FTRACE_GRAPH_TRAMP_ADDR
.trampoline = FTRACE_GRAPH_TRAMP_ADDR,
/* trampoline_size is only needed for dynamically allocated tramps */
#endif
- ASSIGN_OPS_HASH(graph_ops, &global_ops.local_hash)
};
-void ftrace_graph_sleep_time_control(bool enable)
+void fgraph_init_ops(struct ftrace_ops *dst_ops,
+ struct ftrace_ops *src_ops)
{
- fgraph_sleep_time = enable;
+ dst_ops->flags = FTRACE_OPS_FL_PID | FTRACE_OPS_GRAPH_STUB;
+
+#ifdef CONFIG_DYNAMIC_FTRACE
+ if (src_ops) {
+ dst_ops->func_hash = &src_ops->local_hash;
+ mutex_init(&dst_ops->local_hash.regex_lock);
+ INIT_LIST_HEAD(&dst_ops->subop_list);
+ dst_ops->flags |= FTRACE_OPS_FL_INITIALIZED;
+ }
+#endif
}
-int ftrace_graph_entry_stub(struct ftrace_graph_ent *trace)
+void ftrace_graph_sleep_time_control(bool enable)
{
- return 0;
+ fgraph_sleep_time = enable;
}
/*
* Simply points to ftrace_stub, but with the proper protocol.
* Defined by the linker script in linux/vmlinux.lds.h
*/
-extern void ftrace_stub_graph(struct ftrace_graph_ret *);
+void ftrace_stub_graph(struct ftrace_graph_ret *trace, struct fgraph_ops *gops);
/* The callbacks that hook a function */
trace_func_graph_ret_t ftrace_graph_return = ftrace_stub_graph;
trace_func_graph_ent_t ftrace_graph_entry = ftrace_graph_entry_stub;
-static trace_func_graph_ent_t __ftrace_graph_entry = ftrace_graph_entry_stub;
/* Try to assign a return stack array on FTRACE_RETSTACK_ALLOC_SIZE tasks. */
-static int alloc_retstack_tasklist(struct ftrace_ret_stack **ret_stack_list)
+static int alloc_retstack_tasklist(unsigned long **ret_stack_list)
{
int i;
int ret = 0;
@@ -399,10 +971,7 @@ static int alloc_retstack_tasklist(struct ftrace_ret_stack **ret_stack_list)
struct task_struct *g, *t;
for (i = 0; i < FTRACE_RETSTACK_ALLOC_SIZE; i++) {
- ret_stack_list[i] =
- kmalloc_array(FTRACE_RETFUNC_DEPTH,
- sizeof(struct ftrace_ret_stack),
- GFP_KERNEL);
+ ret_stack_list[i] = kmalloc(SHADOW_STACK_SIZE, GFP_KERNEL);
if (!ret_stack_list[i]) {
start = 0;
end = i;
@@ -420,9 +989,10 @@ static int alloc_retstack_tasklist(struct ftrace_ret_stack **ret_stack_list)
if (t->ret_stack == NULL) {
atomic_set(&t->trace_overrun, 0);
- t->curr_ret_stack = -1;
+ ret_stack_init_task_vars(ret_stack_list[start]);
+ t->curr_ret_stack = 0;
t->curr_ret_depth = -1;
- /* Make sure the tasks see the -1 first: */
+ /* Make sure the tasks see the 0 first: */
smp_wmb();
t->ret_stack = ret_stack_list[start++];
}
@@ -442,8 +1012,9 @@ ftrace_graph_probe_sched_switch(void *ignore, bool preempt,
struct task_struct *next,
unsigned int prev_state)
{
+ struct ftrace_ret_stack *ret_stack;
unsigned long long timestamp;
- int index;
+ int offset;
/*
* Does the user want to count the time a function was asleep.
@@ -466,57 +1037,23 @@ ftrace_graph_probe_sched_switch(void *ignore, bool preempt,
*/
timestamp -= next->ftrace_timestamp;
- for (index = next->curr_ret_stack; index >= 0; index--)
- next->ret_stack[index].calltime += timestamp;
-}
-
-static int ftrace_graph_entry_test(struct ftrace_graph_ent *trace)
-{
- if (!ftrace_ops_test(&global_ops, trace->func, NULL))
- return 0;
- return __ftrace_graph_entry(trace);
-}
-
-/*
- * The function graph tracer should only trace the functions defined
- * by set_ftrace_filter and set_ftrace_notrace. If another function
- * tracer ops is registered, the graph tracer requires testing the
- * function against the global ops, and not just trace any function
- * that any ftrace_ops registered.
- */
-void update_function_graph_func(void)
-{
- struct ftrace_ops *op;
- bool do_test = false;
-
- /*
- * The graph and global ops share the same set of functions
- * to test. If any other ops is on the list, then
- * the graph tracing needs to test if its the function
- * it should call.
- */
- do_for_each_ftrace_op(op, ftrace_ops_list) {
- if (op != &global_ops && op != &graph_ops &&
- op != &ftrace_list_end) {
- do_test = true;
- /* in double loop, break out with goto */
- goto out;
- }
- } while_for_each_ftrace_op(op);
- out:
- if (do_test)
- ftrace_graph_entry = ftrace_graph_entry_test;
- else
- ftrace_graph_entry = __ftrace_graph_entry;
+ for (offset = next->curr_ret_stack; offset > 0; ) {
+ ret_stack = get_ret_stack(next, offset, &offset);
+ if (ret_stack)
+ ret_stack->calltime += timestamp;
+ }
}
-static DEFINE_PER_CPU(struct ftrace_ret_stack *, idle_ret_stack);
+static DEFINE_PER_CPU(unsigned long *, idle_ret_stack);
static void
-graph_init_task(struct task_struct *t, struct ftrace_ret_stack *ret_stack)
+graph_init_task(struct task_struct *t, unsigned long *ret_stack)
{
atomic_set(&t->trace_overrun, 0);
+ ret_stack_init_task_vars(ret_stack);
t->ftrace_timestamp = 0;
+ t->curr_ret_stack = 0;
+ t->curr_ret_depth = -1;
/* make curr_ret_stack visible before we add the ret_stack */
smp_wmb();
t->ret_stack = ret_stack;
@@ -528,7 +1065,7 @@ graph_init_task(struct task_struct *t, struct ftrace_ret_stack *ret_stack)
*/
void ftrace_graph_init_idle_task(struct task_struct *t, int cpu)
{
- t->curr_ret_stack = -1;
+ t->curr_ret_stack = 0;
t->curr_ret_depth = -1;
/*
* The idle task has no parent, it either has its own
@@ -538,14 +1075,11 @@ void ftrace_graph_init_idle_task(struct task_struct *t, int cpu)
WARN_ON(t->ret_stack != per_cpu(idle_ret_stack, cpu));
if (ftrace_graph_active) {
- struct ftrace_ret_stack *ret_stack;
+ unsigned long *ret_stack;
ret_stack = per_cpu(idle_ret_stack, cpu);
if (!ret_stack) {
- ret_stack =
- kmalloc_array(FTRACE_RETFUNC_DEPTH,
- sizeof(struct ftrace_ret_stack),
- GFP_KERNEL);
+ ret_stack = kmalloc(SHADOW_STACK_SIZE, GFP_KERNEL);
if (!ret_stack)
return;
per_cpu(idle_ret_stack, cpu) = ret_stack;
@@ -559,15 +1093,13 @@ void ftrace_graph_init_task(struct task_struct *t)
{
/* Make sure we do not use the parent ret_stack */
t->ret_stack = NULL;
- t->curr_ret_stack = -1;
+ t->curr_ret_stack = 0;
t->curr_ret_depth = -1;
if (ftrace_graph_active) {
- struct ftrace_ret_stack *ret_stack;
+ unsigned long *ret_stack;
- ret_stack = kmalloc_array(FTRACE_RETFUNC_DEPTH,
- sizeof(struct ftrace_ret_stack),
- GFP_KERNEL);
+ ret_stack = kmalloc(SHADOW_STACK_SIZE, GFP_KERNEL);
if (!ret_stack)
return;
graph_init_task(t, ret_stack);
@@ -576,7 +1108,7 @@ void ftrace_graph_init_task(struct task_struct *t)
void ftrace_graph_exit_task(struct task_struct *t)
{
- struct ftrace_ret_stack *ret_stack = t->ret_stack;
+ unsigned long *ret_stack = t->ret_stack;
t->ret_stack = NULL;
/* NULL must become visible to IRQs before we free it: */
@@ -585,15 +1117,52 @@ void ftrace_graph_exit_task(struct task_struct *t)
kfree(ret_stack);
}
+#ifdef CONFIG_DYNAMIC_FTRACE
+static int fgraph_pid_func(struct ftrace_graph_ent *trace,
+ struct fgraph_ops *gops)
+{
+ struct trace_array *tr = gops->ops.private;
+ int pid;
+
+ if (tr) {
+ pid = this_cpu_read(tr->array_buffer.data->ftrace_ignore_pid);
+ if (pid == FTRACE_PID_IGNORE)
+ return 0;
+ if (pid != FTRACE_PID_TRACE &&
+ pid != current->pid)
+ return 0;
+ }
+
+ return gops->saved_func(trace, gops);
+}
+
+void fgraph_update_pid_func(void)
+{
+ struct fgraph_ops *gops;
+ struct ftrace_ops *op;
+
+ if (!(graph_ops.flags & FTRACE_OPS_FL_INITIALIZED))
+ return;
+
+ list_for_each_entry(op, &graph_ops.subop_list, list) {
+ if (op->flags & FTRACE_OPS_FL_PID) {
+ gops = container_of(op, struct fgraph_ops, ops);
+ gops->entryfunc = ftrace_pids_enabled(op) ?
+ fgraph_pid_func : gops->saved_func;
+ if (ftrace_graph_active == 1)
+ static_call_update(fgraph_func, gops->entryfunc);
+ }
+ }
+}
+#endif
+
/* Allocate a return stack for each task */
static int start_graph_tracing(void)
{
- struct ftrace_ret_stack **ret_stack_list;
+ unsigned long **ret_stack_list;
int ret, cpu;
- ret_stack_list = kmalloc_array(FTRACE_RETSTACK_ALLOC_SIZE,
- sizeof(struct ftrace_ret_stack *),
- GFP_KERNEL);
+ ret_stack_list = kmalloc(SHADOW_STACK_SIZE, GFP_KERNEL);
if (!ret_stack_list)
return -ENOMEM;
@@ -619,40 +1188,111 @@ static int start_graph_tracing(void)
return ret;
}
+static void init_task_vars(int idx)
+{
+ struct task_struct *g, *t;
+ int cpu;
+
+ for_each_online_cpu(cpu) {
+ if (idle_task(cpu)->ret_stack)
+ ret_stack_set_task_var(idle_task(cpu), idx, 0);
+ }
+
+ read_lock(&tasklist_lock);
+ for_each_process_thread(g, t) {
+ if (t->ret_stack)
+ ret_stack_set_task_var(t, idx, 0);
+ }
+ read_unlock(&tasklist_lock);
+}
+
+static void ftrace_graph_enable_direct(bool enable_branch)
+{
+ trace_func_graph_ent_t func = NULL;
+ trace_func_graph_ret_t retfunc = NULL;
+ int i;
+
+ for_each_set_bit(i, &fgraph_array_bitmask,
+ sizeof(fgraph_array_bitmask) * BITS_PER_BYTE) {
+ func = fgraph_array[i]->entryfunc;
+ retfunc = fgraph_array[i]->retfunc;
+ fgraph_direct_gops = fgraph_array[i];
+ }
+ if (WARN_ON_ONCE(!func))
+ return;
+
+ static_call_update(fgraph_func, func);
+ static_call_update(fgraph_retfunc, retfunc);
+ if (enable_branch)
+ static_branch_disable(&fgraph_do_direct);
+}
+
+static void ftrace_graph_disable_direct(bool disable_branch)
+{
+ if (disable_branch)
+ static_branch_disable(&fgraph_do_direct);
+ static_call_update(fgraph_func, ftrace_graph_entry_stub);
+ static_call_update(fgraph_retfunc, ftrace_graph_ret_stub);
+ fgraph_direct_gops = &fgraph_stub;
+}
+
int register_ftrace_graph(struct fgraph_ops *gops)
{
+ int command = 0;
int ret = 0;
+ int i = -1;
mutex_lock(&ftrace_lock);
- /* we currently allow only one tracer registered at a time */
- if (ftrace_graph_active) {
- ret = -EBUSY;
+ if (!fgraph_array[0]) {
+ /* The array must always have real data on it */
+ for (i = 0; i < FGRAPH_ARRAY_SIZE; i++)
+ fgraph_array[i] = &fgraph_stub;
+ fgraph_lru_init();
+ }
+
+ i = fgraph_lru_alloc_index();
+ if (i < 0 || WARN_ON_ONCE(fgraph_array[i] != &fgraph_stub)) {
+ ret = -ENOSPC;
goto out;
}
- register_pm_notifier(&ftrace_suspend_notifier);
+ fgraph_array[i] = gops;
+ gops->idx = i;
ftrace_graph_active++;
- ret = start_graph_tracing();
- if (ret) {
- ftrace_graph_active--;
- goto out;
- }
- ftrace_graph_return = gops->retfunc;
+ if (ftrace_graph_active == 2)
+ ftrace_graph_disable_direct(true);
- /*
- * Update the indirect function to the entryfunc, and the
- * function that gets called to the entry_test first. Then
- * call the update fgraph entry function to determine if
- * the entryfunc should be called directly or not.
- */
- __ftrace_graph_entry = gops->entryfunc;
- ftrace_graph_entry = ftrace_graph_entry_test;
- update_function_graph_func();
+ if (ftrace_graph_active == 1) {
+ ftrace_graph_enable_direct(false);
+ register_pm_notifier(&ftrace_suspend_notifier);
+ ret = start_graph_tracing();
+ if (ret)
+ goto error;
+ /*
+ * Some archs just test to see if these are not
+ * the default function
+ */
+ ftrace_graph_return = return_run;
+ ftrace_graph_entry = entry_run;
+ command = FTRACE_START_FUNC_RET;
+ } else {
+ init_task_vars(gops->idx);
+ }
+
+ /* Always save the function, and reset at unregistering */
+ gops->saved_func = gops->entryfunc;
- ret = ftrace_startup(&graph_ops, FTRACE_START_FUNC_RET);
+ ret = ftrace_startup_subops(&graph_ops, &gops->ops, command);
+error:
+ if (ret) {
+ fgraph_array[i] = &fgraph_stub;
+ ftrace_graph_active--;
+ gops->saved_func = NULL;
+ fgraph_lru_release_index(i);
+ }
out:
mutex_unlock(&ftrace_lock);
return ret;
@@ -660,19 +1300,41 @@ out:
void unregister_ftrace_graph(struct fgraph_ops *gops)
{
+ int command = 0;
+
mutex_lock(&ftrace_lock);
if (unlikely(!ftrace_graph_active))
goto out;
+ if (unlikely(gops->idx < 0 || gops->idx >= FGRAPH_ARRAY_SIZE ||
+ fgraph_array[gops->idx] != gops))
+ goto out;
+
+ if (fgraph_lru_release_index(gops->idx) < 0)
+ goto out;
+
+ fgraph_array[gops->idx] = &fgraph_stub;
+
ftrace_graph_active--;
- ftrace_graph_return = ftrace_stub_graph;
- ftrace_graph_entry = ftrace_graph_entry_stub;
- __ftrace_graph_entry = ftrace_graph_entry_stub;
- ftrace_shutdown(&graph_ops, FTRACE_STOP_FUNC_RET);
- unregister_pm_notifier(&ftrace_suspend_notifier);
- unregister_trace_sched_switch(ftrace_graph_probe_sched_switch, NULL);
+ if (!ftrace_graph_active)
+ command = FTRACE_STOP_FUNC_RET;
+
+ ftrace_shutdown_subops(&graph_ops, &gops->ops, command);
+
+ if (ftrace_graph_active == 1)
+ ftrace_graph_enable_direct(true);
+ else if (!ftrace_graph_active)
+ ftrace_graph_disable_direct(false);
+
+ if (!ftrace_graph_active) {
+ ftrace_graph_return = ftrace_stub_graph;
+ ftrace_graph_entry = ftrace_graph_entry_stub;
+ unregister_pm_notifier(&ftrace_suspend_notifier);
+ unregister_trace_sched_switch(ftrace_graph_probe_sched_switch, NULL);
+ }
out:
+ gops->saved_func = NULL;
mutex_unlock(&ftrace_lock);
}
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index eacab4020508..4c28dd177ca6 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -74,7 +74,8 @@
#ifdef CONFIG_DYNAMIC_FTRACE
#define INIT_OPS_HASH(opsname) \
.func_hash = &opsname.local_hash, \
- .local_hash.regex_lock = __MUTEX_INITIALIZER(opsname.local_hash.regex_lock),
+ .local_hash.regex_lock = __MUTEX_INITIALIZER(opsname.local_hash.regex_lock), \
+ .subop_list = LIST_HEAD_INIT(opsname.subop_list),
#else
#define INIT_OPS_HASH(opsname)
#endif
@@ -99,7 +100,7 @@ struct ftrace_ops *function_trace_op __read_mostly = &ftrace_list_end;
/* What to set function_trace_op to */
static struct ftrace_ops *set_function_trace_op;
-static bool ftrace_pids_enabled(struct ftrace_ops *ops)
+bool ftrace_pids_enabled(struct ftrace_ops *ops)
{
struct trace_array *tr;
@@ -121,7 +122,7 @@ static int ftrace_disabled __read_mostly;
DEFINE_MUTEX(ftrace_lock);
-struct ftrace_ops __rcu *ftrace_ops_list __read_mostly = &ftrace_list_end;
+struct ftrace_ops __rcu *ftrace_ops_list __read_mostly = (struct ftrace_ops __rcu *)&ftrace_list_end;
ftrace_func_t ftrace_trace_function __read_mostly = ftrace_stub;
struct ftrace_ops global_ops;
@@ -161,12 +162,14 @@ static inline void ftrace_ops_init(struct ftrace_ops *ops)
#ifdef CONFIG_DYNAMIC_FTRACE
if (!(ops->flags & FTRACE_OPS_FL_INITIALIZED)) {
mutex_init(&ops->local_hash.regex_lock);
+ INIT_LIST_HEAD(&ops->subop_list);
ops->func_hash = &ops->local_hash;
ops->flags |= FTRACE_OPS_FL_INITIALIZED;
}
#endif
}
+/* Call this function when a callback filters on set_ftrace_pid */
static void ftrace_pid_func(unsigned long ip, unsigned long parent_ip,
struct ftrace_ops *op, struct ftrace_regs *fregs)
{
@@ -235,8 +238,6 @@ static void update_ftrace_function(void)
func = ftrace_ops_list_func;
}
- update_function_graph_func();
-
/* If there's no change, then do nothing more here */
if (ftrace_trace_function == func)
return;
@@ -310,7 +311,7 @@ static int remove_ftrace_ops(struct ftrace_ops __rcu **list,
lockdep_is_held(&ftrace_lock)) == ops &&
rcu_dereference_protected(ops->next,
lockdep_is_held(&ftrace_lock)) == &ftrace_list_end) {
- *list = &ftrace_list_end;
+ rcu_assign_pointer(*list, &ftrace_list_end);
return 0;
}
@@ -406,6 +407,8 @@ static void ftrace_update_pid_func(void)
}
} while_for_each_ftrace_op(op);
+ fgraph_update_pid_func();
+
update_ftrace_function();
}
@@ -817,7 +820,8 @@ void ftrace_graph_graph_time_control(bool enable)
fgraph_graph_time = enable;
}
-static int profile_graph_entry(struct ftrace_graph_ent *trace)
+static int profile_graph_entry(struct ftrace_graph_ent *trace,
+ struct fgraph_ops *gops)
{
struct ftrace_ret_stack *ret_stack;
@@ -834,7 +838,8 @@ static int profile_graph_entry(struct ftrace_graph_ent *trace)
return 1;
}
-static void profile_graph_return(struct ftrace_graph_ret *trace)
+static void profile_graph_return(struct ftrace_graph_ret *trace,
+ struct fgraph_ops *gops)
{
struct ftrace_ret_stack *ret_stack;
struct ftrace_profile_stat *stat;
@@ -1314,7 +1319,7 @@ static struct ftrace_hash *alloc_ftrace_hash(int size_bits)
return hash;
}
-
+/* Used to save filters on functions for modules not loaded yet */
static int ftrace_add_mod(struct trace_array *tr,
const char *func, const char *module,
int enable)
@@ -1380,15 +1385,17 @@ alloc_and_copy_ftrace_hash(int size_bits, struct ftrace_hash *hash)
return NULL;
}
-static void
-ftrace_hash_rec_disable_modify(struct ftrace_ops *ops, int filter_hash);
-static void
-ftrace_hash_rec_enable_modify(struct ftrace_ops *ops, int filter_hash);
+static void ftrace_hash_rec_disable_modify(struct ftrace_ops *ops);
+static void ftrace_hash_rec_enable_modify(struct ftrace_ops *ops);
static int ftrace_hash_ipmodify_update(struct ftrace_ops *ops,
struct ftrace_hash *new_hash);
-static struct ftrace_hash *dup_hash(struct ftrace_hash *src, int size)
+/*
+ * Allocate a new hash and remove entries from @src and move them to the new hash.
+ * On success, the @src hash will be empty and should be freed.
+ */
+static struct ftrace_hash *__move_hash(struct ftrace_hash *src, int size)
{
struct ftrace_func_entry *entry;
struct ftrace_hash *new_hash;
@@ -1424,6 +1431,7 @@ static struct ftrace_hash *dup_hash(struct ftrace_hash *src, int size)
return new_hash;
}
+/* Move the @src entries to a newly allocated hash */
static struct ftrace_hash *
__ftrace_hash_move(struct ftrace_hash *src)
{
@@ -1435,9 +1443,29 @@ __ftrace_hash_move(struct ftrace_hash *src)
if (ftrace_hash_empty(src))
return EMPTY_HASH;
- return dup_hash(src, size);
+ return __move_hash(src, size);
}
+/**
+ * ftrace_hash_move - move a new hash to a filter and do updates
+ * @ops: The ops with the hash that @dst points to
+ * @enable: True if for the filter hash, false for the notrace hash
+ * @dst: Points to the @ops hash that should be updated
+ * @src: The hash to update @dst with
+ *
+ * This is called when an ftrace_ops hash is being updated and the
+ * kernel needs to reflect this. Note, this only updates the kernel
+ * function callbacks if the @ops is enabled (not to be confused with
+ * @enable above). If the @ops is enabled, its hash determines what
+ * callbacks get called. This function gets called when the @ops hash
+ * is updated and it requires new callbacks.
+ *
+ * On success the elements of @src are moved to @dst, @dst is updated
+ * properly, and the functions determined by the @ops hashes
+ * now call the @ops callback function.
+ *
+ * Regardless of the return value, @src should be freed with free_ftrace_hash().
+ */
static int
ftrace_hash_move(struct ftrace_ops *ops, int enable,
struct ftrace_hash **dst, struct ftrace_hash *src)
@@ -1467,11 +1495,11 @@ ftrace_hash_move(struct ftrace_ops *ops, int enable,
* Remove the current set, update the hash and add
* them back.
*/
- ftrace_hash_rec_disable_modify(ops, enable);
+ ftrace_hash_rec_disable_modify(ops);
rcu_assign_pointer(*dst, new_hash);
- ftrace_hash_rec_enable_modify(ops, enable);
+ ftrace_hash_rec_enable_modify(ops);
return 0;
}
@@ -1694,12 +1722,21 @@ static bool skip_record(struct dyn_ftrace *rec)
!(rec->flags & FTRACE_FL_ENABLED);
}
+/*
+ * This is the main engine to the ftrace updates to the dyn_ftrace records.
+ *
+ * It will iterate through all the available ftrace functions
+ * (the ones that ftrace can have callbacks to) and set the flags
+ * in the associated dyn_ftrace records.
+ *
+ * @inc: If true, the functions associated with @ops are added to
+ * the dyn_ftrace records, otherwise they are removed.
+ */
static bool __ftrace_hash_rec_update(struct ftrace_ops *ops,
- int filter_hash,
bool inc)
{
struct ftrace_hash *hash;
- struct ftrace_hash *other_hash;
+ struct ftrace_hash *notrace_hash;
struct ftrace_page *pg;
struct dyn_ftrace *rec;
bool update = false;
@@ -1711,35 +1748,16 @@ static bool __ftrace_hash_rec_update(struct ftrace_ops *ops,
return false;
/*
- * In the filter_hash case:
* If the count is zero, we update all records.
* Otherwise we just update the items in the hash.
- *
- * In the notrace_hash case:
- * We enable the update in the hash.
- * As disabling notrace means enabling the tracing,
- * and enabling notrace means disabling, the inc variable
- * gets inversed.
*/
- if (filter_hash) {
- hash = ops->func_hash->filter_hash;
- other_hash = ops->func_hash->notrace_hash;
- if (ftrace_hash_empty(hash))
- all = true;
- } else {
- inc = !inc;
- hash = ops->func_hash->notrace_hash;
- other_hash = ops->func_hash->filter_hash;
- /*
- * If the notrace hash has no items,
- * then there's nothing to do.
- */
- if (ftrace_hash_empty(hash))
- return false;
- }
+ hash = ops->func_hash->filter_hash;
+ notrace_hash = ops->func_hash->notrace_hash;
+ if (ftrace_hash_empty(hash))
+ all = true;
do_for_each_ftrace_rec(pg, rec) {
- int in_other_hash = 0;
+ int in_notrace_hash = 0;
int in_hash = 0;
int match = 0;
@@ -1751,26 +1769,17 @@ static bool __ftrace_hash_rec_update(struct ftrace_ops *ops,
* Only the filter_hash affects all records.
* Update if the record is not in the notrace hash.
*/
- if (!other_hash || !ftrace_lookup_ip(other_hash, rec->ip))
+ if (!notrace_hash || !ftrace_lookup_ip(notrace_hash, rec->ip))
match = 1;
} else {
in_hash = !!ftrace_lookup_ip(hash, rec->ip);
- in_other_hash = !!ftrace_lookup_ip(other_hash, rec->ip);
+ in_notrace_hash = !!ftrace_lookup_ip(notrace_hash, rec->ip);
/*
- * If filter_hash is set, we want to match all functions
- * that are in the hash but not in the other hash.
- *
- * If filter_hash is not set, then we are decrementing.
- * That means we match anything that is in the hash
- * and also in the other_hash. That is, we need to turn
- * off functions in the other hash because they are disabled
- * by this hash.
+ * We want to match all functions that are in the hash but
+ * not in the other hash.
*/
- if (filter_hash && in_hash && !in_other_hash)
- match = 1;
- else if (!filter_hash && in_hash &&
- (in_other_hash || ftrace_hash_empty(other_hash)))
+ if (in_hash && !in_notrace_hash)
match = 1;
}
if (!match)
@@ -1876,24 +1885,48 @@ static bool __ftrace_hash_rec_update(struct ftrace_ops *ops,
return update;
}
-static bool ftrace_hash_rec_disable(struct ftrace_ops *ops,
- int filter_hash)
+/*
+ * This is called when an ops is removed from tracing. It will decrement
+ * the counters of the dyn_ftrace records for all the functions that
+ * the @ops is attached to.
+ */
+static bool ftrace_hash_rec_disable(struct ftrace_ops *ops)
{
- return __ftrace_hash_rec_update(ops, filter_hash, 0);
+ return __ftrace_hash_rec_update(ops, false);
}
-static bool ftrace_hash_rec_enable(struct ftrace_ops *ops,
- int filter_hash)
+/*
+ * This is called when an ops is added to tracing. It will increment
+ * the counters of the dyn_ftrace records for all the functions that
+ * the @ops is attached to.
+ */
+static bool ftrace_hash_rec_enable(struct ftrace_ops *ops)
{
- return __ftrace_hash_rec_update(ops, filter_hash, 1);
+ return __ftrace_hash_rec_update(ops, true);
}
-static void ftrace_hash_rec_update_modify(struct ftrace_ops *ops,
- int filter_hash, int inc)
+/*
+ * This function will update what functions @ops traces when its filter
+ * changes.
+ *
+ * The @inc states if the @ops callbacks are going to be added or removed.
+ * When one of the @ops hashes is updated to a "new_hash" the dyn_ftrace
+ * records are updated via:
+ *
+ * ftrace_hash_rec_disable_modify(ops);
+ * ops->hash = new_hash
+ * ftrace_hash_rec_enable_modify(ops);
+ *
+ * Where the @ops is removed from all the records it is tracing using
+ * its old hash. The @ops hash is updated to the new hash, and then
+ * the @ops is added back to the records so that it is tracing all
+ * the new functions.
+ */
+static void ftrace_hash_rec_update_modify(struct ftrace_ops *ops, bool inc)
{
struct ftrace_ops *op;
- __ftrace_hash_rec_update(ops, filter_hash, inc);
+ __ftrace_hash_rec_update(ops, inc);
if (ops->func_hash != &global_ops.local_hash)
return;
@@ -1907,20 +1940,18 @@ static void ftrace_hash_rec_update_modify(struct ftrace_ops *ops,
if (op == ops)
continue;
if (op->func_hash == &global_ops.local_hash)
- __ftrace_hash_rec_update(op, filter_hash, inc);
+ __ftrace_hash_rec_update(op, inc);
} while_for_each_ftrace_op(op);
}
-static void ftrace_hash_rec_disable_modify(struct ftrace_ops *ops,
- int filter_hash)
+static void ftrace_hash_rec_disable_modify(struct ftrace_ops *ops)
{
- ftrace_hash_rec_update_modify(ops, filter_hash, 0);
+ ftrace_hash_rec_update_modify(ops, false);
}
-static void ftrace_hash_rec_enable_modify(struct ftrace_ops *ops,
- int filter_hash)
+static void ftrace_hash_rec_enable_modify(struct ftrace_ops *ops)
{
- ftrace_hash_rec_update_modify(ops, filter_hash, 1);
+ ftrace_hash_rec_update_modify(ops, true);
}
/*
@@ -3043,7 +3074,7 @@ int ftrace_startup(struct ftrace_ops *ops, int command)
return ret;
}
- if (ftrace_hash_rec_enable(ops, 1))
+ if (ftrace_hash_rec_enable(ops))
command |= FTRACE_UPDATE_CALLS;
ftrace_startup_enable(command);
@@ -3085,7 +3116,7 @@ int ftrace_shutdown(struct ftrace_ops *ops, int command)
/* Disabling ipmodify never fails */
ftrace_hash_ipmodify_disable(ops);
- if (ftrace_hash_rec_disable(ops, 1))
+ if (ftrace_hash_rec_disable(ops))
command |= FTRACE_UPDATE_CALLS;
ops->flags &= ~FTRACE_OPS_FL_ENABLED;
@@ -3164,6 +3195,474 @@ out:
return 0;
}
+/* Simply make a copy of @src and return it */
+static struct ftrace_hash *copy_hash(struct ftrace_hash *src)
+{
+ if (ftrace_hash_empty(src))
+ return EMPTY_HASH;
+
+ return alloc_and_copy_ftrace_hash(src->size_bits, src);
+}
+
+/*
+ * Append @new_hash entries to @hash:
+ *
+ * If @hash is the EMPTY_HASH then it traces all functions and nothing
+ * needs to be done.
+ *
+ * If @new_hash is the EMPTY_HASH, then make *hash the EMPTY_HASH so
+ * that it traces everything.
+ *
+ * Otherwise, go through all of @new_hash and add anything that @hash
+ * doesn't already have, to @hash.
+ *
+ * The filter_hash update uses just the append_hash() function
+ * and the notrace_hash update does not.
+ */
+static int append_hash(struct ftrace_hash **hash, struct ftrace_hash *new_hash)
+{
+ struct ftrace_func_entry *entry;
+ int size;
+ int i;
+
+ /* An empty hash does everything */
+ if (ftrace_hash_empty(*hash))
+ return 0;
+
+ /* If new_hash has everything make hash have everything */
+ if (ftrace_hash_empty(new_hash)) {
+ free_ftrace_hash(*hash);
+ *hash = EMPTY_HASH;
+ return 0;
+ }
+
+ size = 1 << new_hash->size_bits;
+ for (i = 0; i < size; i++) {
+ hlist_for_each_entry(entry, &new_hash->buckets[i], hlist) {
+ /* Only add if not already in hash */
+ if (!__ftrace_lookup_ip(*hash, entry->ip) &&
+ add_hash_entry(*hash, entry->ip) == NULL)
+ return -ENOMEM;
+ }
+ }
+ return 0;
+}
+
+/*
+ * Add to @hash only those that are in both @new_hash1 and @new_hash2
+ *
+ * The notrace_hash update uses just the intersect_hash() function
+ * and the filter_hash update does not.
+ */
+static int intersect_hash(struct ftrace_hash **hash, struct ftrace_hash *new_hash1,
+ struct ftrace_hash *new_hash2)
+{
+ struct ftrace_func_entry *entry;
+ int size;
+ int i;
+
+ /*
+ * If new_hash1 or new_hash2 is the EMPTY_HASH then make the hash
+ * empty as well, as an empty notrace hash means nothing is notraced.
+ */
+ if (ftrace_hash_empty(new_hash1) || ftrace_hash_empty(new_hash2)) {
+ free_ftrace_hash(*hash);
+ *hash = EMPTY_HASH;
+ return 0;
+ }
+
+ size = 1 << new_hash1->size_bits;
+ for (i = 0; i < size; i++) {
+ hlist_for_each_entry(entry, &new_hash1->buckets[i], hlist) {
+ /* Only add if in both @new_hash1 and @new_hash2 */
+ if (__ftrace_lookup_ip(new_hash2, entry->ip) &&
+ add_hash_entry(*hash, entry->ip) == NULL)
+ return -ENOMEM;
+ }
+ }
+ /* If nothing intersects, make it the empty set */
+ if (ftrace_hash_empty(*hash)) {
+ free_ftrace_hash(*hash);
+ *hash = EMPTY_HASH;
+ }
+ return 0;
+}
+
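The two rules above can be modelled outside the kernel with plain arrays; a minimal stand-alone sketch follows (demo_* names invented, fixed-size sets, no error handling), showing how an empty set short-circuits the union used for filter hashes and the intersection used for notrace hashes:

#include <stdbool.h>
#include <stdio.h>

#define DEMO_MAX 16

struct demo_set {
	int n;                       /* n == 0 plays the role of EMPTY_HASH */
	unsigned long ip[DEMO_MAX];
};

static bool demo_has(const struct demo_set *s, unsigned long ip)
{
	for (int i = 0; i < s->n; i++)
		if (s->ip[i] == ip)
			return true;
	return false;
}

/* filter rule: if either side is empty ("trace everything"), the result stays empty */
static void demo_union(struct demo_set *dst, const struct demo_set *a,
		       const struct demo_set *b)
{
	dst->n = 0;
	if (!a->n || !b->n)
		return;
	for (int i = 0; i < a->n; i++)
		dst->ip[dst->n++] = a->ip[i];
	for (int i = 0; i < b->n; i++)
		if (!demo_has(a, b->ip[i]))
			dst->ip[dst->n++] = b->ip[i];
}

/* notrace rule: if either side is empty ("notrace nothing"), nothing is notraced */
static void demo_intersect(struct demo_set *dst, const struct demo_set *a,
			   const struct demo_set *b)
{
	dst->n = 0;
	if (!a->n || !b->n)
		return;
	for (int i = 0; i < a->n; i++)
		if (demo_has(b, a->ip[i]))
			dst->ip[dst->n++] = a->ip[i];
}

int main(void)
{
	struct demo_set f1 = { 2, { 0x100, 0x200 } };
	struct demo_set f2 = { 2, { 0x200, 0x300 } };
	struct demo_set n1 = { 2, { 0x400, 0x500 } };
	struct demo_set n2 = { 1, { 0x500 } };
	struct demo_set filter, notrace;

	demo_union(&filter, &f1, &f2);       /* -> { 0x100, 0x200, 0x300 } */
	demo_intersect(&notrace, &n1, &n2);  /* -> { 0x500 } */
	printf("filter: %d entries, notrace: %d entries\n", filter.n, notrace.n);
	return 0;
}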
+/* Return a new hash that has a union of all @ops->filter_hash entries */
+static struct ftrace_hash *append_hashes(struct ftrace_ops *ops)
+{
+ struct ftrace_hash *new_hash;
+ struct ftrace_ops *subops;
+ int ret;
+
+ new_hash = alloc_ftrace_hash(ops->func_hash->filter_hash->size_bits);
+ if (!new_hash)
+ return NULL;
+
+ list_for_each_entry(subops, &ops->subop_list, list) {
+ ret = append_hash(&new_hash, subops->func_hash->filter_hash);
+ if (ret < 0) {
+ free_ftrace_hash(new_hash);
+ return NULL;
+ }
+ /* Nothing more to do if new_hash is empty */
+ if (ftrace_hash_empty(new_hash))
+ break;
+ }
+ return new_hash;
+}
+
+/* Make @ops trace everything except what all its subops do not trace */
+static struct ftrace_hash *intersect_hashes(struct ftrace_ops *ops)
+{
+ struct ftrace_hash *new_hash = NULL;
+ struct ftrace_ops *subops;
+ int size_bits;
+ int ret;
+
+ list_for_each_entry(subops, &ops->subop_list, list) {
+ struct ftrace_hash *next_hash;
+
+ if (!new_hash) {
+ size_bits = subops->func_hash->notrace_hash->size_bits;
+ new_hash = alloc_and_copy_ftrace_hash(size_bits, ops->func_hash->notrace_hash);
+ if (!new_hash)
+ return NULL;
+ continue;
+ }
+ size_bits = new_hash->size_bits;
+ next_hash = new_hash;
+ new_hash = alloc_ftrace_hash(size_bits);
+ ret = intersect_hash(&new_hash, next_hash, subops->func_hash->notrace_hash);
+ free_ftrace_hash(next_hash);
+ if (ret < 0) {
+ free_ftrace_hash(new_hash);
+ return NULL;
+ }
+ /* Nothing more to do if new_hash is empty */
+ if (ftrace_hash_empty(new_hash))
+ break;
+ }
+ return new_hash;
+}
+
+static bool ops_equal(struct ftrace_hash *A, struct ftrace_hash *B)
+{
+ struct ftrace_func_entry *entry;
+ int size;
+ int i;
+
+ if (ftrace_hash_empty(A))
+ return ftrace_hash_empty(B);
+
+ if (ftrace_hash_empty(B))
+ return ftrace_hash_empty(A);
+
+ if (A->count != B->count)
+ return false;
+
+ size = 1 << A->size_bits;
+ for (i = 0; i < size; i++) {
+ hlist_for_each_entry(entry, &A->buckets[i], hlist) {
+ if (!__ftrace_lookup_ip(B, entry->ip))
+ return false;
+ }
+ }
+
+ return true;
+}
+
+static void ftrace_ops_update_code(struct ftrace_ops *ops,
+ struct ftrace_ops_hash *old_hash);
+
+static int __ftrace_hash_move_and_update_ops(struct ftrace_ops *ops,
+ struct ftrace_hash **orig_hash,
+ struct ftrace_hash *hash,
+ int enable)
+{
+ struct ftrace_ops_hash old_hash_ops;
+ struct ftrace_hash *old_hash;
+ int ret;
+
+ old_hash = *orig_hash;
+ old_hash_ops.filter_hash = ops->func_hash->filter_hash;
+ old_hash_ops.notrace_hash = ops->func_hash->notrace_hash;
+ ret = ftrace_hash_move(ops, enable, orig_hash, hash);
+ if (!ret) {
+ ftrace_ops_update_code(ops, &old_hash_ops);
+ free_ftrace_hash_rcu(old_hash);
+ }
+ return ret;
+}
+
+static int ftrace_update_ops(struct ftrace_ops *ops, struct ftrace_hash *filter_hash,
+ struct ftrace_hash *notrace_hash)
+{
+ int ret;
+
+ if (!ops_equal(filter_hash, ops->func_hash->filter_hash)) {
+ ret = __ftrace_hash_move_and_update_ops(ops, &ops->func_hash->filter_hash,
+ filter_hash, 1);
+ if (ret < 0)
+ return ret;
+ }
+
+ if (!ops_equal(notrace_hash, ops->func_hash->notrace_hash)) {
+ ret = __ftrace_hash_move_and_update_ops(ops, &ops->func_hash->notrace_hash,
+ notrace_hash, 0);
+ if (ret < 0)
+ return ret;
+ }
+
+ return 0;
+}
+
+/**
+ * ftrace_startup_subops - enable tracing for subops of an ops
+ * @ops: Manager ops (used to pick all the functions of its subops)
+ * @subops: A new ops to add to @ops
+ * @command: Extra commands to use to enable tracing
+ *
+ * The @ops is a manager @ops that has a filter that includes all the functions
+ * that its subops are tracing. Adding a new @subops will add the
+ * functions of @subops to @ops.
+ */
+int ftrace_startup_subops(struct ftrace_ops *ops, struct ftrace_ops *subops, int command)
+{
+ struct ftrace_hash *filter_hash;
+ struct ftrace_hash *notrace_hash;
+ struct ftrace_hash *save_filter_hash;
+ struct ftrace_hash *save_notrace_hash;
+ int size_bits;
+ int ret;
+
+ if (unlikely(ftrace_disabled))
+ return -ENODEV;
+
+ ftrace_ops_init(ops);
+ ftrace_ops_init(subops);
+
+ if (WARN_ON_ONCE(subops->flags & FTRACE_OPS_FL_ENABLED))
+ return -EBUSY;
+
+ /* Make everything canonical (Just in case!) */
+ if (!ops->func_hash->filter_hash)
+ ops->func_hash->filter_hash = EMPTY_HASH;
+ if (!ops->func_hash->notrace_hash)
+ ops->func_hash->notrace_hash = EMPTY_HASH;
+ if (!subops->func_hash->filter_hash)
+ subops->func_hash->filter_hash = EMPTY_HASH;
+ if (!subops->func_hash->notrace_hash)
+ subops->func_hash->notrace_hash = EMPTY_HASH;
+
+ /* For the first subops to ops just enable it normally */
+ if (list_empty(&ops->subop_list)) {
+ /* Just use the subops hashes */
+ filter_hash = copy_hash(subops->func_hash->filter_hash);
+ notrace_hash = copy_hash(subops->func_hash->notrace_hash);
+ if (!filter_hash || !notrace_hash) {
+ free_ftrace_hash(filter_hash);
+ free_ftrace_hash(notrace_hash);
+ return -ENOMEM;
+ }
+
+ save_filter_hash = ops->func_hash->filter_hash;
+ save_notrace_hash = ops->func_hash->notrace_hash;
+
+ ops->func_hash->filter_hash = filter_hash;
+ ops->func_hash->notrace_hash = notrace_hash;
+ list_add(&subops->list, &ops->subop_list);
+ ret = ftrace_startup(ops, command);
+ if (ret < 0) {
+ list_del(&subops->list);
+ ops->func_hash->filter_hash = save_filter_hash;
+ ops->func_hash->notrace_hash = save_notrace_hash;
+ free_ftrace_hash(filter_hash);
+ free_ftrace_hash(notrace_hash);
+ } else {
+ free_ftrace_hash(save_filter_hash);
+ free_ftrace_hash(save_notrace_hash);
+ subops->flags |= FTRACE_OPS_FL_ENABLED | FTRACE_OPS_FL_SUBOP;
+ subops->managed = ops;
+ }
+ return ret;
+ }
+
+ /*
+ * At this point there's already something attached. Here are the rules:
+ * o If either filter_hash is empty then the final stays empty
+ * o Otherwise, the final is a superset of both hashes
+ * o If either notrace_hash is empty then the final stays empty
+ * o Otherwise, the final is an intersection between the hashes
+ */
+ if (ftrace_hash_empty(ops->func_hash->filter_hash) ||
+ ftrace_hash_empty(subops->func_hash->filter_hash)) {
+ filter_hash = EMPTY_HASH;
+ } else {
+ size_bits = max(ops->func_hash->filter_hash->size_bits,
+ subops->func_hash->filter_hash->size_bits);
+ filter_hash = alloc_and_copy_ftrace_hash(size_bits, ops->func_hash->filter_hash);
+ if (!filter_hash)
+ return -ENOMEM;
+ ret = append_hash(&filter_hash, subops->func_hash->filter_hash);
+ if (ret < 0) {
+ free_ftrace_hash(filter_hash);
+ return ret;
+ }
+ }
+
+ if (ftrace_hash_empty(ops->func_hash->notrace_hash) ||
+ ftrace_hash_empty(subops->func_hash->notrace_hash)) {
+ notrace_hash = EMPTY_HASH;
+ } else {
+ size_bits = max(ops->func_hash->notrace_hash->size_bits,
+ subops->func_hash->notrace_hash->size_bits);
+ notrace_hash = alloc_ftrace_hash(size_bits);
+ if (!notrace_hash) {
+ free_ftrace_hash(filter_hash);
+ return -ENOMEM;
+ }
+
+ ret = intersect_hash(&notrace_hash, ops->func_hash->notrace_hash,
+ subops->func_hash->notrace_hash);
+ if (ret < 0) {
+ free_ftrace_hash(filter_hash);
+ free_ftrace_hash(notrace_hash);
+ return ret;
+ }
+ }
+
+ list_add(&subops->list, &ops->subop_list);
+
+ ret = ftrace_update_ops(ops, filter_hash, notrace_hash);
+ free_ftrace_hash(filter_hash);
+ free_ftrace_hash(notrace_hash);
+ if (ret < 0) {
+ list_del(&subops->list);
+ } else {
+ subops->flags |= FTRACE_OPS_FL_ENABLED | FTRACE_OPS_FL_SUBOP;
+ subops->managed = ops;
+ }
+ return ret;
+}
+
+/**
+ * ftrace_shutdown_subops - Remove a subops from a manager ops
+ * @ops: A manager ops to remove @subops from
+ * @subops: The subops to remove from @ops
+ * @command: Any extra command flags to add to modifying the text
+ *
+ * Removes the functions being traced by the @subops from @ops. Note, it
+ * will not affect functions that are being traced by other subops that
+ * still exist in @ops.
+ *
+ * If the last subops is removed from @ops, then @ops is shutdown normally.
+ */
+int ftrace_shutdown_subops(struct ftrace_ops *ops, struct ftrace_ops *subops, int command)
+{
+ struct ftrace_hash *filter_hash;
+ struct ftrace_hash *notrace_hash;
+ int ret;
+
+ if (unlikely(ftrace_disabled))
+ return -ENODEV;
+
+ if (WARN_ON_ONCE(!(subops->flags & FTRACE_OPS_FL_ENABLED)))
+ return -EINVAL;
+
+ list_del(&subops->list);
+
+ if (list_empty(&ops->subop_list)) {
+ /* Last one, just disable the current ops */
+
+ ret = ftrace_shutdown(ops, command);
+ if (ret < 0) {
+ list_add(&subops->list, &ops->subop_list);
+ return ret;
+ }
+
+ subops->flags &= ~FTRACE_OPS_FL_ENABLED;
+
+ free_ftrace_hash(ops->func_hash->filter_hash);
+ free_ftrace_hash(ops->func_hash->notrace_hash);
+ ops->func_hash->filter_hash = EMPTY_HASH;
+ ops->func_hash->notrace_hash = EMPTY_HASH;
+ subops->flags &= ~(FTRACE_OPS_FL_ENABLED | FTRACE_OPS_FL_SUBOP);
+ subops->managed = NULL;
+
+ return 0;
+ }
+
+ /* Rebuild the hashes without subops */
+ filter_hash = append_hashes(ops);
+ notrace_hash = intersect_hashes(ops);
+ if (!filter_hash || !notrace_hash) {
+ free_ftrace_hash(filter_hash);
+ free_ftrace_hash(notrace_hash);
+ list_add(&subops->list, &ops->subop_list);
+ return -ENOMEM;
+ }
+
+ ret = ftrace_update_ops(ops, filter_hash, notrace_hash);
+ if (ret < 0) {
+ list_add(&subops->list, &ops->subop_list);
+ } else {
+ subops->flags &= ~(FTRACE_OPS_FL_ENABLED | FTRACE_OPS_FL_SUBOP);
+ subops->managed = NULL;
+ }
+ free_ftrace_hash(filter_hash);
+ free_ftrace_hash(notrace_hash);
+ return ret;
+}
+
+static int ftrace_hash_move_and_update_subops(struct ftrace_ops *subops,
+ struct ftrace_hash **orig_subhash,
+ struct ftrace_hash *hash,
+ int enable)
+{
+ struct ftrace_ops *ops = subops->managed;
+ struct ftrace_hash **orig_hash;
+ struct ftrace_hash *save_hash;
+ struct ftrace_hash *new_hash;
+ int ret;
+
+ /* Manager ops can not be subops (yet) */
+ if (WARN_ON_ONCE(!ops || ops->flags & FTRACE_OPS_FL_SUBOP))
+ return -EINVAL;
+
+ /* Move the new hash over to the subops hash */
+ save_hash = *orig_subhash;
+ *orig_subhash = __ftrace_hash_move(hash);
+ if (!*orig_subhash) {
+ *orig_subhash = save_hash;
+ return -ENOMEM;
+ }
+
+ /* Create a new_hash to hold the ops new functions */
+ if (enable) {
+ orig_hash = &ops->func_hash->filter_hash;
+ new_hash = append_hashes(ops);
+ } else {
+ orig_hash = &ops->func_hash->notrace_hash;
+ new_hash = intersect_hashes(ops);
+ }
+
+ /* Move the hash over to the new hash */
+ ret = __ftrace_hash_move_and_update_ops(ops, orig_hash, new_hash, enable);
+
+ free_ftrace_hash(new_hash);
+
+ if (ret) {
+ /* Put back the original hash */
+ free_ftrace_hash_rcu(*orig_subhash);
+ *orig_subhash = save_hash;
+ } else {
+ free_ftrace_hash_rcu(save_hash);
+ }
+ return ret;
+}
+
+
static u64 ftrace_update_time;
unsigned long ftrace_update_tot_cnt;
unsigned long ftrace_number_of_pages;
@@ -4380,19 +4879,33 @@ static int ftrace_hash_move_and_update_ops(struct ftrace_ops *ops,
struct ftrace_hash *hash,
int enable)
{
- struct ftrace_ops_hash old_hash_ops;
- struct ftrace_hash *old_hash;
- int ret;
+ if (ops->flags & FTRACE_OPS_FL_SUBOP)
+ return ftrace_hash_move_and_update_subops(ops, orig_hash, hash, enable);
- old_hash = *orig_hash;
- old_hash_ops.filter_hash = ops->func_hash->filter_hash;
- old_hash_ops.notrace_hash = ops->func_hash->notrace_hash;
- ret = ftrace_hash_move(ops, enable, orig_hash, hash);
- if (!ret) {
- ftrace_ops_update_code(ops, &old_hash_ops);
- free_ftrace_hash_rcu(old_hash);
+ /*
+ * If this ops is not enabled, it could be sharing its filters
+ * with a subop. If that's the case, update the subop instead of
+ * this ops. Shared filters are only allowed to have one ops set
+ * at a time, and if we update the ops that is not enabled,
+ * it will not affect subops that share it.
+ */
+ if (!(ops->flags & FTRACE_OPS_FL_ENABLED)) {
+ struct ftrace_ops *op;
+
+ /* Check if any other manager subops maps to this hash */
+ do_for_each_ftrace_op(op, ftrace_ops_list) {
+ struct ftrace_ops *subops;
+
+ list_for_each_entry(subops, &op->subop_list, list) {
+ if ((subops->flags & FTRACE_OPS_FL_ENABLED) &&
+ subops->func_hash == ops->func_hash) {
+ return ftrace_hash_move_and_update_subops(subops, orig_hash, hash, enable);
+ }
+ }
+ } while_for_each_ftrace_op(op);
}
- return ret;
+
+ return __ftrace_hash_move_and_update_ops(ops, orig_hash, hash, enable);
}
static bool module_exists(const char *module)
@@ -5475,6 +5988,8 @@ EXPORT_SYMBOL_GPL(register_ftrace_direct);
* unregister_ftrace_direct - Remove calls to custom trampoline
* previously registered by register_ftrace_direct for @ops object.
* @ops: The address of the struct ftrace_ops object
+ * @addr: The address of the direct function that is called by the @ops functions
+ * @free_filters: Set to true to remove all filters for the ftrace_ops, false otherwise
*
* This is used to remove direct calls to @addr from the nop locations
* of the functions registered in @ops (set by ftrace_set_filter_ip
@@ -7324,6 +7839,7 @@ __init void ftrace_init_global_array_ops(struct trace_array *tr)
tr->ops = &global_ops;
tr->ops->private = tr;
ftrace_init_trace_array(tr);
+ init_array_fgraph_ops(tr, tr->ops);
}
void ftrace_init_array_ops(struct trace_array *tr, ftrace_func_t func)
@@ -7404,6 +7920,7 @@ out:
void arch_ftrace_ops_list_func(unsigned long ip, unsigned long parent_ip,
struct ftrace_ops *op, struct ftrace_regs *fregs)
{
+ kmsan_unpoison_memory(fregs, sizeof(*fregs));
__ftrace_ops_list_func(ip, parent_ip, NULL, fregs);
}
#else
@@ -8218,7 +8735,7 @@ static bool is_permanent_ops_registered(void)
}
static int
-ftrace_enable_sysctl(struct ctl_table *table, int write,
+ftrace_enable_sysctl(const struct ctl_table *table, int write,
void *buffer, size_t *lenp, loff_t *ppos)
{
int ret = -ENODEV;
diff --git a/kernel/trace/ftrace_internal.h b/kernel/trace/ftrace_internal.h
index 5012c04f92c0..3235470e61b3 100644
--- a/kernel/trace/ftrace_internal.h
+++ b/kernel/trace/ftrace_internal.h
@@ -15,6 +15,8 @@ extern struct ftrace_ops global_ops;
int ftrace_startup(struct ftrace_ops *ops, int command);
int ftrace_shutdown(struct ftrace_ops *ops, int command);
int ftrace_ops_test(struct ftrace_ops *ops, unsigned long ip, void *regs);
+int ftrace_startup_subops(struct ftrace_ops *ops, struct ftrace_ops *subops, int command);
+int ftrace_shutdown_subops(struct ftrace_ops *ops, struct ftrace_ops *subops, int command);
#else /* !CONFIG_DYNAMIC_FTRACE */
@@ -38,14 +40,26 @@ ftrace_ops_test(struct ftrace_ops *ops, unsigned long ip, void *regs)
{
return 1;
}
+static inline int ftrace_startup_subops(struct ftrace_ops *ops, struct ftrace_ops *subops, int command)
+{
+ return -EINVAL;
+}
+static inline int ftrace_shutdown_subops(struct ftrace_ops *ops, struct ftrace_ops *subops, int command)
+{
+ return -EINVAL;
+}
#endif /* CONFIG_DYNAMIC_FTRACE */
#ifdef CONFIG_FUNCTION_GRAPH_TRACER
extern int ftrace_graph_active;
-void update_function_graph_func(void);
+# ifdef CONFIG_DYNAMIC_FTRACE
+extern void fgraph_update_pid_func(void);
+# else
+static inline void fgraph_update_pid_func(void) {}
+# endif
#else /* !CONFIG_FUNCTION_GRAPH_TRACER */
# define ftrace_graph_active 0
-static inline void update_function_graph_func(void) { }
+static inline void fgraph_update_pid_func(void) {}
#endif /* CONFIG_FUNCTION_GRAPH_TRACER */
#else /* !CONFIG_FUNCTION_TRACER */
diff --git a/kernel/trace/pid_list.c b/kernel/trace/pid_list.c
index 95106d02b32d..4966e6bbdf6f 100644
--- a/kernel/trace/pid_list.c
+++ b/kernel/trace/pid_list.c
@@ -354,7 +354,7 @@ static void pid_list_refill_irq(struct irq_work *iwork)
while (upper_count-- > 0) {
union upper_chunk *chunk;
- chunk = kzalloc(sizeof(*chunk), GFP_KERNEL);
+ chunk = kzalloc(sizeof(*chunk), GFP_NOWAIT);
if (!chunk)
break;
*upper_next = chunk;
@@ -365,7 +365,7 @@ static void pid_list_refill_irq(struct irq_work *iwork)
while (lower_count-- > 0) {
union lower_chunk *chunk;
- chunk = kzalloc(sizeof(*chunk), GFP_KERNEL);
+ chunk = kzalloc(sizeof(*chunk), GFP_NOWAIT);
if (!chunk)
break;
*lower_next = chunk;
@@ -451,6 +451,7 @@ struct trace_pid_list *trace_pid_list_alloc(void)
/**
* trace_pid_list_free - Frees an allocated pid_list.
+ * @pid_list: The pid list to free.
*
* Frees the memory for a pid_list that was allocated.
*/
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 578a49ff5c32..10cd38bce2f1 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -2767,7 +2767,7 @@ static void output_printk(struct trace_event_buffer *fbuffer)
raw_spin_unlock_irqrestore(&tracepoint_iter_lock, flags);
}
-int tracepoint_printk_sysctl(struct ctl_table *table, int write,
+int tracepoint_printk_sysctl(const struct ctl_table *table, int write,
void *buffer, size_t *lenp,
loff_t *ppos)
{
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index 749a182dab48..8783bebd0562 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -397,6 +397,9 @@ struct trace_array {
struct ftrace_ops *ops;
struct trace_pid_list __rcu *function_pids;
struct trace_pid_list __rcu *function_no_pids;
+#ifdef CONFIG_FUNCTION_GRAPH_TRACER
+ struct fgraph_ops *gops;
+#endif
#ifdef CONFIG_DYNAMIC_FTRACE
/* All of these are protected by the ftrace_lock */
struct list_head func_probes;
@@ -679,9 +682,8 @@ void trace_latency_header(struct seq_file *m);
void trace_default_header(struct seq_file *m);
void print_trace_header(struct seq_file *m, struct trace_iterator *iter);
-void trace_graph_return(struct ftrace_graph_ret *trace);
-int trace_graph_entry(struct ftrace_graph_ent *trace);
-void set_graph_array(struct trace_array *tr);
+void trace_graph_return(struct ftrace_graph_ret *trace, struct fgraph_ops *gops);
+int trace_graph_entry(struct ftrace_graph_ent *trace, struct fgraph_ops *gops);
void tracing_start_cmdline_record(void);
void tracing_stop_cmdline_record(void);
@@ -892,12 +894,59 @@ extern int __trace_graph_entry(struct trace_array *tr,
extern void __trace_graph_return(struct trace_array *tr,
struct ftrace_graph_ret *trace,
unsigned int trace_ctx);
+extern void init_array_fgraph_ops(struct trace_array *tr, struct ftrace_ops *ops);
+extern int allocate_fgraph_ops(struct trace_array *tr, struct ftrace_ops *ops);
+extern void free_fgraph_ops(struct trace_array *tr);
+
+enum {
+ TRACE_GRAPH_FL = 1,
+
+ /*
+ * In the very unlikely case that an interrupt came in
+ * at a start of graph tracing, and we want to trace
+ * the function in that interrupt, the depth can be greater
+ * than zero, because of the preempted start of a previous
+ * trace. In an even more unlikely case, depth could be 2
+ * if a softirq interrupted the start of graph tracing,
+ * followed by an interrupt preempting a start of graph
+ * tracing in the softirq, and depth can even be 3
+ * if an NMI came in at the start of an interrupt function
+ * that preempted a softirq start of a function that
+ * preempted normal context!!!! Luckily, it can't be
+ * greater than 3, so the next two bits are a mask
+ * of what the depth is when we set TRACE_GRAPH_FL
+ */
+
+ TRACE_GRAPH_DEPTH_START_BIT,
+ TRACE_GRAPH_DEPTH_END_BIT,
+
+ /*
+ * To implement set_graph_notrace, if this bit is set, we ignore
+ * function graph tracing of called functions, until the return
+ * function is called to clear it.
+ */
+ TRACE_GRAPH_NOTRACE_BIT,
+};
+
+#define TRACE_GRAPH_NOTRACE (1 << TRACE_GRAPH_NOTRACE_BIT)
+
+static inline unsigned long ftrace_graph_depth(unsigned long *task_var)
+{
+ return (*task_var >> TRACE_GRAPH_DEPTH_START_BIT) & 3;
+}
+
+static inline void ftrace_graph_set_depth(unsigned long *task_var, int depth)
+{
+ *task_var &= ~(3 << TRACE_GRAPH_DEPTH_START_BIT);
+ *task_var |= (depth & 3) << TRACE_GRAPH_DEPTH_START_BIT;
+}
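A rough stand-alone model of the packing done by the two helpers above (demo_* names invented; bit positions mirror the enum: bit 0 for the FL flag, bits 2-3 for the start depth, bit 4 for NOTRACE):

#include <assert.h>
#include <stdio.h>

#define DEMO_FL			(1UL << 0)   /* mirrors TRACE_GRAPH_FL */
#define DEMO_DEPTH_SHIFT	2            /* mirrors TRACE_GRAPH_DEPTH_START_BIT */
#define DEMO_NOTRACE		(1UL << 4)   /* mirrors TRACE_GRAPH_NOTRACE */

static unsigned long demo_depth(unsigned long var)
{
	return (var >> DEMO_DEPTH_SHIFT) & 3;
}

static void demo_set_depth(unsigned long *var, int depth)
{
	*var &= ~(3UL << DEMO_DEPTH_SHIFT);
	*var |= (unsigned long)(depth & 3) << DEMO_DEPTH_SHIFT;
}

int main(void)
{
	unsigned long task_var = 0;

	/* Entry side: mark the task and remember the depth it was marked at. */
	task_var |= DEMO_FL;
	demo_set_depth(&task_var, 2);
	assert(demo_depth(&task_var) == 2);

	/* Return side: clear the mark only when back at the recorded depth. */
	if ((task_var & DEMO_FL) && demo_depth(&task_var) == 2)
		task_var &= ~DEMO_FL;

	/* The NOTRACE bit lives above the depth field and is set/cleared on its own. */
	task_var |= DEMO_NOTRACE;
	task_var &= ~DEMO_NOTRACE;

	printf("task_var = %#lx\n", task_var);
	return 0;
}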
#ifdef CONFIG_DYNAMIC_FTRACE
extern struct ftrace_hash __rcu *ftrace_graph_hash;
extern struct ftrace_hash __rcu *ftrace_graph_notrace_hash;
-static inline int ftrace_graph_addr(struct ftrace_graph_ent *trace)
+static inline int
+ftrace_graph_addr(unsigned long *task_var, struct ftrace_graph_ent *trace)
{
unsigned long addr = trace->func;
int ret = 0;
@@ -919,13 +968,12 @@ static inline int ftrace_graph_addr(struct ftrace_graph_ent *trace)
}
if (ftrace_lookup_ip(hash, addr)) {
-
/*
* This needs to be cleared on the return functions
* when the depth is zero.
*/
- trace_recursion_set(TRACE_GRAPH_BIT);
- trace_recursion_set_depth(trace->depth);
+ *task_var |= TRACE_GRAPH_FL;
+ ftrace_graph_set_depth(task_var, trace->depth);
/*
* If no irqs are to be traced, but a set_graph_function
@@ -944,11 +992,14 @@ out:
return ret;
}
-static inline void ftrace_graph_addr_finish(struct ftrace_graph_ret *trace)
+static inline void
+ftrace_graph_addr_finish(struct fgraph_ops *gops, struct ftrace_graph_ret *trace)
{
- if (trace_recursion_test(TRACE_GRAPH_BIT) &&
- trace->depth == trace_recursion_depth())
- trace_recursion_clear(TRACE_GRAPH_BIT);
+ unsigned long *task_var = fgraph_get_task_var(gops);
+
+ if ((*task_var & TRACE_GRAPH_FL) &&
+ trace->depth == ftrace_graph_depth(task_var))
+ *task_var &= ~TRACE_GRAPH_FL;
}
static inline int ftrace_graph_notrace_addr(unsigned long addr)
@@ -974,7 +1025,7 @@ static inline int ftrace_graph_notrace_addr(unsigned long addr)
return ret;
}
#else
-static inline int ftrace_graph_addr(struct ftrace_graph_ent *trace)
+static inline int ftrace_graph_addr(unsigned long *task_var, struct ftrace_graph_ent *trace)
{
return 1;
}
@@ -983,27 +1034,37 @@ static inline int ftrace_graph_notrace_addr(unsigned long addr)
{
return 0;
}
-static inline void ftrace_graph_addr_finish(struct ftrace_graph_ret *trace)
+static inline void ftrace_graph_addr_finish(struct fgraph_ops *gops, struct ftrace_graph_ret *trace)
{ }
#endif /* CONFIG_DYNAMIC_FTRACE */
extern unsigned int fgraph_max_depth;
-static inline bool ftrace_graph_ignore_func(struct ftrace_graph_ent *trace)
+static inline bool
+ftrace_graph_ignore_func(struct fgraph_ops *gops, struct ftrace_graph_ent *trace)
{
+ unsigned long *task_var = fgraph_get_task_var(gops);
+
/* trace it when it is-nested-in or is a function enabled. */
- return !(trace_recursion_test(TRACE_GRAPH_BIT) ||
- ftrace_graph_addr(trace)) ||
+ return !((*task_var & TRACE_GRAPH_FL) ||
+ ftrace_graph_addr(task_var, trace)) ||
(trace->depth < 0) ||
(fgraph_max_depth && trace->depth >= fgraph_max_depth);
}
+void fgraph_init_ops(struct ftrace_ops *dst_ops,
+ struct ftrace_ops *src_ops);
+
#else /* CONFIG_FUNCTION_GRAPH_TRACER */
static inline enum print_line_t
print_graph_function_flags(struct trace_iterator *iter, u32 flags)
{
return TRACE_TYPE_UNHANDLED;
}
+static inline void free_fgraph_ops(struct trace_array *tr) { }
+/* ftrace_ops may not be defined */
+#define init_array_fgraph_ops(tr, ops) do { } while (0)
+#define allocate_fgraph_ops(tr, ops) ({ 0; })
#endif /* CONFIG_FUNCTION_GRAPH_TRACER */
extern struct list_head ftrace_pids;
diff --git a/kernel/trace/trace_events_user.c b/kernel/trace/trace_events_user.c
index 3a2b46847c8b..42b0d998d103 100644
--- a/kernel/trace/trace_events_user.c
+++ b/kernel/trace/trace_events_user.c
@@ -2885,7 +2885,7 @@ err:
return -ENODEV;
}
-static int set_max_user_events_sysctl(struct ctl_table *table, int write,
+static int set_max_user_events_sysctl(const struct ctl_table *table, int write,
void *buffer, size_t *lenp, loff_t *ppos)
{
int ret;
diff --git a/kernel/trace/trace_functions.c b/kernel/trace/trace_functions.c
index 9f1bfbe105e8..3b0cea37e029 100644
--- a/kernel/trace/trace_functions.c
+++ b/kernel/trace/trace_functions.c
@@ -80,6 +80,7 @@ void ftrace_free_ftrace_ops(struct trace_array *tr)
int ftrace_create_function_files(struct trace_array *tr,
struct dentry *parent)
{
+ int ret;
/*
* The top level array uses the "global_ops", and the files are
* created on boot up.
@@ -90,6 +91,12 @@ int ftrace_create_function_files(struct trace_array *tr,
if (!tr->ops)
return -EINVAL;
+ ret = allocate_fgraph_ops(tr, tr->ops);
+ if (ret) {
+ kfree(tr->ops);
+ return ret;
+ }
+
ftrace_create_filter_files(tr->ops, parent);
return 0;
@@ -99,6 +106,7 @@ void ftrace_destroy_function_files(struct trace_array *tr)
{
ftrace_destroy_filter_files(tr->ops);
ftrace_free_ftrace_ops(tr);
+ free_fgraph_ops(tr);
}
static ftrace_func_t select_trace_function(u32 flags_val)
@@ -223,6 +231,7 @@ function_stack_trace_call(unsigned long ip, unsigned long parent_ip,
long disabled;
int cpu;
unsigned int trace_ctx;
+ int skip = STACK_SKIP;
if (unlikely(!tr->function_enabled))
return;
@@ -239,7 +248,11 @@ function_stack_trace_call(unsigned long ip, unsigned long parent_ip,
if (likely(disabled == 1)) {
trace_ctx = tracing_gen_ctx_flags(flags);
trace_function(tr, ip, parent_ip, trace_ctx);
- __trace_stack(tr, trace_ctx, STACK_SKIP);
+#ifdef CONFIG_UNWINDER_FRAME_POINTER
+ if (ftrace_pids_enabled(op))
+ skip++;
+#endif
+ __trace_stack(tr, trace_ctx, skip);
}
atomic_dec(&data->disabled);
diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c
index c35fbaab2a47..13d0387ac6a6 100644
--- a/kernel/trace/trace_functions_graph.c
+++ b/kernel/trace/trace_functions_graph.c
@@ -83,8 +83,6 @@ static struct tracer_flags tracer_flags = {
.opts = trace_opts
};
-static struct trace_array *graph_array;
-
/*
* DURATION column is being also used to display IRQ signs,
* following values are used by print_graph_irq and others
@@ -129,9 +127,11 @@ static inline int ftrace_graph_ignore_irqs(void)
return in_hardirq();
}
-int trace_graph_entry(struct ftrace_graph_ent *trace)
+int trace_graph_entry(struct ftrace_graph_ent *trace,
+ struct fgraph_ops *gops)
{
- struct trace_array *tr = graph_array;
+ unsigned long *task_var = fgraph_get_task_var(gops);
+ struct trace_array *tr = gops->private;
struct trace_array_cpu *data;
unsigned long flags;
unsigned int trace_ctx;
@@ -139,7 +139,7 @@ int trace_graph_entry(struct ftrace_graph_ent *trace)
int ret;
int cpu;
- if (trace_recursion_test(TRACE_GRAPH_NOTRACE_BIT))
+ if (*task_var & TRACE_GRAPH_NOTRACE)
return 0;
/*
@@ -150,7 +150,7 @@ int trace_graph_entry(struct ftrace_graph_ent *trace)
* returning from the function.
*/
if (ftrace_graph_notrace_addr(trace->func)) {
- trace_recursion_set(TRACE_GRAPH_NOTRACE_BIT);
+ *task_var |= TRACE_GRAPH_NOTRACE;
/*
* Need to return 1 to have the return called
* that will clear the NOTRACE bit.
@@ -161,7 +161,7 @@ int trace_graph_entry(struct ftrace_graph_ent *trace)
if (!ftrace_trace_task(tr))
return 0;
- if (ftrace_graph_ignore_func(trace))
+ if (ftrace_graph_ignore_func(gops, trace))
return 0;
if (ftrace_graph_ignore_irqs())
@@ -238,19 +238,21 @@ void __trace_graph_return(struct trace_array *tr,
trace_buffer_unlock_commit_nostack(buffer, event);
}
-void trace_graph_return(struct ftrace_graph_ret *trace)
+void trace_graph_return(struct ftrace_graph_ret *trace,
+ struct fgraph_ops *gops)
{
- struct trace_array *tr = graph_array;
+ unsigned long *task_var = fgraph_get_task_var(gops);
+ struct trace_array *tr = gops->private;
struct trace_array_cpu *data;
unsigned long flags;
unsigned int trace_ctx;
long disabled;
int cpu;
- ftrace_graph_addr_finish(trace);
+ ftrace_graph_addr_finish(gops, trace);
- if (trace_recursion_test(TRACE_GRAPH_NOTRACE_BIT)) {
- trace_recursion_clear(TRACE_GRAPH_NOTRACE_BIT);
+ if (*task_var & TRACE_GRAPH_NOTRACE) {
+ *task_var &= ~TRACE_GRAPH_NOTRACE;
return;
}
@@ -266,18 +268,10 @@ void trace_graph_return(struct ftrace_graph_ret *trace)
local_irq_restore(flags);
}
-void set_graph_array(struct trace_array *tr)
-{
- graph_array = tr;
-
- /* Make graph_array visible before we start tracing */
-
- smp_mb();
-}
-
-static void trace_graph_thresh_return(struct ftrace_graph_ret *trace)
+static void trace_graph_thresh_return(struct ftrace_graph_ret *trace,
+ struct fgraph_ops *gops)
{
- ftrace_graph_addr_finish(trace);
+ ftrace_graph_addr_finish(gops, trace);
if (trace_recursion_test(TRACE_GRAPH_NOTRACE_BIT)) {
trace_recursion_clear(TRACE_GRAPH_NOTRACE_BIT);
@@ -288,28 +282,60 @@ static void trace_graph_thresh_return(struct ftrace_graph_ret *trace)
(trace->rettime - trace->calltime < tracing_thresh))
return;
else
- trace_graph_return(trace);
+ trace_graph_return(trace, gops);
}
-static struct fgraph_ops funcgraph_thresh_ops = {
- .entryfunc = &trace_graph_entry,
- .retfunc = &trace_graph_thresh_return,
-};
-
static struct fgraph_ops funcgraph_ops = {
.entryfunc = &trace_graph_entry,
.retfunc = &trace_graph_return,
};
+int allocate_fgraph_ops(struct trace_array *tr, struct ftrace_ops *ops)
+{
+ struct fgraph_ops *gops;
+
+ gops = kzalloc(sizeof(*gops), GFP_KERNEL);
+ if (!gops)
+ return -ENOMEM;
+
+ gops->entryfunc = &trace_graph_entry;
+ gops->retfunc = &trace_graph_return;
+
+ tr->gops = gops;
+ gops->private = tr;
+
+ fgraph_init_ops(&gops->ops, ops);
+
+ return 0;
+}
+
+void free_fgraph_ops(struct trace_array *tr)
+{
+ kfree(tr->gops);
+}
+
+__init void init_array_fgraph_ops(struct trace_array *tr, struct ftrace_ops *ops)
+{
+ tr->gops = &funcgraph_ops;
+ funcgraph_ops.private = tr;
+ fgraph_init_ops(&tr->gops->ops, ops);
+}
+
static int graph_trace_init(struct trace_array *tr)
{
int ret;
- set_graph_array(tr);
+ tr->gops->entryfunc = trace_graph_entry;
+
if (tracing_thresh)
- ret = register_ftrace_graph(&funcgraph_thresh_ops);
+ tr->gops->retfunc = trace_graph_thresh_return;
else
- ret = register_ftrace_graph(&funcgraph_ops);
+ tr->gops->retfunc = trace_graph_return;
+
+ /* Make sure gops functions are visible before we start tracing */
+ smp_mb();
+
+ ret = register_ftrace_graph(tr->gops);
if (ret)
return ret;
tracing_start_cmdline_record();
@@ -320,10 +346,7 @@ static int graph_trace_init(struct trace_array *tr)
static void graph_trace_reset(struct trace_array *tr)
{
tracing_stop_cmdline_record();
- if (tracing_thresh)
- unregister_ftrace_graph(&funcgraph_thresh_ops);
- else
- unregister_ftrace_graph(&funcgraph_ops);
+ unregister_ftrace_graph(tr->gops);
}
static int graph_trace_update_thresh(struct trace_array *tr)
@@ -1362,6 +1385,7 @@ static struct tracer graph_trace __tracer_data = {
.print_header = print_graph_headers,
.flags = &tracer_flags,
.set_flag = func_graph_set_flag,
+ .allow_instances = true,
#ifdef CONFIG_FTRACE_SELFTEST
.selftest = trace_selftest_startup_function_graph,
#endif
diff --git a/kernel/trace/trace_irqsoff.c b/kernel/trace/trace_irqsoff.c
index ba37f768e2f2..fce064e20570 100644
--- a/kernel/trace/trace_irqsoff.c
+++ b/kernel/trace/trace_irqsoff.c
@@ -175,7 +175,8 @@ static int irqsoff_display_graph(struct trace_array *tr, int set)
return start_irqsoff_tracer(irqsoff_trace, set);
}
-static int irqsoff_graph_entry(struct ftrace_graph_ent *trace)
+static int irqsoff_graph_entry(struct ftrace_graph_ent *trace,
+ struct fgraph_ops *gops)
{
struct trace_array *tr = irqsoff_trace;
struct trace_array_cpu *data;
@@ -183,7 +184,7 @@ static int irqsoff_graph_entry(struct ftrace_graph_ent *trace)
unsigned int trace_ctx;
int ret;
- if (ftrace_graph_ignore_func(trace))
+ if (ftrace_graph_ignore_func(gops, trace))
return 0;
/*
* Do not trace a function if it's filtered by set_graph_notrace.
@@ -205,14 +206,15 @@ static int irqsoff_graph_entry(struct ftrace_graph_ent *trace)
return ret;
}
-static void irqsoff_graph_return(struct ftrace_graph_ret *trace)
+static void irqsoff_graph_return(struct ftrace_graph_ret *trace,
+ struct fgraph_ops *gops)
{
struct trace_array *tr = irqsoff_trace;
struct trace_array_cpu *data;
unsigned long flags;
unsigned int trace_ctx;
- ftrace_graph_addr_finish(trace);
+ ftrace_graph_addr_finish(gops, trace);
if (!func_prolog_dec(tr, &data, &flags))
return;
diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c
index 16383247bdbf..61a6da808203 100644
--- a/kernel/trace/trace_kprobe.c
+++ b/kernel/trace/trace_kprobe.c
@@ -678,6 +678,21 @@ end:
}
#ifdef CONFIG_MODULES
+static int validate_module_probe_symbol(const char *modname, const char *symbol);
+
+static int register_module_trace_kprobe(struct module *mod, struct trace_kprobe *tk)
+{
+ const char *p;
+ int ret = 0;
+
+ p = strchr(trace_kprobe_symbol(tk), ':');
+ if (p)
+ ret = validate_module_probe_symbol(module_name(mod), p + 1);
+ if (!ret)
+ ret = __register_trace_kprobe(tk);
+ return ret;
+}
+
/* Module notifier call back, checking event on the module */
static int trace_kprobe_module_callback(struct notifier_block *nb,
unsigned long val, void *data)
@@ -696,7 +711,7 @@ static int trace_kprobe_module_callback(struct notifier_block *nb,
if (trace_kprobe_within_module(tk, mod)) {
/* Don't need to check busy - this should have gone. */
__unregister_trace_kprobe(tk);
- ret = __register_trace_kprobe(tk);
+ ret = register_module_trace_kprobe(mod, tk);
if (ret)
pr_warn("Failed to re-register probe %s on %s: %d\n",
trace_probe_name(&tk->tp),
@@ -747,17 +762,81 @@ static int count_mod_symbols(void *data, const char *name, unsigned long unused)
return 0;
}
-static unsigned int number_of_same_symbols(char *func_name)
+static unsigned int number_of_same_symbols(const char *mod, const char *func_name)
{
struct sym_count_ctx ctx = { .count = 0, .name = func_name };
- kallsyms_on_each_match_symbol(count_symbols, func_name, &ctx.count);
+ if (!mod)
+ kallsyms_on_each_match_symbol(count_symbols, func_name, &ctx.count);
- module_kallsyms_on_each_symbol(NULL, count_mod_symbols, &ctx);
+ module_kallsyms_on_each_symbol(mod, count_mod_symbols, &ctx);
return ctx.count;
}
+static int validate_module_probe_symbol(const char *modname, const char *symbol)
+{
+ unsigned int count = number_of_same_symbols(modname, symbol);
+
+ if (count > 1) {
+ /*
+ * Users should use ADDR to remove the ambiguity of
+ * using KSYM only.
+ */
+ return -EADDRNOTAVAIL;
+ } else if (count == 0) {
+ /*
+ * We can return ENOENT earlier than when registering the
+ * kprobe.
+ */
+ return -ENOENT;
+ }
+ return 0;
+}
+
+#ifdef CONFIG_MODULES
+/* Return NULL if the module is not loaded or is being unloaded. */
+static struct module *try_module_get_by_name(const char *name)
+{
+ struct module *mod;
+
+ rcu_read_lock_sched();
+ mod = find_module(name);
+ if (mod && !try_module_get(mod))
+ mod = NULL;
+ rcu_read_unlock_sched();
+
+ return mod;
+}
+#else
+#define try_module_get_by_name(name) (NULL)
+#endif
+
+static int validate_probe_symbol(char *symbol)
+{
+ struct module *mod = NULL;
+ char *modname = NULL, *p;
+ int ret = 0;
+
+ p = strchr(symbol, ':');
+ if (p) {
+ modname = symbol;
+ symbol = p + 1;
+ *p = '\0';
+ mod = try_module_get_by_name(modname);
+ if (!mod)
+ goto out;
+ }
+
+ ret = validate_module_probe_symbol(modname, symbol);
+out:
+ if (p)
+ *p = ':';
+ if (mod)
+ module_put(mod);
+ return ret;
+}
+
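The split/validate/restore dance in validate_probe_symbol() boils down to the following stand-alone sketch (demo_* names invented; the module lookup and symbol counting are replaced by a trivial stub):

#include <stdio.h>
#include <string.h>

static int demo_validate(const char *modname, const char *symbol)
{
	/* Stand-in for the real lookup; reject empty names for the demo. */
	if (modname && !*modname)
		return -1;
	return *symbol ? 0 : -1;
}

static int demo_validate_spec(char *spec)
{
	char *modname = NULL, *symbol = spec, *p;
	int ret;

	p = strchr(spec, ':');
	if (p) {
		modname = spec;
		symbol = p + 1;
		*p = '\0';            /* temporarily cut the string in two */
	}

	ret = demo_validate(modname, symbol);

	if (p)
		*p = ':';             /* put the caller's buffer back */
	return ret;
}

int main(void)
{
	char spec1[] = "mymod:my_func";
	char spec2[] = "plain_symbol";

	printf("%s -> %d\n", spec1, demo_validate_spec(spec1));
	printf("%s -> %d\n", spec2, demo_validate_spec(spec2));
	printf("buffer restored: %s\n", spec1);
	return 0;
}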
static int trace_kprobe_entry_handler(struct kretprobe_instance *ri,
struct pt_regs *regs);
@@ -881,6 +960,14 @@ static int __trace_kprobe_create(int argc, const char *argv[])
trace_probe_log_err(0, BAD_PROBE_ADDR);
goto parse_error;
}
+ ret = validate_probe_symbol(symbol);
+ if (ret) {
+ if (ret == -EADDRNOTAVAIL)
+ trace_probe_log_err(0, NON_UNIQ_SYMBOL);
+ else
+ trace_probe_log_err(0, BAD_PROBE_ADDR);
+ goto parse_error;
+ }
if (is_return)
ctx.flags |= TPARG_FL_RETURN;
ret = kprobe_on_func_entry(NULL, symbol, offset);
@@ -893,31 +980,6 @@ static int __trace_kprobe_create(int argc, const char *argv[])
}
}
- if (symbol && !strchr(symbol, ':')) {
- unsigned int count;
-
- count = number_of_same_symbols(symbol);
- if (count > 1) {
- /*
- * Users should use ADDR to remove the ambiguity of
- * using KSYM only.
- */
- trace_probe_log_err(0, NON_UNIQ_SYMBOL);
- ret = -EADDRNOTAVAIL;
-
- goto error;
- } else if (count == 0) {
- /*
- * We can return ENOENT earlier than when register the
- * kprobe.
- */
- trace_probe_log_err(0, BAD_PROBE_ADDR);
- ret = -ENOENT;
-
- goto error;
- }
- }
-
trace_probe_log_set_index(0);
if (event) {
ret = traceprobe_parse_event_name(&event, &group, gbuf,
@@ -1835,21 +1897,9 @@ create_local_trace_kprobe(char *func, void *addr, unsigned long offs,
char *event;
if (func) {
- unsigned int count;
-
- count = number_of_same_symbols(func);
- if (count > 1)
- /*
- * Users should use addr to remove the ambiguity of
- * using func only.
- */
- return ERR_PTR(-EADDRNOTAVAIL);
- else if (count == 0)
- /*
- * We can return ENOENT earlier than when register the
- * kprobe.
- */
- return ERR_PTR(-ENOENT);
+ ret = validate_probe_symbol(func);
+ if (ret)
+ return ERR_PTR(ret);
}
/*
@@ -2023,19 +2073,16 @@ static __init int kprobe_trace_self_tests_init(void)
pr_info("Testing kprobe tracing: ");
ret = create_or_delete_trace_kprobe("p:testprobe kprobe_trace_selftest_target $stack $stack0 +0($stack)");
- if (WARN_ON_ONCE(ret)) {
- pr_warn("error on probing function entry.\n");
+ if (WARN_ONCE(ret, "error on probing function entry.")) {
warn++;
} else {
/* Enable trace point */
tk = find_trace_kprobe("testprobe", KPROBE_EVENT_SYSTEM);
- if (WARN_ON_ONCE(tk == NULL)) {
- pr_warn("error on getting new probe.\n");
+ if (WARN_ONCE(tk == NULL, "error on getting new probe.")) {
warn++;
} else {
file = find_trace_probe_file(tk, top_trace_array());
- if (WARN_ON_ONCE(file == NULL)) {
- pr_warn("error on getting probe file.\n");
+ if (WARN_ONCE(file == NULL, "error on getting probe file.")) {
warn++;
} else
enable_trace_kprobe(
@@ -2044,19 +2091,16 @@ static __init int kprobe_trace_self_tests_init(void)
}
ret = create_or_delete_trace_kprobe("r:testprobe2 kprobe_trace_selftest_target $retval");
- if (WARN_ON_ONCE(ret)) {
- pr_warn("error on probing function return.\n");
+ if (WARN_ONCE(ret, "error on probing function return.")) {
warn++;
} else {
/* Enable trace point */
tk = find_trace_kprobe("testprobe2", KPROBE_EVENT_SYSTEM);
- if (WARN_ON_ONCE(tk == NULL)) {
- pr_warn("error on getting 2nd new probe.\n");
+ if (WARN_ONCE(tk == NULL, "error on getting 2nd new probe.")) {
warn++;
} else {
file = find_trace_probe_file(tk, top_trace_array());
- if (WARN_ON_ONCE(file == NULL)) {
- pr_warn("error on getting probe file.\n");
+ if (WARN_ONCE(file == NULL, "error on getting probe file.")) {
warn++;
} else
enable_trace_kprobe(
@@ -2079,18 +2123,15 @@ static __init int kprobe_trace_self_tests_init(void)
/* Disable trace points before removing it */
tk = find_trace_kprobe("testprobe", KPROBE_EVENT_SYSTEM);
- if (WARN_ON_ONCE(tk == NULL)) {
- pr_warn("error on getting test probe.\n");
+ if (WARN_ONCE(tk == NULL, "error on getting test probe.")) {
warn++;
} else {
- if (trace_kprobe_nhit(tk) != 1) {
- pr_warn("incorrect number of testprobe hits\n");
+ if (WARN_ONCE(trace_kprobe_nhit(tk) != 1,
+ "incorrect number of testprobe hits."))
warn++;
- }
file = find_trace_probe_file(tk, top_trace_array());
- if (WARN_ON_ONCE(file == NULL)) {
- pr_warn("error on getting probe file.\n");
+ if (WARN_ONCE(file == NULL, "error on getting probe file.")) {
warn++;
} else
disable_trace_kprobe(
@@ -2098,18 +2139,15 @@ static __init int kprobe_trace_self_tests_init(void)
}
tk = find_trace_kprobe("testprobe2", KPROBE_EVENT_SYSTEM);
- if (WARN_ON_ONCE(tk == NULL)) {
- pr_warn("error on getting 2nd test probe.\n");
+ if (WARN_ONCE(tk == NULL, "error on getting 2nd test probe.")) {
warn++;
} else {
- if (trace_kprobe_nhit(tk) != 1) {
- pr_warn("incorrect number of testprobe2 hits\n");
+ if (WARN_ONCE(trace_kprobe_nhit(tk) != 1,
+ "incorrect number of testprobe2 hits."))
warn++;
- }
file = find_trace_probe_file(tk, top_trace_array());
- if (WARN_ON_ONCE(file == NULL)) {
- pr_warn("error on getting probe file.\n");
+ if (WARN_ONCE(file == NULL, "error on getting probe file.")) {
warn++;
} else
disable_trace_kprobe(
@@ -2117,23 +2155,15 @@ static __init int kprobe_trace_self_tests_init(void)
}
ret = create_or_delete_trace_kprobe("-:testprobe");
- if (WARN_ON_ONCE(ret)) {
- pr_warn("error on deleting a probe.\n");
+ if (WARN_ONCE(ret, "error on deleting a probe."))
warn++;
- }
ret = create_or_delete_trace_kprobe("-:testprobe2");
- if (WARN_ON_ONCE(ret)) {
- pr_warn("error on deleting a probe.\n");
+ if (WARN_ONCE(ret, "error on deleting a probe."))
warn++;
- }
+
end:
- ret = dyn_events_release_all(&trace_kprobe_ops);
- if (WARN_ON_ONCE(ret)) {
- pr_warn("error on cleaning up probes.\n");
- warn++;
- }
/*
* Wait for the optimizer work to finish. Otherwise it might fiddle
* with probes in already freed __init text.
diff --git a/kernel/trace/trace_osnoise.c b/kernel/trace/trace_osnoise.c
index a8e28f9b9271..66a871553d4a 100644
--- a/kernel/trace/trace_osnoise.c
+++ b/kernel/trace/trace_osnoise.c
@@ -1444,9 +1444,9 @@ static int run_osnoise(void)
save_osn_sample_stats(osn_var, &s);
/*
- * if threshold is 0, use the default value of 5 us.
+ * if threshold is 0, use the default value of 1 us.
*/
- threshold = tracing_thresh ? : 5000;
+ threshold = tracing_thresh ? : 1000;
/*
* Apply PREEMPT and IRQ disabled options.
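[Editorial note] The osnoise hunk only changes the fallback constant; the tracing_thresh ? : 1000 form is the GNU "a ?: b" operator, which yields a when it is non-zero and b otherwise, evaluating a only once. A small userspace sketch of the same default-threshold idiom (variable names are illustrative):

#include <stdio.h>

static unsigned long tracing_thresh;	/* 0 means "use the default" */

static unsigned long effective_threshold_ns(void)
{
	/* GNU extension: x ? : y == x ? x : y, with x evaluated once. */
	return tracing_thresh ? : 1000;	/* default to 1 us (1000 ns) */
}

int main(void)
{
	printf("%lu\n", effective_threshold_ns());	/* 1000 */
	tracing_thresh = 5000;
	printf("%lu\n", effective_threshold_ns());	/* 5000 */
	return 0;
}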
diff --git a/kernel/trace/trace_sched_wakeup.c b/kernel/trace/trace_sched_wakeup.c
index 0469a04a355f..130ca7e7787e 100644
--- a/kernel/trace/trace_sched_wakeup.c
+++ b/kernel/trace/trace_sched_wakeup.c
@@ -112,14 +112,15 @@ static int wakeup_display_graph(struct trace_array *tr, int set)
return start_func_tracer(tr, set);
}
-static int wakeup_graph_entry(struct ftrace_graph_ent *trace)
+static int wakeup_graph_entry(struct ftrace_graph_ent *trace,
+ struct fgraph_ops *gops)
{
struct trace_array *tr = wakeup_trace;
struct trace_array_cpu *data;
unsigned int trace_ctx;
int ret = 0;
- if (ftrace_graph_ignore_func(trace))
+ if (ftrace_graph_ignore_func(gops, trace))
return 0;
/*
* Do not trace a function if it's filtered by set_graph_notrace.
@@ -141,13 +142,14 @@ static int wakeup_graph_entry(struct ftrace_graph_ent *trace)
return ret;
}
-static void wakeup_graph_return(struct ftrace_graph_ret *trace)
+static void wakeup_graph_return(struct ftrace_graph_ret *trace,
+ struct fgraph_ops *gops)
{
struct trace_array *tr = wakeup_trace;
struct trace_array_cpu *data;
unsigned int trace_ctx;
- ftrace_graph_addr_finish(trace);
+ ftrace_graph_addr_finish(gops, trace);
if (!func_prolog_preempt_disable(tr, &data, &trace_ctx))
return;
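[Editorial note] The new fgraph_ops parameter lets one entry/return callback serve multiple graph-tracer instances: the callback receives the ops it was registered with and recovers per-instance state from it instead of a global (the selftest below drops set_graph_array(tr) in favour of fgraph_ops.private = tr for the same reason). A hedged userspace sketch of the general callback-with-ops pattern; the struct and function names here are invented for illustration:

#include <stdio.h>

struct my_ops;				/* forward declaration */

typedef int (*entry_fn)(int event, struct my_ops *ops);

struct my_ops {
	entry_fn entry;			/* callback */
	void *private;			/* per-instance state, like gops->private */
};

static int count_events(int event, struct my_ops *ops)
{
	int *counter = ops->private;	/* recover per-instance state */
	(*counter)++;
	return event;
}

int main(void)
{
	int a = 0, b = 0;
	struct my_ops first  = { .entry = count_events, .private = &a };
	struct my_ops second = { .entry = count_events, .private = &b };

	first.entry(1, &first);
	second.entry(2, &second);
	second.entry(3, &second);

	printf("a=%d b=%d\n", a, b);	/* a=1 b=2 */
	return 0;
}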
diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c
index e9c5058a8efd..97f1e4bc47dc 100644
--- a/kernel/trace/trace_selftest.c
+++ b/kernel/trace/trace_selftest.c
@@ -756,13 +756,262 @@ trace_selftest_startup_function(struct tracer *trace, struct trace_array *tr)
#ifdef CONFIG_FUNCTION_GRAPH_TRACER
+#ifdef CONFIG_DYNAMIC_FTRACE
+
+#define CHAR_NUMBER 123
+#define SHORT_NUMBER 12345
+#define WORD_NUMBER 1234567890
+#define LONG_NUMBER 1234567890123456789LL
+#define ERRSTR_BUFLEN 128
+
+struct fgraph_fixture {
+ struct fgraph_ops gops;
+ int store_size;
+ const char *store_type_name;
+ char error_str_buf[ERRSTR_BUFLEN];
+ char *error_str;
+};
+
+static __init int store_entry(struct ftrace_graph_ent *trace,
+ struct fgraph_ops *gops)
+{
+ struct fgraph_fixture *fixture = container_of(gops, struct fgraph_fixture, gops);
+ const char *type = fixture->store_type_name;
+ int size = fixture->store_size;
+ void *p;
+
+ p = fgraph_reserve_data(gops->idx, size);
+ if (!p) {
+ snprintf(fixture->error_str_buf, ERRSTR_BUFLEN,
+ "Failed to reserve %s\n", type);
+ return 0;
+ }
+
+ switch (size) {
+ case 1:
+ *(char *)p = CHAR_NUMBER;
+ break;
+ case 2:
+ *(short *)p = SHORT_NUMBER;
+ break;
+ case 4:
+ *(int *)p = WORD_NUMBER;
+ break;
+ case 8:
+ *(long long *)p = LONG_NUMBER;
+ break;
+ }
+
+ return 1;
+}
+
+static __init void store_return(struct ftrace_graph_ret *trace,
+ struct fgraph_ops *gops)
+{
+ struct fgraph_fixture *fixture = container_of(gops, struct fgraph_fixture, gops);
+ const char *type = fixture->store_type_name;
+ long long expect = 0;
+ long long found = -1;
+ int size;
+ char *p;
+
+ p = fgraph_retrieve_data(gops->idx, &size);
+ if (!p) {
+ snprintf(fixture->error_str_buf, ERRSTR_BUFLEN,
+ "Failed to retrieve %s\n", type);
+ return;
+ }
+ if (fixture->store_size > size) {
+ snprintf(fixture->error_str_buf, ERRSTR_BUFLEN,
+ "Retrieved size %d is smaller than expected %d\n",
+ size, (int)fixture->store_size);
+ return;
+ }
+
+ switch (fixture->store_size) {
+ case 1:
+ expect = CHAR_NUMBER;
+ found = *(char *)p;
+ break;
+ case 2:
+ expect = SHORT_NUMBER;
+ found = *(short *)p;
+ break;
+ case 4:
+ expect = WORD_NUMBER;
+ found = *(int *)p;
+ break;
+ case 8:
+ expect = LONG_NUMBER;
+ found = *(long long *)p;
+ break;
+ }
+
+ if (found != expect) {
+ snprintf(fixture->error_str_buf, ERRSTR_BUFLEN,
+ "%s returned not %lld but %lld\n", type, expect, found);
+ return;
+ }
+ fixture->error_str = NULL;
+}
+
+static int __init init_fgraph_fixture(struct fgraph_fixture *fixture)
+{
+ char *func_name;
+ int len;
+
+ snprintf(fixture->error_str_buf, ERRSTR_BUFLEN,
+ "Failed to execute storage %s\n", fixture->store_type_name);
+ fixture->error_str = fixture->error_str_buf;
+
+ func_name = "*" __stringify(DYN_FTRACE_TEST_NAME);
+ len = strlen(func_name);
+
+ return ftrace_set_filter(&fixture->gops.ops, func_name, len, 1);
+}
+
+/* Test fgraph storage for each size */
+static int __init test_graph_storage_single(struct fgraph_fixture *fixture)
+{
+ int size = fixture->store_size;
+ int ret;
+
+ pr_cont("PASSED\n");
+ pr_info("Testing fgraph storage of %d byte%s: ", size, str_plural(size));
+
+ ret = init_fgraph_fixture(fixture);
+ if (ret && ret != -ENODEV) {
+ pr_cont("*Could not set filter* ");
+ return -1;
+ }
+
+ ret = register_ftrace_graph(&fixture->gops);
+ if (ret) {
+ pr_warn("Failed to init store_bytes fgraph tracing\n");
+ return -1;
+ }
+
+ DYN_FTRACE_TEST_NAME();
+
+ unregister_ftrace_graph(&fixture->gops);
+
+ if (fixture->error_str) {
+ pr_cont("*** %s ***", fixture->error_str);
+ return -1;
+ }
+
+ return 0;
+}
+
+static struct fgraph_fixture store_bytes[4] __initdata = {
+ [0] = {
+ .gops = {
+ .entryfunc = store_entry,
+ .retfunc = store_return,
+ },
+ .store_size = 1,
+ .store_type_name = "byte",
+ },
+ [1] = {
+ .gops = {
+ .entryfunc = store_entry,
+ .retfunc = store_return,
+ },
+ .store_size = 2,
+ .store_type_name = "short",
+ },
+ [2] = {
+ .gops = {
+ .entryfunc = store_entry,
+ .retfunc = store_return,
+ },
+ .store_size = 4,
+ .store_type_name = "word",
+ },
+ [3] = {
+ .gops = {
+ .entryfunc = store_entry,
+ .retfunc = store_return,
+ },
+ .store_size = 8,
+ .store_type_name = "long long",
+ },
+};
+
+static __init int test_graph_storage_multi(void)
+{
+ struct fgraph_fixture *fixture;
+ bool printed = false;
+ int i, ret;
+
+ pr_cont("PASSED\n");
+ pr_info("Testing multiple fgraph storage on a function: ");
+
+ for (i = 0; i < ARRAY_SIZE(store_bytes); i++) {
+ fixture = &store_bytes[i];
+ ret = init_fgraph_fixture(fixture);
+ if (ret && ret != -ENODEV) {
+ pr_cont("*Could not set filter* ");
+ printed = true;
+ goto out;
+ }
+
+ ret = register_ftrace_graph(&fixture->gops);
+ if (ret) {
+ pr_warn("Failed to init store_bytes fgraph tracing\n");
+ printed = true;
+ goto out;
+ }
+ }
+
+ DYN_FTRACE_TEST_NAME();
+out:
+ while (--i >= 0) {
+ fixture = &store_bytes[i];
+ unregister_ftrace_graph(&fixture->gops);
+
+ if (fixture->error_str && !printed) {
+ pr_cont("*** %s ***", fixture->error_str);
+ printed = true;
+ }
+ }
+ return printed ? -1 : 0;
+}
+
+/* Test the storage passed across function_graph entry and return */
+static __init int test_graph_storage(void)
+{
+ int ret;
+
+ ret = test_graph_storage_single(&store_bytes[0]);
+ if (ret)
+ return ret;
+ ret = test_graph_storage_single(&store_bytes[1]);
+ if (ret)
+ return ret;
+ ret = test_graph_storage_single(&store_bytes[2]);
+ if (ret)
+ return ret;
+ ret = test_graph_storage_single(&store_bytes[3]);
+ if (ret)
+ return ret;
+ ret = test_graph_storage_multi();
+ if (ret)
+ return ret;
+ return 0;
+}
+#else
+static inline int test_graph_storage(void) { return 0; }
+#endif /* CONFIG_DYNAMIC_FTRACE */
+
/* Maximum number of functions to trace before diagnosing a hang */
#define GRAPH_MAX_FUNC_TEST 100000000
static unsigned int graph_hang_thresh;
/* Wrap the real function entry probe to avoid possible hanging */
-static int trace_graph_entry_watchdog(struct ftrace_graph_ent *trace)
+static int trace_graph_entry_watchdog(struct ftrace_graph_ent *trace,
+ struct fgraph_ops *gops)
{
/* This is harmlessly racy, we want to approximately detect a hang */
if (unlikely(++graph_hang_thresh > GRAPH_MAX_FUNC_TEST)) {
@@ -776,7 +1025,7 @@ static int trace_graph_entry_watchdog(struct ftrace_graph_ent *trace)
return 0;
}
- return trace_graph_entry(trace);
+ return trace_graph_entry(trace, gops);
}
static struct fgraph_ops fgraph_ops __initdata = {
@@ -812,7 +1061,7 @@ trace_selftest_startup_function_graph(struct tracer *trace,
* to detect and recover from possible hangs
*/
tracing_reset_online_cpus(&tr->array_buffer);
- set_graph_array(tr);
+ fgraph_ops.private = tr;
ret = register_ftrace_graph(&fgraph_ops);
if (ret) {
warn_failed_init_tracer(trace, ret);
@@ -855,7 +1104,7 @@ trace_selftest_startup_function_graph(struct tracer *trace,
cond_resched();
tracing_reset_online_cpus(&tr->array_buffer);
- set_graph_array(tr);
+ fgraph_ops.private = tr;
/*
* Some archs *cough*PowerPC*cough* add characters to the
@@ -912,6 +1161,8 @@ trace_selftest_startup_function_graph(struct tracer *trace,
ftrace_set_global_filter(NULL, 0, 1);
#endif
+ ret = test_graph_storage();
+
/* Don't test dynamic tracing, the function tracer already did */
out:
/* Stop it if we failed */
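[Editorial note] store_entry() and store_return() above recover their fgraph_fixture from the embedded gops with container_of(), which is how per-fixture state (store_size, error_str_buf) travels through the generic callback signature. A standalone sketch of the same technique; the container_of macro is spelled out here in a simplified userspace form since there is no <linux/kernel.h>:

#include <stddef.h>
#include <stdio.h>

/* Simplified userspace spelling of the kernel macro: map a member pointer
 * back to the structure that embeds it. */
#define container_of(ptr, type, member) \
	((type *)((char *)(ptr) - offsetof(type, member)))

struct ops {
	int dummy;
};

struct fixture {
	struct ops gops;	/* embedded, like fgraph_fixture.gops */
	int store_size;
	const char *name;
};

static void callback(struct ops *gops)
{
	struct fixture *fx = container_of(gops, struct fixture, gops);

	printf("fixture %s, size %d\n", fx->name, fx->store_size);
}

int main(void)
{
	struct fixture fx = { .store_size = 8, .name = "long long" };

	callback(&fx.gops);	/* only the embedded ops is passed around */
	return 0;
}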
diff --git a/kernel/trace/trace_stack.c b/kernel/trace/trace_stack.c
index 5a48dba912ea..7f9572a37333 100644
--- a/kernel/trace/trace_stack.c
+++ b/kernel/trace/trace_stack.c
@@ -514,7 +514,7 @@ static const struct file_operations stack_trace_filter_fops = {
#endif /* CONFIG_DYNAMIC_FTRACE */
int
-stack_trace_sysctl(struct ctl_table *table, int write, void *buffer,
+stack_trace_sysctl(const struct ctl_table *table, int write, void *buffer,
size_t *lenp, loff_t *ppos)
{
int was_enabled;
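[Editorial note] stack_trace_sysctl() is one of several handlers in this series switched to take a const struct ctl_table *, documenting that proc handlers only read the table descriptor while mutating state through the data pointer it carries. A minimal sketch of that const-correct handler shape; the types and names are simplified stand-ins, not the sysctl API:

#include <stdio.h>

struct table {
	const char *procname;
	int *data;
};

/* The handler may read the descriptor but not modify it; writable state
 * is reached through the data pointer the descriptor carries. */
static int handler(const struct table *t, int write, int value)
{
	if (write)
		*t->data = value;
	return *t->data;
}

int main(void)
{
	int enabled = 0;
	struct table t = { .procname = "stack_tracer_enabled", .data = &enabled };

	handler(&t, 1, 1);
	printf("%s = %d\n", t.procname, handler(&t, 0, 0));	/* = 1 */
	return 0;
}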
diff --git a/kernel/tsacct.c b/kernel/tsacct.c
index 4252f0645b9e..16b283f9d831 100644
--- a/kernel/tsacct.c
+++ b/kernel/tsacct.c
@@ -76,7 +76,7 @@ void bacct_add_tsk(struct user_namespace *user_ns,
stats->ac_minflt = tsk->min_flt;
stats->ac_majflt = tsk->maj_flt;
- strncpy(stats->ac_comm, tsk->comm, sizeof(stats->ac_comm));
+ strscpy_pad(stats->ac_comm, tsk->comm);
}
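[Editorial note] The tsacct change swaps strncpy() for strscpy_pad(), which guarantees NUL termination and zero-fills the remainder of the destination so no stale bytes leak into the accounting record. A userspace sketch of the padded-copy semantics; this illustrates the behaviour only and is not the kernel implementation:

#include <stdio.h>
#include <string.h>

/* Copy at most size-1 bytes, always NUL terminate, and zero the tail,
 * roughly what strscpy_pad() guarantees. Returns the copied length. */
static size_t copy_pad(char *dst, const char *src, size_t size)
{
	size_t len = strnlen(src, size - 1);

	memcpy(dst, src, len);
	memset(dst + len, 0, size - len);	/* terminator + padding */
	return len;
}

int main(void)
{
	char comm[16];

	memset(comm, 'X', sizeof(comm));	/* simulate stale stack bytes */
	copy_pad(comm, "kworker/0:1", sizeof(comm));
	printf("'%s', last byte = %d\n", comm, comm[sizeof(comm) - 1]); /* 0 */
	return 0;
}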
diff --git a/kernel/umh.c b/kernel/umh.c
index 598b3ffe1522..ff1f13a27d29 100644
--- a/kernel/umh.c
+++ b/kernel/umh.c
@@ -495,7 +495,7 @@ int call_usermodehelper(const char *path, char **argv, char **envp, int wait)
EXPORT_SYMBOL(call_usermodehelper);
#if defined(CONFIG_SYSCTL)
-static int proc_cap_handler(struct ctl_table *table, int write,
+static int proc_cap_handler(const struct ctl_table *table, int write,
void *buffer, size_t *lenp, loff_t *ppos)
{
struct ctl_table t;
diff --git a/kernel/utsname_sysctl.c b/kernel/utsname_sysctl.c
index 76a772072557..7282f61a8650 100644
--- a/kernel/utsname_sysctl.c
+++ b/kernel/utsname_sysctl.c
@@ -15,7 +15,7 @@
#ifdef CONFIG_PROC_SYSCTL
-static void *get_uts(struct ctl_table *table)
+static void *get_uts(const struct ctl_table *table)
{
char *which = table->data;
struct uts_namespace *uts_ns;
@@ -30,7 +30,7 @@ static void *get_uts(struct ctl_table *table)
* Special case of dostring for the UTS structure. This has locks
* to observe. Should this be in kernel/sys.c ????
*/
-static int proc_do_uts_string(struct ctl_table *table, int write,
+static int proc_do_uts_string(const struct ctl_table *table, int write,
void *buffer, size_t *lenp, loff_t *ppos)
{
struct ctl_table uts_table;
diff --git a/kernel/vmcore_info.c b/kernel/vmcore_info.c
index 1d5eadd9dd61..8b4f8cc2e0ec 100644
--- a/kernel/vmcore_info.c
+++ b/kernel/vmcore_info.c
@@ -216,12 +216,8 @@ static int __init crash_save_vmcoreinfo_init(void)
VMCOREINFO_SYMBOL(kallsyms_num_syms);
VMCOREINFO_SYMBOL(kallsyms_token_table);
VMCOREINFO_SYMBOL(kallsyms_token_index);
-#ifdef CONFIG_KALLSYMS_BASE_RELATIVE
VMCOREINFO_SYMBOL(kallsyms_offsets);
VMCOREINFO_SYMBOL(kallsyms_relative_base);
-#else
- VMCOREINFO_SYMBOL(kallsyms_addresses);
-#endif /* CONFIG_KALLSYMS_BASE_RELATIVE */
#endif /* CONFIG_KALLSYMS */
arch_crash_save_vmcoreinfo();
diff --git a/kernel/watchdog.c b/kernel/watchdog.c
index 51915b44ac73..830a83895493 100644
--- a/kernel/watchdog.c
+++ b/kernel/watchdog.c
@@ -983,7 +983,7 @@ static void proc_watchdog_update(void)
* -------------------|----------------------------------|-------------------------------
* proc_soft_watchdog | watchdog_softlockup_user_enabled | WATCHDOG_SOFTOCKUP_ENABLED
*/
-static int proc_watchdog_common(int which, struct ctl_table *table, int write,
+static int proc_watchdog_common(int which, const struct ctl_table *table, int write,
void *buffer, size_t *lenp, loff_t *ppos)
{
int err, old, *param = table->data;
@@ -1010,7 +1010,7 @@ static int proc_watchdog_common(int which, struct ctl_table *table, int write,
/*
* /proc/sys/kernel/watchdog
*/
-static int proc_watchdog(struct ctl_table *table, int write,
+static int proc_watchdog(const struct ctl_table *table, int write,
void *buffer, size_t *lenp, loff_t *ppos)
{
return proc_watchdog_common(WATCHDOG_HARDLOCKUP_ENABLED |
@@ -1021,7 +1021,7 @@ static int proc_watchdog(struct ctl_table *table, int write,
/*
* /proc/sys/kernel/nmi_watchdog
*/
-static int proc_nmi_watchdog(struct ctl_table *table, int write,
+static int proc_nmi_watchdog(const struct ctl_table *table, int write,
void *buffer, size_t *lenp, loff_t *ppos)
{
if (!watchdog_hardlockup_available && write)
@@ -1034,7 +1034,7 @@ static int proc_nmi_watchdog(struct ctl_table *table, int write,
/*
* /proc/sys/kernel/soft_watchdog
*/
-static int proc_soft_watchdog(struct ctl_table *table, int write,
+static int proc_soft_watchdog(const struct ctl_table *table, int write,
void *buffer, size_t *lenp, loff_t *ppos)
{
return proc_watchdog_common(WATCHDOG_SOFTOCKUP_ENABLED,
@@ -1045,7 +1045,7 @@ static int proc_soft_watchdog(struct ctl_table *table, int write,
/*
* /proc/sys/kernel/watchdog_thresh
*/
-static int proc_watchdog_thresh(struct ctl_table *table, int write,
+static int proc_watchdog_thresh(const struct ctl_table *table, int write,
void *buffer, size_t *lenp, loff_t *ppos)
{
int err, old;
@@ -1068,7 +1068,7 @@ static int proc_watchdog_thresh(struct ctl_table *table, int write,
* user to specify a mask that will include cpus that have not yet
* been brought online, if desired.
*/
-static int proc_watchdog_cpumask(struct ctl_table *table, int write,
+static int proc_watchdog_cpumask(const struct ctl_table *table, int write,
void *buffer, size_t *lenp, loff_t *ppos)
{
int err;
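[Editorial note] The watchdog handlers above all funnel into proc_watchdog_common(), parameterized by a "which" bitmask, which is why constifying one signature touches the whole family. A small sketch of that dispatch shape; the flag names and wrapper functions below are invented for illustration:

#include <stdio.h>

#define HARDLOCKUP_ENABLED	0x1
#define SOFTLOCKUP_ENABLED	0x2

static int watchdog_enabled;

/* One common handler; the thin wrappers differ only in the bits passed. */
static int watchdog_common(int which, int write, int value)
{
	if (write)
		watchdog_enabled = value ? (watchdog_enabled | which)
					 : (watchdog_enabled & ~which);
	return !!(watchdog_enabled & which);
}

static int nmi_watchdog(int write, int value)
{
	return watchdog_common(HARDLOCKUP_ENABLED, write, value);
}

static int soft_watchdog(int write, int value)
{
	return watchdog_common(SOFTLOCKUP_ENABLED, write, value);
}

int main(void)
{
	nmi_watchdog(1, 1);
	printf("hard=%d soft=%d\n", nmi_watchdog(0, 0),
	       soft_watchdog(0, 0));	/* hard=1 soft=0 */
	return 0;
}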
diff --git a/kernel/watchdog_perf.c b/kernel/watchdog_perf.c
index d577c4a8321e..59c1d86a73a2 100644
--- a/kernel/watchdog_perf.c
+++ b/kernel/watchdog_perf.c
@@ -75,11 +75,15 @@ static bool watchdog_check_timestamp(void)
__this_cpu_write(last_timestamp, now);
return true;
}
-#else
-static inline bool watchdog_check_timestamp(void)
+
+static void watchdog_init_timestamp(void)
{
- return true;
+ __this_cpu_write(nmi_rearmed, 0);
+ __this_cpu_write(last_timestamp, ktime_get_mono_fast_ns());
}
+#else
+static inline bool watchdog_check_timestamp(void) { return true; }
+static inline void watchdog_init_timestamp(void) { }
#endif
static struct perf_event_attr wd_hw_attr = {
@@ -161,6 +165,7 @@ void watchdog_hardlockup_enable(unsigned int cpu)
if (!atomic_fetch_inc(&watchdog_cpus))
pr_info("Enabled. Permanently consumes one hw-PMU counter.\n");
+ watchdog_init_timestamp();
perf_event_enable(this_cpu_read(watchdog_ev));
}
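[Editorial note] watchdog_perf gains a real watchdog_init_timestamp() only when timestamp checking is configured in; the #else branch keeps empty static inline stubs so callers never need their own #ifdefs. A compact sketch of that stub pattern; the CONFIG macro and function names here are placeholders:

#include <stdio.h>

/* #define CONFIG_TIMESTAMP_CHECK 1 */	/* flip to enable the real code */

#ifdef CONFIG_TIMESTAMP_CHECK
static unsigned long last_timestamp;

static int check_timestamp(unsigned long now)
{
	return now - last_timestamp >= 1000;
}

static void init_timestamp(unsigned long now)
{
	last_timestamp = now;
}
#else
/* Stubs: callers stay #ifdef-free and the compiler drops the calls. */
static inline int check_timestamp(unsigned long now) { (void)now; return 1; }
static inline void init_timestamp(unsigned long now) { (void)now; }
#endif

int main(void)
{
	init_timestamp(0);
	printf("ok=%d\n", check_timestamp(5000));
	return 0;
}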
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 3fbaecfc88c2..1745ca788ede 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -216,8 +216,6 @@ struct worker_pool {
struct worker *manager; /* L: purely informational */
struct list_head workers; /* A: attached workers */
- struct list_head dying_workers; /* A: workers about to die */
- struct completion *detach_completion; /* all workers detached */
struct ida worker_ida; /* worker IDs for task name */
@@ -436,7 +434,7 @@ static struct wq_pod_type wq_pod_types[WQ_AFFN_NR_TYPES];
static enum wq_affn_scope wq_affn_dfl = WQ_AFFN_CACHE;
/* buf for wq_update_unbound_pod_attrs(), protected by CPU hotplug exclusion */
-static struct workqueue_attrs *wq_update_pod_attrs_buf;
+static struct workqueue_attrs *unbound_wq_update_pwq_attrs_buf;
static DEFINE_MUTEX(wq_pool_mutex); /* protects pools and workqueues list */
static DEFINE_MUTEX(wq_pool_attach_mutex); /* protects worker attach/detach */
@@ -447,6 +445,9 @@ static struct rcuwait manager_wait = __RCUWAIT_INITIALIZER(manager_wait);
static LIST_HEAD(workqueues); /* PR: list of all workqueues */
static bool workqueue_freezing; /* PL: have wqs started freezing? */
+/* PL: mirror the cpu_online_mask excluding the CPU in the midst of hotplugging */
+static cpumask_var_t wq_online_cpumask;
+
/* PL&A: allowable cpus for unbound wqs and work items */
static cpumask_var_t wq_unbound_cpumask;
@@ -1684,33 +1685,6 @@ static void __pwq_activate_work(struct pool_workqueue *pwq,
__clear_bit(WORK_STRUCT_INACTIVE_BIT, wdb);
}
-/**
- * pwq_activate_work - Activate a work item if inactive
- * @pwq: pool_workqueue @work belongs to
- * @work: work item to activate
- *
- * Returns %true if activated. %false if already active.
- */
-static bool pwq_activate_work(struct pool_workqueue *pwq,
- struct work_struct *work)
-{
- struct worker_pool *pool = pwq->pool;
- struct wq_node_nr_active *nna;
-
- lockdep_assert_held(&pool->lock);
-
- if (!(*work_data_bits(work) & WORK_STRUCT_INACTIVE))
- return false;
-
- nna = wq_node_nr_active(pwq->wq, pool->node);
- if (nna)
- atomic_inc(&nna->nr);
-
- pwq->nr_active++;
- __pwq_activate_work(pwq, work);
- return true;
-}
-
static bool tryinc_node_nr_active(struct wq_node_nr_active *nna)
{
int max = READ_ONCE(nna->max);
@@ -2117,7 +2091,7 @@ static int try_to_grab_pending(struct work_struct *work, u32 cflags,
*/
pwq = get_work_pwq(work);
if (pwq && pwq->pool == pool) {
- unsigned long work_data;
+ unsigned long work_data = *work_data_bits(work);
debug_work_deactivate(work);
@@ -2126,13 +2100,17 @@ static int try_to_grab_pending(struct work_struct *work, u32 cflags,
* pwq->inactive_works since a queued barrier can't be
* canceled (see the comments in insert_wq_barrier()).
*
- * An inactive work item cannot be grabbed directly because
+ * An inactive work item cannot be deleted directly because
* it might have linked barrier work items which, if left
* on the inactive_works list, will confuse pwq->nr_active
- * management later on and cause stall. Make sure the work
- * item is activated before grabbing.
+ * management later on and cause stall. Move the linked
+ * barrier work items to the worklist when deleting the grabbed
+ * item. Also keep WORK_STRUCT_INACTIVE in work_data, so that
+ * it doesn't participate in nr_active management in later
+ * pwq_dec_nr_in_flight().
*/
- pwq_activate_work(pwq, work);
+ if (work_data & WORK_STRUCT_INACTIVE)
+ move_linked_works(work, &pwq->pool->worklist, NULL);
list_del_init(&work->entry);
@@ -2140,7 +2118,6 @@ static int try_to_grab_pending(struct work_struct *work, u32 cflags,
* work->data points to pwq iff queued. Let's point to pool. As
* this destroys work->data needed by the next step, stash it.
*/
- work_data = *work_data_bits(work);
set_work_pool_and_keep_pending(work, pool->id,
pool_offq_flags(pool));
@@ -2298,9 +2275,13 @@ retry:
* If @work was previously on a different pool, it might still be
* running there, in which case the work needs to be queued on that
* pool to guarantee non-reentrancy.
+ *
+ * For ordered workqueue, work items must be queued on the newest pwq
+ * for accurate order management. Guaranteed order also guarantees
+ * non-reentrancy. See the comments above unplug_oldest_pwq().
*/
last_pool = get_work_pool(work);
- if (last_pool && last_pool != pool) {
+ if (last_pool && last_pool != pool && !(wq->flags & __WQ_ORDERED)) {
struct worker *worker;
raw_spin_lock(&last_pool->lock);
@@ -2710,6 +2691,27 @@ static void worker_attach_to_pool(struct worker *worker,
mutex_unlock(&wq_pool_attach_mutex);
}
+static void unbind_worker(struct worker *worker)
+{
+ lockdep_assert_held(&wq_pool_attach_mutex);
+
+ kthread_set_per_cpu(worker->task, -1);
+ if (cpumask_intersects(wq_unbound_cpumask, cpu_active_mask))
+ WARN_ON_ONCE(set_cpus_allowed_ptr(worker->task, wq_unbound_cpumask) < 0);
+ else
+ WARN_ON_ONCE(set_cpus_allowed_ptr(worker->task, cpu_possible_mask) < 0);
+}
+
+
+static void detach_worker(struct worker *worker)
+{
+ lockdep_assert_held(&wq_pool_attach_mutex);
+
+ unbind_worker(worker);
+ list_del(&worker->node);
+ worker->pool = NULL;
+}
+
/**
* worker_detach_from_pool() - detach a worker from its pool
* @worker: worker which is attached to its pool
@@ -2721,26 +2723,16 @@ static void worker_attach_to_pool(struct worker *worker,
static void worker_detach_from_pool(struct worker *worker)
{
struct worker_pool *pool = worker->pool;
- struct completion *detach_completion = NULL;
/* there is one permanent BH worker per CPU which should never detach */
WARN_ON_ONCE(pool->flags & POOL_BH);
mutex_lock(&wq_pool_attach_mutex);
-
- kthread_set_per_cpu(worker->task, -1);
- list_del(&worker->node);
- worker->pool = NULL;
-
- if (list_empty(&pool->workers) && list_empty(&pool->dying_workers))
- detach_completion = pool->detach_completion;
+ detach_worker(worker);
mutex_unlock(&wq_pool_attach_mutex);
/* clear leftover flags without pool->lock after it is detached */
worker->flags &= ~(WORKER_UNBOUND | WORKER_REBOUND);
-
- if (detach_completion)
- complete(detach_completion);
}
static int format_worker_id(char *buf, size_t size, struct worker *worker,
@@ -2844,35 +2836,22 @@ fail:
return NULL;
}
-static void unbind_worker(struct worker *worker)
+static void detach_dying_workers(struct list_head *cull_list)
{
- lockdep_assert_held(&wq_pool_attach_mutex);
+ struct worker *worker;
- kthread_set_per_cpu(worker->task, -1);
- if (cpumask_intersects(wq_unbound_cpumask, cpu_active_mask))
- WARN_ON_ONCE(set_cpus_allowed_ptr(worker->task, wq_unbound_cpumask) < 0);
- else
- WARN_ON_ONCE(set_cpus_allowed_ptr(worker->task, cpu_possible_mask) < 0);
+ list_for_each_entry(worker, cull_list, entry)
+ detach_worker(worker);
}
-static void wake_dying_workers(struct list_head *cull_list)
+static void reap_dying_workers(struct list_head *cull_list)
{
struct worker *worker, *tmp;
list_for_each_entry_safe(worker, tmp, cull_list, entry) {
list_del_init(&worker->entry);
- unbind_worker(worker);
- /*
- * If the worker was somehow already running, then it had to be
- * in pool->idle_list when set_worker_dying() happened or we
- * wouldn't have gotten here.
- *
- * Thus, the worker must either have observed the WORKER_DIE
- * flag, or have set its state to TASK_IDLE. Either way, the
- * below will be observed by the worker and is safe to do
- * outside of pool->lock.
- */
- wake_up_process(worker->task);
+ kthread_stop_put(worker->task);
+ kfree(worker);
}
}
@@ -2906,7 +2885,9 @@ static void set_worker_dying(struct worker *worker, struct list_head *list)
worker->flags |= WORKER_DIE;
list_move(&worker->entry, list);
- list_move(&worker->node, &pool->dying_workers);
+
+ /* get an extra task struct reference for later kthread_stop_put() */
+ get_task_struct(worker->task);
}
/**
@@ -2965,9 +2946,9 @@ static void idle_cull_fn(struct work_struct *work)
/*
* Grabbing wq_pool_attach_mutex here ensures an already-running worker
- * cannot proceed beyong worker_detach_from_pool() in its self-destruct
- * path. This is required as a previously-preempted worker could run after
- * set_worker_dying() has happened but before wake_dying_workers() did.
+ * cannot proceed beyond set_pf_worker() in its self-destruct path.
+ * This is required as a previously-preempted worker could run after
+ * set_worker_dying() has happened but before detach_dying_workers() did.
*/
mutex_lock(&wq_pool_attach_mutex);
raw_spin_lock_irq(&pool->lock);
@@ -2988,8 +2969,10 @@ static void idle_cull_fn(struct work_struct *work)
}
raw_spin_unlock_irq(&pool->lock);
- wake_dying_workers(&cull_list);
+ detach_dying_workers(&cull_list);
mutex_unlock(&wq_pool_attach_mutex);
+
+ reap_dying_workers(&cull_list);
}
static void send_mayday(struct work_struct *work)
@@ -3368,9 +3351,7 @@ woke_up:
set_pf_worker(false);
ida_free(&pool->worker_ida, worker->id);
- worker_detach_from_pool(worker);
WARN_ON_ONCE(!list_empty(&worker->entry));
- kfree(worker);
return 0;
}
@@ -4761,7 +4742,6 @@ static int init_worker_pool(struct worker_pool *pool)
timer_setup(&pool->mayday_timer, pool_mayday_timeout, 0);
INIT_LIST_HEAD(&pool->workers);
- INIT_LIST_HEAD(&pool->dying_workers);
ida_init(&pool->worker_ida);
INIT_HLIST_NODE(&pool->hash_node);
@@ -4903,7 +4883,6 @@ static void rcu_free_pool(struct rcu_head *rcu)
*/
static void put_unbound_pool(struct worker_pool *pool)
{
- DECLARE_COMPLETION_ONSTACK(detach_completion);
struct worker *worker;
LIST_HEAD(cull_list);
@@ -4955,14 +4934,11 @@ static void put_unbound_pool(struct worker_pool *pool)
WARN_ON(pool->nr_workers || pool->nr_idle);
raw_spin_unlock_irq(&pool->lock);
- wake_dying_workers(&cull_list);
+ detach_dying_workers(&cull_list);
- if (!list_empty(&pool->workers) || !list_empty(&pool->dying_workers))
- pool->detach_completion = &detach_completion;
mutex_unlock(&wq_pool_attach_mutex);
- if (pool->detach_completion)
- wait_for_completion(pool->detach_completion);
+ reap_dying_workers(&cull_list);
/* shut down the timers */
del_timer_sync(&pool->idle_timer);
@@ -5038,12 +5014,6 @@ fail:
return NULL;
}
-static void rcu_free_pwq(struct rcu_head *rcu)
-{
- kmem_cache_free(pwq_cache,
- container_of(rcu, struct pool_workqueue, rcu));
-}
-
/*
* Scheduled on pwq_release_worker by put_pwq() when an unbound pwq hits zero
* refcnt and needs to be destroyed.
@@ -5089,7 +5059,7 @@ static void pwq_release_workfn(struct kthread_work *work)
raw_spin_unlock_irq(&nna->lock);
}
- call_rcu(&pwq->rcu, rcu_free_pwq);
+ kfree_rcu(pwq, rcu);
/*
* If we're the last pwq going away, @wq is already dead and no one
@@ -5161,14 +5131,22 @@ static struct pool_workqueue *alloc_unbound_pwq(struct workqueue_struct *wq,
return pwq;
}
+static void apply_wqattrs_lock(void)
+{
+ mutex_lock(&wq_pool_mutex);
+}
+
+static void apply_wqattrs_unlock(void)
+{
+ mutex_unlock(&wq_pool_mutex);
+}
+
/**
* wq_calc_pod_cpumask - calculate a wq_attrs' cpumask for a pod
* @attrs: the wq_attrs of the default pwq of the target workqueue
* @cpu: the target CPU
- * @cpu_going_down: if >= 0, the CPU to consider as offline
*
- * Calculate the cpumask a workqueue with @attrs should use on @pod. If
- * @cpu_going_down is >= 0, that cpu is considered offline during calculation.
+ * Calculate the cpumask a workqueue with @attrs should use on @pod.
* The result is stored in @attrs->__pod_cpumask.
*
* If pod affinity is not enabled, @attrs->cpumask is always used. If enabled
@@ -5177,29 +5155,18 @@ static struct pool_workqueue *alloc_unbound_pwq(struct workqueue_struct *wq,
*
* The caller is responsible for ensuring that the cpumask of @pod stays stable.
*/
-static void wq_calc_pod_cpumask(struct workqueue_attrs *attrs, int cpu,
- int cpu_going_down)
+static void wq_calc_pod_cpumask(struct workqueue_attrs *attrs, int cpu)
{
const struct wq_pod_type *pt = wqattrs_pod_type(attrs);
int pod = pt->cpu_pod[cpu];
- /* does @pod have any online CPUs @attrs wants? */
+ /* calculate possible CPUs in @pod that @attrs wants */
cpumask_and(attrs->__pod_cpumask, pt->pod_cpus[pod], attrs->cpumask);
- cpumask_and(attrs->__pod_cpumask, attrs->__pod_cpumask, cpu_online_mask);
- if (cpu_going_down >= 0)
- cpumask_clear_cpu(cpu_going_down, attrs->__pod_cpumask);
-
- if (cpumask_empty(attrs->__pod_cpumask)) {
+ /* does @pod have any online CPUs @attrs wants? */
+ if (!cpumask_intersects(attrs->__pod_cpumask, wq_online_cpumask)) {
cpumask_copy(attrs->__pod_cpumask, attrs->cpumask);
return;
}
-
- /* yeap, return possible CPUs in @pod that @attrs wants */
- cpumask_and(attrs->__pod_cpumask, attrs->cpumask, pt->pod_cpus[pod]);
-
- if (cpumask_empty(attrs->__pod_cpumask))
- pr_warn_once("WARNING: workqueue cpumask: online intersect > "
- "possible intersect\n");
}
/* install @pwq into @wq and return the old pwq, @cpu < 0 for dfl_pwq */
@@ -5284,7 +5251,7 @@ apply_wqattrs_prepare(struct workqueue_struct *wq,
ctx->dfl_pwq->refcnt++;
ctx->pwq_tbl[cpu] = ctx->dfl_pwq;
} else {
- wq_calc_pod_cpumask(new_attrs, cpu, -1);
+ wq_calc_pod_cpumask(new_attrs, cpu);
ctx->pwq_tbl[cpu] = alloc_unbound_pwq(wq, new_attrs);
if (!ctx->pwq_tbl[cpu])
goto out_free;
@@ -5375,8 +5342,6 @@ static int apply_workqueue_attrs_locked(struct workqueue_struct *wq,
*
* Performs GFP_KERNEL allocations.
*
- * Assumes caller has CPU hotplug read exclusion, i.e. cpus_read_lock().
- *
* Return: 0 on success and -errno on failure.
*/
int apply_workqueue_attrs(struct workqueue_struct *wq,
@@ -5384,8 +5349,6 @@ int apply_workqueue_attrs(struct workqueue_struct *wq,
{
int ret;
- lockdep_assert_cpus_held();
-
mutex_lock(&wq_pool_mutex);
ret = apply_workqueue_attrs_locked(wq, attrs);
mutex_unlock(&wq_pool_mutex);
@@ -5394,15 +5357,12 @@ int apply_workqueue_attrs(struct workqueue_struct *wq,
}
/**
- * wq_update_pod - update pod affinity of a wq for CPU hot[un]plug
+ * unbound_wq_update_pwq - update a pwq slot for CPU hot[un]plug
* @wq: the target workqueue
- * @cpu: the CPU to update pool association for
- * @hotplug_cpu: the CPU coming up or going down
- * @online: whether @cpu is coming up or going down
+ * @cpu: the CPU to update the pwq slot for
*
* This function is to be called from %CPU_DOWN_PREPARE, %CPU_ONLINE and
- * %CPU_DOWN_FAILED. @cpu is being hot[un]plugged, update pod affinity of
- * @wq accordingly.
+ * %CPU_DOWN_FAILED. @cpu is in the same pod of the CPU being hot[un]plugged.
*
*
* If pod affinity can't be adjusted due to memory allocation failure, it falls
@@ -5415,10 +5375,8 @@ int apply_workqueue_attrs(struct workqueue_struct *wq,
* CPU_DOWN. If a workqueue user wants strict affinity, it's the user's
* responsibility to flush the work item from CPU_DOWN_PREPARE.
*/
-static void wq_update_pod(struct workqueue_struct *wq, int cpu,
- int hotplug_cpu, bool online)
+static void unbound_wq_update_pwq(struct workqueue_struct *wq, int cpu)
{
- int off_cpu = online ? -1 : hotplug_cpu;
struct pool_workqueue *old_pwq = NULL, *pwq;
struct workqueue_attrs *target_attrs;
@@ -5432,13 +5390,13 @@ static void wq_update_pod(struct workqueue_struct *wq, int cpu,
* Let's use a preallocated one. The following buf is protected by
* CPU hotplug exclusion.
*/
- target_attrs = wq_update_pod_attrs_buf;
+ target_attrs = unbound_wq_update_pwq_attrs_buf;
copy_workqueue_attrs(target_attrs, wq->unbound_attrs);
wqattrs_actualize_cpumask(target_attrs, wq_unbound_cpumask);
/* nothing to do if the target cpumask matches the current pwq */
- wq_calc_pod_cpumask(target_attrs, cpu, off_cpu);
+ wq_calc_pod_cpumask(target_attrs, cpu);
if (wqattrs_equal(target_attrs, unbound_pwq(wq, cpu)->pool->attrs))
return;
@@ -5472,21 +5430,24 @@ static int alloc_and_link_pwqs(struct workqueue_struct *wq)
bool highpri = wq->flags & WQ_HIGHPRI;
int cpu, ret;
+ lockdep_assert_held(&wq_pool_mutex);
+
wq->cpu_pwq = alloc_percpu(struct pool_workqueue *);
if (!wq->cpu_pwq)
goto enomem;
if (!(wq->flags & WQ_UNBOUND)) {
+ struct worker_pool __percpu *pools;
+
+ if (wq->flags & WQ_BH)
+ pools = bh_worker_pools;
+ else
+ pools = cpu_worker_pools;
+
for_each_possible_cpu(cpu) {
struct pool_workqueue **pwq_p;
- struct worker_pool __percpu *pools;
struct worker_pool *pool;
- if (wq->flags & WQ_BH)
- pools = bh_worker_pools;
- else
- pools = cpu_worker_pools;
-
pool = &(per_cpu_ptr(pools, cpu)[highpri]);
pwq_p = per_cpu_ptr(wq->cpu_pwq, cpu);
@@ -5504,26 +5465,18 @@ static int alloc_and_link_pwqs(struct workqueue_struct *wq)
return 0;
}
- cpus_read_lock();
if (wq->flags & __WQ_ORDERED) {
struct pool_workqueue *dfl_pwq;
- ret = apply_workqueue_attrs(wq, ordered_wq_attrs[highpri]);
+ ret = apply_workqueue_attrs_locked(wq, ordered_wq_attrs[highpri]);
/* there should only be single pwq for ordering guarantee */
dfl_pwq = rcu_access_pointer(wq->dfl_pwq);
WARN(!ret && (wq->pwqs.next != &dfl_pwq->pwqs_node ||
wq->pwqs.prev != &dfl_pwq->pwqs_node),
"ordering guarantee broken for workqueue %s\n", wq->name);
} else {
- ret = apply_workqueue_attrs(wq, unbound_std_wq_attrs[highpri]);
+ ret = apply_workqueue_attrs_locked(wq, unbound_std_wq_attrs[highpri]);
}
- cpus_read_unlock();
-
- /* for unbound pwq, flush the pwq_release_worker ensures that the
- * pwq_release_workfn() completes before calling kfree(wq).
- */
- if (ret)
- kthread_flush_worker(pwq_release_worker);
return ret;
@@ -5561,6 +5514,8 @@ static int init_rescuer(struct workqueue_struct *wq)
char id_buf[WORKER_ID_LEN];
int ret;
+ lockdep_assert_held(&wq_pool_mutex);
+
if (!(wq->flags & WQ_MEM_RECLAIM))
return 0;
@@ -5585,7 +5540,7 @@ static int init_rescuer(struct workqueue_struct *wq)
wq->rescuer = rescuer;
if (wq->flags & WQ_UNBOUND)
- kthread_bind_mask(rescuer->task, wq_unbound_cpumask);
+ kthread_bind_mask(rescuer->task, unbound_effective_cpumask(wq));
else
kthread_bind_mask(rescuer->task, cpu_possible_mask);
wake_up_process(rescuer->task);
@@ -5733,21 +5688,14 @@ struct workqueue_struct *alloc_workqueue(const char *fmt,
goto err_unreg_lockdep;
}
- if (alloc_and_link_pwqs(wq) < 0)
- goto err_free_node_nr_active;
-
- if (wq_online && init_rescuer(wq) < 0)
- goto err_destroy;
-
- if ((wq->flags & WQ_SYSFS) && workqueue_sysfs_register(wq))
- goto err_destroy;
-
/*
- * wq_pool_mutex protects global freeze state and workqueues list.
- * Grab it, adjust max_active and add the new @wq to workqueues
- * list.
+ * wq_pool_mutex protects the workqueues list, allocations of PWQs,
+ * and the global freeze state.
*/
- mutex_lock(&wq_pool_mutex);
+ apply_wqattrs_lock();
+
+ if (alloc_and_link_pwqs(wq) < 0)
+ goto err_unlock_free_node_nr_active;
mutex_lock(&wq->mutex);
wq_adjust_max_active(wq);
@@ -5755,13 +5703,27 @@ struct workqueue_struct *alloc_workqueue(const char *fmt,
list_add_tail_rcu(&wq->list, &workqueues);
- mutex_unlock(&wq_pool_mutex);
+ if (wq_online && init_rescuer(wq) < 0)
+ goto err_unlock_destroy;
+
+ apply_wqattrs_unlock();
+
+ if ((wq->flags & WQ_SYSFS) && workqueue_sysfs_register(wq))
+ goto err_destroy;
return wq;
-err_free_node_nr_active:
- if (wq->flags & WQ_UNBOUND)
+err_unlock_free_node_nr_active:
+ apply_wqattrs_unlock();
+ /*
+ * Failed alloc_and_link_pwqs() may leave pending pwq->release_work,
+ * flushing the pwq_release_worker ensures that the pwq_release_workfn()
+ * completes before calling kfree(wq).
+ */
+ if (wq->flags & WQ_UNBOUND) {
+ kthread_flush_worker(pwq_release_worker);
free_node_nr_active(wq->node_nr_active);
+ }
err_unreg_lockdep:
wq_unregister_lockdep(wq);
wq_free_lockdep(wq);
@@ -5769,6 +5731,8 @@ err_free_wq:
free_workqueue_attrs(wq->unbound_attrs);
kfree(wq);
return NULL;
+err_unlock_destroy:
+ apply_wqattrs_unlock();
err_destroy:
destroy_workqueue(wq);
return NULL;
@@ -6607,6 +6571,8 @@ int workqueue_online_cpu(unsigned int cpu)
mutex_lock(&wq_pool_mutex);
+ cpumask_set_cpu(cpu, wq_online_cpumask);
+
for_each_pool(pool, pi) {
/* BH pools aren't affected by hotplug */
if (pool->flags & POOL_BH)
@@ -6629,7 +6595,7 @@ int workqueue_online_cpu(unsigned int cpu)
int tcpu;
for_each_cpu(tcpu, pt->pod_cpus[pt->cpu_pod[cpu]])
- wq_update_pod(wq, tcpu, cpu, true);
+ unbound_wq_update_pwq(wq, tcpu);
mutex_lock(&wq->mutex);
wq_update_node_max_active(wq, -1);
@@ -6653,6 +6619,9 @@ int workqueue_offline_cpu(unsigned int cpu)
/* update pod affinity of unbound workqueues */
mutex_lock(&wq_pool_mutex);
+
+ cpumask_clear_cpu(cpu, wq_online_cpumask);
+
list_for_each_entry(wq, &workqueues, list) {
struct workqueue_attrs *attrs = wq->unbound_attrs;
@@ -6661,7 +6630,7 @@ int workqueue_offline_cpu(unsigned int cpu)
int tcpu;
for_each_cpu(tcpu, pt->pod_cpus[pt->cpu_pod[cpu]])
- wq_update_pod(wq, tcpu, cpu, false);
+ unbound_wq_update_pwq(wq, tcpu);
mutex_lock(&wq->mutex);
wq_update_node_max_active(wq, cpu);
@@ -6887,8 +6856,7 @@ static int workqueue_apply_unbound_cpumask(const cpumask_var_t unbound_cpumask)
* @exclude_cpumask: the cpumask to be excluded from wq_unbound_cpumask
*
* This function can be called from cpuset code to provide a set of isolated
- * CPUs that should be excluded from wq_unbound_cpumask. The caller must hold
- * either cpus_read_lock or cpus_write_lock.
+ * CPUs that should be excluded from wq_unbound_cpumask.
*/
int workqueue_unbound_exclude_cpumask(cpumask_var_t exclude_cpumask)
{
@@ -6898,12 +6866,8 @@ int workqueue_unbound_exclude_cpumask(cpumask_var_t exclude_cpumask)
if (!zalloc_cpumask_var(&cpumask, GFP_KERNEL))
return -ENOMEM;
- lockdep_assert_cpus_held();
mutex_lock(&wq_pool_mutex);
- /* Save the current isolated cpumask & export it via sysfs */
- cpumask_copy(wq_isolated_cpumask, exclude_cpumask);
-
/*
* If the operation fails, it will fall back to
* wq_requested_unbound_cpumask which is initially set to
@@ -6915,6 +6879,10 @@ int workqueue_unbound_exclude_cpumask(cpumask_var_t exclude_cpumask)
if (!cpumask_equal(cpumask, wq_unbound_cpumask))
ret = workqueue_apply_unbound_cpumask(cpumask);
+ /* Save the current isolated cpumask & export it via sysfs */
+ if (!ret)
+ cpumask_copy(wq_isolated_cpumask, exclude_cpumask);
+
mutex_unlock(&wq_pool_mutex);
free_cpumask_var(cpumask);
return ret;
@@ -6948,9 +6916,8 @@ static int wq_affn_dfl_set(const char *val, const struct kernel_param *kp)
wq_affn_dfl = affn;
list_for_each_entry(wq, &workqueues, list) {
- for_each_online_cpu(cpu) {
- wq_update_pod(wq, cpu, cpu, true);
- }
+ for_each_online_cpu(cpu)
+ unbound_wq_update_pwq(wq, cpu);
}
mutex_unlock(&wq_pool_mutex);
@@ -7038,19 +7005,6 @@ static struct attribute *wq_sysfs_attrs[] = {
};
ATTRIBUTE_GROUPS(wq_sysfs);
-static void apply_wqattrs_lock(void)
-{
- /* CPUs should stay stable across pwq creations and installations */
- cpus_read_lock();
- mutex_lock(&wq_pool_mutex);
-}
-
-static void apply_wqattrs_unlock(void)
-{
- mutex_unlock(&wq_pool_mutex);
- cpus_read_unlock();
-}
-
static ssize_t wq_nice_show(struct device *dev, struct device_attribute *attr,
char *buf)
{
@@ -7249,16 +7203,12 @@ static int workqueue_set_unbound_cpumask(cpumask_var_t cpumask)
*/
cpumask_and(cpumask, cpumask, cpu_possible_mask);
if (!cpumask_empty(cpumask)) {
+ ret = 0;
apply_wqattrs_lock();
- cpumask_copy(wq_requested_unbound_cpumask, cpumask);
- if (cpumask_equal(cpumask, wq_unbound_cpumask)) {
- ret = 0;
- goto out_unlock;
- }
-
- ret = workqueue_apply_unbound_cpumask(cpumask);
-
-out_unlock:
+ if (!cpumask_equal(cpumask, wq_unbound_cpumask))
+ ret = workqueue_apply_unbound_cpumask(cpumask);
+ if (!ret)
+ cpumask_copy(wq_requested_unbound_cpumask, cpumask);
apply_wqattrs_unlock();
}
@@ -7577,10 +7527,18 @@ static void wq_watchdog_timer_fn(struct timer_list *unused)
notrace void wq_watchdog_touch(int cpu)
{
+ unsigned long thresh = READ_ONCE(wq_watchdog_thresh) * HZ;
+ unsigned long touch_ts = READ_ONCE(wq_watchdog_touched);
+ unsigned long now = jiffies;
+
if (cpu >= 0)
- per_cpu(wq_watchdog_touched_cpu, cpu) = jiffies;
+ per_cpu(wq_watchdog_touched_cpu, cpu) = now;
+ else
+ WARN_ONCE(1, "%s should be called with valid CPU", __func__);
- wq_watchdog_touched = jiffies;
+ /* Don't unnecessarily store to global cacheline */
+ if (time_after(now, touch_ts + thresh / 4))
+ WRITE_ONCE(wq_watchdog_touched, jiffies);
}
static void wq_watchdog_set_thresh(unsigned long thresh)
@@ -7690,10 +7648,12 @@ void __init workqueue_init_early(void)
BUILD_BUG_ON(__alignof__(struct pool_workqueue) < __alignof__(long long));
+ BUG_ON(!alloc_cpumask_var(&wq_online_cpumask, GFP_KERNEL));
BUG_ON(!alloc_cpumask_var(&wq_unbound_cpumask, GFP_KERNEL));
BUG_ON(!alloc_cpumask_var(&wq_requested_unbound_cpumask, GFP_KERNEL));
BUG_ON(!zalloc_cpumask_var(&wq_isolated_cpumask, GFP_KERNEL));
+ cpumask_copy(wq_online_cpumask, cpu_online_mask);
cpumask_copy(wq_unbound_cpumask, cpu_possible_mask);
restrict_unbound_cpumask("HK_TYPE_WQ", housekeeping_cpumask(HK_TYPE_WQ));
restrict_unbound_cpumask("HK_TYPE_DOMAIN", housekeeping_cpumask(HK_TYPE_DOMAIN));
@@ -7704,8 +7664,8 @@ void __init workqueue_init_early(void)
pwq_cache = KMEM_CACHE(pool_workqueue, SLAB_PANIC);
- wq_update_pod_attrs_buf = alloc_workqueue_attrs();
- BUG_ON(!wq_update_pod_attrs_buf);
+ unbound_wq_update_pwq_attrs_buf = alloc_workqueue_attrs();
+ BUG_ON(!unbound_wq_update_pwq_attrs_buf);
/*
* If nohz_full is enabled, set power efficient workqueue as unbound.
@@ -7970,12 +7930,12 @@ void __init workqueue_init_topology(void)
/*
* Workqueues allocated earlier would have all CPUs sharing the default
- * worker pool. Explicitly call wq_update_pod() on all workqueue and CPU
- * combinations to apply per-pod sharing.
+ * worker pool. Explicitly call unbound_wq_update_pwq() on all workqueue
+ * and CPU combinations to apply per-pod sharing.
*/
list_for_each_entry(wq, &workqueues, list) {
for_each_online_cpu(cpu)
- wq_update_pod(wq, cpu, cpu, true);
+ unbound_wq_update_pwq(wq, cpu);
if (wq->flags & WQ_UNBOUND) {
mutex_lock(&wq->mutex);
wq_update_node_max_active(wq, -1);
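[Editorial note] One of the smaller changes earlier in the workqueue.c diff reworks wq_watchdog_touch() so the shared wq_watchdog_touched cacheline is not dirtied on every touch: the per-CPU stamp is always updated, but the global timestamp is only written once roughly a quarter of the threshold has elapsed. A userspace sketch of that "write the shared word only when it is getting stale" idea; the names and the threshold constant are illustrative:

#include <stdio.h>

#define THRESH	400UL			/* stand-in for wq_watchdog_thresh * HZ */

static unsigned long global_touched;	/* shared, written rarely */
static unsigned long percpu_touched;	/* per-CPU in the kernel, cheap to write */
static unsigned long global_writes;

/* after() mirrors time_after(): true when a is later than b. */
static int after(unsigned long a, unsigned long b)
{
	return (long)(b - a) < 0;
}

static void watchdog_touch(unsigned long now)
{
	percpu_touched = now;		/* always cheap */

	/* Only refresh the shared timestamp when it is getting stale. */
	if (after(now, global_touched + THRESH / 4)) {
		global_touched = now;
		global_writes++;
	}
}

int main(void)
{
	for (unsigned long now = 0; now < 1000; now++)
		watchdog_touch(now);

	/* 1000 touches, but only a handful of writes to the shared word. */
	printf("global writes: %lu (of 1000 touches)\n", global_writes);
	return 0;
}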