summaryrefslogtreecommitdiffstats
path: root/kernel
diff options
context:
space:
mode:
Diffstat (limited to 'kernel')
-rw-r--r--kernel/Makefile5
-rw-r--r--kernel/acct.c22
-rw-r--r--kernel/audit_fsnotify.c5
-rw-r--r--kernel/audit_tree.c34
-rw-r--r--kernel/audit_watch.c2
-rw-r--r--kernel/bpf/Makefile2
-rw-r--r--kernel/bpf/arraymap.c61
-rw-r--r--kernel/bpf/bloom_filter.c6
-rw-r--r--kernel/bpf/bpf_inode_storage.c10
-rw-r--r--kernel/bpf/bpf_iter.c32
-rw-r--r--kernel/bpf/bpf_local_storage.c29
-rw-r--r--kernel/bpf/bpf_lru_list.h1
-rw-r--r--kernel/bpf/bpf_lsm.c17
-rw-r--r--kernel/bpf/bpf_struct_ops.c81
-rw-r--r--kernel/bpf/bpf_task_storage.c9
-rw-r--r--kernel/bpf/btf.c640
-rw-r--r--kernel/bpf/cgroup.c106
-rw-r--r--kernel/bpf/core.c43
-rw-r--r--kernel/bpf/cpumap.c6
-rw-r--r--kernel/bpf/devmap.c10
-rw-r--r--kernel/bpf/hashtab.c133
-rw-r--r--kernel/bpf/helpers.c223
-rw-r--r--kernel/bpf/link_iter.c107
-rw-r--r--kernel/bpf/local_storage.c7
-rw-r--r--kernel/bpf/lpm_trie.c6
-rw-r--r--kernel/bpf/map_in_map.c5
-rw-r--r--kernel/bpf/queue_stack_maps.c10
-rw-r--r--kernel/bpf/reuseport_array.c6
-rw-r--r--kernel/bpf/ringbuf.c88
-rw-r--r--kernel/bpf/stackmap.c7
-rw-r--r--kernel/bpf/syscall.c500
-rw-r--r--kernel/bpf/task_iter.c1
-rw-r--r--kernel/bpf/trampoline.c118
-rw-r--r--kernel/bpf/verifier.c819
-rw-r--r--kernel/cgroup/cgroup-internal.h1
-rw-r--r--kernel/cgroup/cgroup.c4
-rw-r--r--kernel/crash_core.c3
-rw-r--r--kernel/debug/debug_core.c24
-rw-r--r--kernel/debug/kdb/kdb_io.c1
-rw-r--r--kernel/debug/kdb/kdb_keyboard.c1
-rw-r--r--kernel/debug/kdb/kdb_main.c111
-rw-r--r--kernel/debug/kdb/kdb_private.h4
-rw-r--r--kernel/debug/kdb/kdb_support.c1
-rw-r--r--kernel/delayacct.c22
-rw-r--r--kernel/dma/debug.c2
-rw-r--r--kernel/dma/direct.c31
-rw-r--r--kernel/dma/direct.h2
-rw-r--r--kernel/dma/swiotlb.c186
-rw-r--r--kernel/events/uprobes.c9
-rw-r--r--kernel/fork.c50
-rw-r--r--kernel/futex/futex.h1
-rwxr-xr-xkernel/gen_kheaders.sh2
-rw-r--r--kernel/hung_task.c13
-rw-r--r--kernel/kallsyms.c3
-rw-r--r--kernel/kcov.c14
-rw-r--r--kernel/kcsan/kcsan_test.c31
-rw-r--r--kernel/kexec_core.c24
-rw-r--r--kernel/kexec_file.c38
-rw-r--r--kernel/kprobes.c144
-rw-r--r--kernel/latencytop.c41
-rw-r--r--kernel/livepatch/patch.c2
-rw-r--r--kernel/locking/lockdep.c35
-rw-r--r--kernel/module-internal.h50
-rw-r--r--kernel/module/Makefile21
-rw-r--r--kernel/module/debug_kmemleak.c30
-rw-r--r--kernel/module/decompress.c (renamed from kernel/module_decompress.c)5
-rw-r--r--kernel/module/internal.h302
-rw-r--r--kernel/module/kallsyms.c512
-rw-r--r--kernel/module/kdb.c62
-rw-r--r--kernel/module/livepatch.c74
-rw-r--r--kernel/module/main.c (renamed from kernel/module.c)1967
-rw-r--r--kernel/module/procfs.c146
-rw-r--r--kernel/module/signing.c125
-rw-r--r--kernel/module/strict_rwx.c143
-rw-r--r--kernel/module/sysfs.c436
-rw-r--r--kernel/module/tracking.c61
-rw-r--r--kernel/module/tree_lookup.c117
-rw-r--r--kernel/module/version.c101
-rw-r--r--kernel/module_signing.c45
-rw-r--r--kernel/notifier.c101
-rw-r--r--kernel/panic.c32
-rw-r--r--kernel/pid_namespace.c2
-rw-r--r--kernel/power/Makefile6
-rw-r--r--kernel/power/energy_model.c65
-rw-r--r--kernel/power/hibernate.c2
-rw-r--r--kernel/power/main.c34
-rw-r--r--kernel/power/process.c3
-rw-r--r--kernel/power/snapshot.c12
-rw-r--r--kernel/power/suspend.c3
-rw-r--r--kernel/printk/printk.c1203
-rw-r--r--kernel/ptrace.c93
-rw-r--r--kernel/rcu/rcu.h2
-rw-r--r--kernel/rcu/tree_stall.h2
-rw-r--r--kernel/reboot.c396
-rw-r--r--kernel/relay.c2
-rw-r--r--kernel/sched/core.c135
-rw-r--r--kernel/sched/deadline.c42
-rw-r--r--kernel/sched/fair.c34
-rw-r--r--kernel/sched/rt.c63
-rw-r--r--kernel/sched/sched.h7
-rw-r--r--kernel/sched/topology.c25
-rw-r--r--kernel/seccomp.c44
-rw-r--r--kernel/signal.c140
-rw-r--r--kernel/stackleak.c105
-rw-r--r--kernel/sysctl.c379
-rw-r--r--kernel/taskstats.c24
-rw-r--r--kernel/time/posix-cpu-timers.c6
-rw-r--r--kernel/time/timekeeping.c15
-rw-r--r--kernel/time/timer.c2
-rw-r--r--kernel/trace/Makefile4
-rw-r--r--kernel/trace/bpf_trace.c144
-rw-r--r--kernel/trace/fgraph.c2
-rw-r--r--kernel/trace/fprobe.c32
-rw-r--r--kernel/trace/ftrace.c373
-rw-r--r--kernel/trace/pid_list.c4
-rw-r--r--kernel/trace/ring_buffer.c81
-rw-r--r--kernel/trace/trace.c72
-rw-r--r--kernel/trace/trace.h26
-rw-r--r--kernel/trace/trace_boot.c2
-rw-r--r--kernel/trace/trace_dynevent.c9
-rw-r--r--kernel/trace/trace_eprobe.c24
-rw-r--r--kernel/trace/trace_events.c69
-rw-r--r--kernel/trace/trace_events_filter.c2
-rw-r--r--kernel/trace/trace_events_hist.c191
-rw-r--r--kernel/trace/trace_events_trigger.c324
-rw-r--r--kernel/trace/trace_kprobe.c15
-rw-r--r--kernel/trace/trace_osnoise.c22
-rw-r--r--kernel/trace/trace_output.c25
-rw-r--r--kernel/trace/trace_recursion_record.c7
-rw-r--r--kernel/trace/trace_selftest.c3
-rw-r--r--kernel/trace/trace_syscalls.c35
-rw-r--r--kernel/trace/tracing_map.c3
-rw-r--r--kernel/tsacct.c10
-rw-r--r--kernel/umh.c6
-rw-r--r--kernel/usermode_driver.c4
-rw-r--r--kernel/watchdog.c8
-rw-r--r--kernel/watchdog_hld.c4
-rw-r--r--kernel/workqueue.c2
138 files changed, 8133 insertions, 4497 deletions
diff --git a/kernel/Makefile b/kernel/Makefile
index 847a82bfe0e3..318789c728d3 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -29,7 +29,6 @@ KCOV_INSTRUMENT_softirq.o := n
KCSAN_SANITIZE_softirq.o = n
# These are called from save_stack_trace() on slub debug path,
# and produce insane amounts of uninteresting coverage.
-KCOV_INSTRUMENT_module.o := n
KCOV_INSTRUMENT_extable.o := n
KCOV_INSTRUMENT_stacktrace.o := n
# Don't self-instrument.
@@ -53,6 +52,7 @@ obj-y += rcu/
obj-y += livepatch/
obj-y += dma/
obj-y += entry/
+obj-$(CONFIG_MODULES) += module/
obj-$(CONFIG_KCMP) += kcmp.o
obj-$(CONFIG_FREEZER) += freezer.o
@@ -66,9 +66,6 @@ ifneq ($(CONFIG_SMP),y)
obj-y += up.o
endif
obj-$(CONFIG_UID16) += uid16.o
-obj-$(CONFIG_MODULES) += module.o
-obj-$(CONFIG_MODULE_DECOMPRESS) += module_decompress.o
-obj-$(CONFIG_MODULE_SIG) += module_signing.o
obj-$(CONFIG_MODULE_SIG_FORMAT) += module_signature.o
obj-$(CONFIG_KALLSYMS) += kallsyms.o
obj-$(CONFIG_BSD_PROCESS_ACCT) += acct.o
diff --git a/kernel/acct.c b/kernel/acct.c
index 3df53cf1dcd5..13706356ec54 100644
--- a/kernel/acct.c
+++ b/kernel/acct.c
@@ -70,11 +70,31 @@
* Turned into sysctl-controllable parameters. AV, 12/11/98
*/
-int acct_parm[3] = {4, 2, 30};
+static int acct_parm[3] = {4, 2, 30};
#define RESUME (acct_parm[0]) /* >foo% free space - resume */
#define SUSPEND (acct_parm[1]) /* <foo% free space - suspend */
#define ACCT_TIMEOUT (acct_parm[2]) /* foo second timeout between checks */
+#ifdef CONFIG_SYSCTL
+static struct ctl_table kern_acct_table[] = {
+ {
+ .procname = "acct",
+ .data = &acct_parm,
+ .maxlen = 3*sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+ },
+ { }
+};
+
+static __init int kernel_acct_sysctls_init(void)
+{
+ register_sysctl_init("kernel", kern_acct_table);
+ return 0;
+}
+late_initcall(kernel_acct_sysctls_init);
+#endif /* CONFIG_SYSCTL */
+
/*
* External references and all of the globals.
*/
diff --git a/kernel/audit_fsnotify.c b/kernel/audit_fsnotify.c
index 02348b48447c..6432a37ac1c9 100644
--- a/kernel/audit_fsnotify.c
+++ b/kernel/audit_fsnotify.c
@@ -100,7 +100,7 @@ struct audit_fsnotify_mark *audit_alloc_mark(struct audit_krule *krule, char *pa
audit_update_mark(audit_mark, dentry->d_inode);
audit_mark->rule = krule;
- ret = fsnotify_add_inode_mark(&audit_mark->mark, inode, true);
+ ret = fsnotify_add_inode_mark(&audit_mark->mark, inode, 0);
if (ret < 0) {
fsnotify_put_mark(&audit_mark->mark);
audit_mark = ERR_PTR(ret);
@@ -181,7 +181,8 @@ static const struct fsnotify_ops audit_mark_fsnotify_ops = {
static int __init audit_fsnotify_init(void)
{
- audit_fsnotify_group = fsnotify_alloc_group(&audit_mark_fsnotify_ops);
+ audit_fsnotify_group = fsnotify_alloc_group(&audit_mark_fsnotify_ops,
+ FSNOTIFY_GROUP_DUPS);
if (IS_ERR(audit_fsnotify_group)) {
audit_fsnotify_group = NULL;
audit_panic("cannot create audit fsnotify group");
diff --git a/kernel/audit_tree.c b/kernel/audit_tree.c
index e7315d487163..e867c17d3f84 100644
--- a/kernel/audit_tree.c
+++ b/kernel/audit_tree.c
@@ -351,7 +351,7 @@ static void untag_chunk(struct audit_chunk *chunk, struct fsnotify_mark *mark)
struct audit_chunk *new;
int size;
- mutex_lock(&audit_tree_group->mark_mutex);
+ fsnotify_group_lock(audit_tree_group);
/*
* mark_mutex stabilizes chunk attached to the mark so we can check
* whether it didn't change while we've dropped hash_lock.
@@ -368,7 +368,7 @@ static void untag_chunk(struct audit_chunk *chunk, struct fsnotify_mark *mark)
replace_mark_chunk(mark, NULL);
spin_unlock(&hash_lock);
fsnotify_detach_mark(mark);
- mutex_unlock(&audit_tree_group->mark_mutex);
+ fsnotify_group_unlock(audit_tree_group);
audit_mark_put_chunk(chunk);
fsnotify_free_mark(mark);
return;
@@ -385,12 +385,12 @@ static void untag_chunk(struct audit_chunk *chunk, struct fsnotify_mark *mark)
*/
replace_chunk(new, chunk);
spin_unlock(&hash_lock);
- mutex_unlock(&audit_tree_group->mark_mutex);
+ fsnotify_group_unlock(audit_tree_group);
audit_mark_put_chunk(chunk);
return;
out_mutex:
- mutex_unlock(&audit_tree_group->mark_mutex);
+ fsnotify_group_unlock(audit_tree_group);
}
/* Call with group->mark_mutex held, releases it */
@@ -400,19 +400,19 @@ static int create_chunk(struct inode *inode, struct audit_tree *tree)
struct audit_chunk *chunk = alloc_chunk(1);
if (!chunk) {
- mutex_unlock(&audit_tree_group->mark_mutex);
+ fsnotify_group_unlock(audit_tree_group);
return -ENOMEM;
}
mark = alloc_mark();
if (!mark) {
- mutex_unlock(&audit_tree_group->mark_mutex);
+ fsnotify_group_unlock(audit_tree_group);
kfree(chunk);
return -ENOMEM;
}
if (fsnotify_add_inode_mark_locked(mark, inode, 0)) {
- mutex_unlock(&audit_tree_group->mark_mutex);
+ fsnotify_group_unlock(audit_tree_group);
fsnotify_put_mark(mark);
kfree(chunk);
return -ENOSPC;
@@ -422,7 +422,7 @@ static int create_chunk(struct inode *inode, struct audit_tree *tree)
if (tree->goner) {
spin_unlock(&hash_lock);
fsnotify_detach_mark(mark);
- mutex_unlock(&audit_tree_group->mark_mutex);
+ fsnotify_group_unlock(audit_tree_group);
fsnotify_free_mark(mark);
fsnotify_put_mark(mark);
kfree(chunk);
@@ -444,7 +444,7 @@ static int create_chunk(struct inode *inode, struct audit_tree *tree)
*/
insert_hash(chunk);
spin_unlock(&hash_lock);
- mutex_unlock(&audit_tree_group->mark_mutex);
+ fsnotify_group_unlock(audit_tree_group);
/*
* Drop our initial reference. When mark we point to is getting freed,
* we get notification through ->freeing_mark callback and cleanup
@@ -462,7 +462,7 @@ static int tag_chunk(struct inode *inode, struct audit_tree *tree)
struct audit_node *p;
int n;
- mutex_lock(&audit_tree_group->mark_mutex);
+ fsnotify_group_lock(audit_tree_group);
mark = fsnotify_find_mark(&inode->i_fsnotify_marks, audit_tree_group);
if (!mark)
return create_chunk(inode, tree);
@@ -478,7 +478,7 @@ static int tag_chunk(struct inode *inode, struct audit_tree *tree)
for (n = 0; n < old->count; n++) {
if (old->owners[n].owner == tree) {
spin_unlock(&hash_lock);
- mutex_unlock(&audit_tree_group->mark_mutex);
+ fsnotify_group_unlock(audit_tree_group);
fsnotify_put_mark(mark);
return 0;
}
@@ -487,7 +487,7 @@ static int tag_chunk(struct inode *inode, struct audit_tree *tree)
chunk = alloc_chunk(old->count + 1);
if (!chunk) {
- mutex_unlock(&audit_tree_group->mark_mutex);
+ fsnotify_group_unlock(audit_tree_group);
fsnotify_put_mark(mark);
return -ENOMEM;
}
@@ -495,7 +495,7 @@ static int tag_chunk(struct inode *inode, struct audit_tree *tree)
spin_lock(&hash_lock);
if (tree->goner) {
spin_unlock(&hash_lock);
- mutex_unlock(&audit_tree_group->mark_mutex);
+ fsnotify_group_unlock(audit_tree_group);
fsnotify_put_mark(mark);
kfree(chunk);
return 0;
@@ -515,7 +515,7 @@ static int tag_chunk(struct inode *inode, struct audit_tree *tree)
*/
replace_chunk(chunk, old);
spin_unlock(&hash_lock);
- mutex_unlock(&audit_tree_group->mark_mutex);
+ fsnotify_group_unlock(audit_tree_group);
fsnotify_put_mark(mark); /* pair to fsnotify_find_mark */
audit_mark_put_chunk(old);
@@ -1044,12 +1044,12 @@ static void audit_tree_freeing_mark(struct fsnotify_mark *mark,
{
struct audit_chunk *chunk;
- mutex_lock(&mark->group->mark_mutex);
+ fsnotify_group_lock(mark->group);
spin_lock(&hash_lock);
chunk = mark_chunk(mark);
replace_mark_chunk(mark, NULL);
spin_unlock(&hash_lock);
- mutex_unlock(&mark->group->mark_mutex);
+ fsnotify_group_unlock(mark->group);
if (chunk) {
evict_chunk(chunk);
audit_mark_put_chunk(chunk);
@@ -1074,7 +1074,7 @@ static int __init audit_tree_init(void)
audit_tree_mark_cachep = KMEM_CACHE(audit_tree_mark, SLAB_PANIC);
- audit_tree_group = fsnotify_alloc_group(&audit_tree_ops);
+ audit_tree_group = fsnotify_alloc_group(&audit_tree_ops, 0);
if (IS_ERR(audit_tree_group))
audit_panic("cannot initialize fsnotify group for rectree watches");
diff --git a/kernel/audit_watch.c b/kernel/audit_watch.c
index 713b256be944..4b0957aa2cd4 100644
--- a/kernel/audit_watch.c
+++ b/kernel/audit_watch.c
@@ -493,7 +493,7 @@ static const struct fsnotify_ops audit_watch_fsnotify_ops = {
static int __init audit_watch_init(void)
{
- audit_watch_group = fsnotify_alloc_group(&audit_watch_fsnotify_ops);
+ audit_watch_group = fsnotify_alloc_group(&audit_watch_fsnotify_ops, 0);
if (IS_ERR(audit_watch_group)) {
audit_watch_group = NULL;
audit_panic("cannot create audit fsnotify group");
diff --git a/kernel/bpf/Makefile b/kernel/bpf/Makefile
index c1a9be6a4b9f..057ba8e01e70 100644
--- a/kernel/bpf/Makefile
+++ b/kernel/bpf/Makefile
@@ -6,7 +6,7 @@ cflags-nogcse-$(CONFIG_X86)$(CONFIG_CC_IS_GCC) := -fno-gcse
endif
CFLAGS_core.o += $(call cc-disable-warning, override-init) $(cflags-nogcse-yy)
-obj-$(CONFIG_BPF_SYSCALL) += syscall.o verifier.o inode.o helpers.o tnum.o bpf_iter.o map_iter.o task_iter.o prog_iter.o
+obj-$(CONFIG_BPF_SYSCALL) += syscall.o verifier.o inode.o helpers.o tnum.o bpf_iter.o map_iter.o task_iter.o prog_iter.o link_iter.o
obj-$(CONFIG_BPF_SYSCALL) += hashtab.o arraymap.o percpu_freelist.o bpf_lru_list.o lpm_trie.o map_in_map.o bloom_filter.o
obj-$(CONFIG_BPF_SYSCALL) += local_storage.o queue_stack_maps.o ringbuf.o
obj-$(CONFIG_BPF_SYSCALL) += bpf_local_storage.o bpf_task_storage.o
diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c
index 7f145aefbff8..fe40d3b9458f 100644
--- a/kernel/bpf/arraymap.c
+++ b/kernel/bpf/arraymap.c
@@ -11,6 +11,7 @@
#include <linux/perf_event.h>
#include <uapi/linux/btf.h>
#include <linux/rcupdate_trace.h>
+#include <linux/btf_ids.h>
#include "map_in_map.h"
@@ -242,6 +243,20 @@ static void *percpu_array_map_lookup_elem(struct bpf_map *map, void *key)
return this_cpu_ptr(array->pptrs[index & array->index_mask]);
}
+static void *percpu_array_map_lookup_percpu_elem(struct bpf_map *map, void *key, u32 cpu)
+{
+ struct bpf_array *array = container_of(map, struct bpf_array, map);
+ u32 index = *(u32 *)key;
+
+ if (cpu >= nr_cpu_ids)
+ return NULL;
+
+ if (unlikely(index >= array->map.max_entries))
+ return NULL;
+
+ return per_cpu_ptr(array->pptrs[index & array->index_mask], cpu);
+}
+
int bpf_percpu_array_copy(struct bpf_map *map, void *key, void *value)
{
struct bpf_array *array = container_of(map, struct bpf_array, map);
@@ -287,10 +302,12 @@ static int array_map_get_next_key(struct bpf_map *map, void *key, void *next_key
return 0;
}
-static void check_and_free_timer_in_array(struct bpf_array *arr, void *val)
+static void check_and_free_fields(struct bpf_array *arr, void *val)
{
- if (unlikely(map_value_has_timer(&arr->map)))
+ if (map_value_has_timer(&arr->map))
bpf_timer_cancel_and_free(val + arr->map.timer_off);
+ if (map_value_has_kptrs(&arr->map))
+ bpf_map_free_kptrs(&arr->map, val);
}
/* Called from syscall or from eBPF program */
@@ -327,7 +344,7 @@ static int array_map_update_elem(struct bpf_map *map, void *key, void *value,
copy_map_value_locked(map, val, value, false);
else
copy_map_value(map, val, value);
- check_and_free_timer_in_array(array, val);
+ check_and_free_fields(array, val);
}
return 0;
}
@@ -386,7 +403,8 @@ static void array_map_free_timers(struct bpf_map *map)
struct bpf_array *array = container_of(map, struct bpf_array, map);
int i;
- if (likely(!map_value_has_timer(map)))
+ /* We don't reset or free kptr on uref dropping to zero. */
+ if (!map_value_has_timer(map))
return;
for (i = 0; i < array->map.max_entries; i++)
@@ -398,6 +416,13 @@ static void array_map_free_timers(struct bpf_map *map)
static void array_map_free(struct bpf_map *map)
{
struct bpf_array *array = container_of(map, struct bpf_array, map);
+ int i;
+
+ if (map_value_has_kptrs(map)) {
+ for (i = 0; i < array->map.max_entries; i++)
+ bpf_map_free_kptrs(map, array->value + array->elem_size * i);
+ bpf_map_free_kptr_off_tab(map);
+ }
if (array->map.map_type == BPF_MAP_TYPE_PERCPU_ARRAY)
bpf_array_free_percpu(array);
@@ -680,7 +705,7 @@ static int bpf_for_each_array_elem(struct bpf_map *map, bpf_callback_t callback_
return num_elems;
}
-static int array_map_btf_id;
+BTF_ID_LIST_SINGLE(array_map_btf_ids, struct, bpf_array)
const struct bpf_map_ops array_map_ops = {
.map_meta_equal = array_map_meta_equal,
.map_alloc_check = array_map_alloc_check,
@@ -701,12 +726,10 @@ const struct bpf_map_ops array_map_ops = {
.map_update_batch = generic_map_update_batch,
.map_set_for_each_callback_args = map_set_for_each_callback_args,
.map_for_each_callback = bpf_for_each_array_elem,
- .map_btf_name = "bpf_array",
- .map_btf_id = &array_map_btf_id,
+ .map_btf_id = &array_map_btf_ids[0],
.iter_seq_info = &iter_seq_info,
};
-static int percpu_array_map_btf_id;
const struct bpf_map_ops percpu_array_map_ops = {
.map_meta_equal = bpf_map_meta_equal,
.map_alloc_check = array_map_alloc_check,
@@ -716,14 +739,14 @@ const struct bpf_map_ops percpu_array_map_ops = {
.map_lookup_elem = percpu_array_map_lookup_elem,
.map_update_elem = array_map_update_elem,
.map_delete_elem = array_map_delete_elem,
+ .map_lookup_percpu_elem = percpu_array_map_lookup_percpu_elem,
.map_seq_show_elem = percpu_array_map_seq_show_elem,
.map_check_btf = array_map_check_btf,
.map_lookup_batch = generic_map_lookup_batch,
.map_update_batch = generic_map_update_batch,
.map_set_for_each_callback_args = map_set_for_each_callback_args,
.map_for_each_callback = bpf_for_each_array_elem,
- .map_btf_name = "bpf_array",
- .map_btf_id = &percpu_array_map_btf_id,
+ .map_btf_id = &array_map_btf_ids[0],
.iter_seq_info = &iter_seq_info,
};
@@ -1102,7 +1125,6 @@ static void prog_array_map_free(struct bpf_map *map)
* Thus, prog_array_map cannot be used as an inner_map
* and map_meta_equal is not implemented.
*/
-static int prog_array_map_btf_id;
const struct bpf_map_ops prog_array_map_ops = {
.map_alloc_check = fd_array_map_alloc_check,
.map_alloc = prog_array_map_alloc,
@@ -1118,8 +1140,7 @@ const struct bpf_map_ops prog_array_map_ops = {
.map_fd_sys_lookup_elem = prog_fd_array_sys_lookup_elem,
.map_release_uref = prog_array_map_clear,
.map_seq_show_elem = prog_array_map_seq_show_elem,
- .map_btf_name = "bpf_array",
- .map_btf_id = &prog_array_map_btf_id,
+ .map_btf_id = &array_map_btf_ids[0],
};
static struct bpf_event_entry *bpf_event_entry_gen(struct file *perf_file,
@@ -1208,7 +1229,6 @@ static void perf_event_fd_array_map_free(struct bpf_map *map)
fd_array_map_free(map);
}
-static int perf_event_array_map_btf_id;
const struct bpf_map_ops perf_event_array_map_ops = {
.map_meta_equal = bpf_map_meta_equal,
.map_alloc_check = fd_array_map_alloc_check,
@@ -1221,8 +1241,7 @@ const struct bpf_map_ops perf_event_array_map_ops = {
.map_fd_put_ptr = perf_event_fd_array_put_ptr,
.map_release = perf_event_fd_array_release,
.map_check_btf = map_check_no_btf,
- .map_btf_name = "bpf_array",
- .map_btf_id = &perf_event_array_map_btf_id,
+ .map_btf_id = &array_map_btf_ids[0],
};
#ifdef CONFIG_CGROUPS
@@ -1245,7 +1264,6 @@ static void cgroup_fd_array_free(struct bpf_map *map)
fd_array_map_free(map);
}
-static int cgroup_array_map_btf_id;
const struct bpf_map_ops cgroup_array_map_ops = {
.map_meta_equal = bpf_map_meta_equal,
.map_alloc_check = fd_array_map_alloc_check,
@@ -1257,8 +1275,7 @@ const struct bpf_map_ops cgroup_array_map_ops = {
.map_fd_get_ptr = cgroup_fd_array_get_ptr,
.map_fd_put_ptr = cgroup_fd_array_put_ptr,
.map_check_btf = map_check_no_btf,
- .map_btf_name = "bpf_array",
- .map_btf_id = &cgroup_array_map_btf_id,
+ .map_btf_id = &array_map_btf_ids[0],
};
#endif
@@ -1332,7 +1349,6 @@ static int array_of_map_gen_lookup(struct bpf_map *map,
return insn - insn_buf;
}
-static int array_of_maps_map_btf_id;
const struct bpf_map_ops array_of_maps_map_ops = {
.map_alloc_check = fd_array_map_alloc_check,
.map_alloc = array_of_map_alloc,
@@ -1344,7 +1360,8 @@ const struct bpf_map_ops array_of_maps_map_ops = {
.map_fd_put_ptr = bpf_map_fd_put_ptr,
.map_fd_sys_lookup_elem = bpf_map_fd_sys_lookup_elem,
.map_gen_lookup = array_of_map_gen_lookup,
+ .map_lookup_batch = generic_map_lookup_batch,
+ .map_update_batch = generic_map_update_batch,
.map_check_btf = map_check_no_btf,
- .map_btf_name = "bpf_array",
- .map_btf_id = &array_of_maps_map_btf_id,
+ .map_btf_id = &array_map_btf_ids[0],
};
diff --git a/kernel/bpf/bloom_filter.c b/kernel/bpf/bloom_filter.c
index b141a1346f72..b9ea539a5561 100644
--- a/kernel/bpf/bloom_filter.c
+++ b/kernel/bpf/bloom_filter.c
@@ -7,6 +7,7 @@
#include <linux/err.h>
#include <linux/jhash.h>
#include <linux/random.h>
+#include <linux/btf_ids.h>
#define BLOOM_CREATE_FLAG_MASK \
(BPF_F_NUMA_NODE | BPF_F_ZERO_SEED | BPF_F_ACCESS_MASK)
@@ -192,7 +193,7 @@ static int bloom_map_check_btf(const struct bpf_map *map,
return btf_type_is_void(key_type) ? 0 : -EINVAL;
}
-static int bpf_bloom_map_btf_id;
+BTF_ID_LIST_SINGLE(bpf_bloom_map_btf_ids, struct, bpf_bloom_filter)
const struct bpf_map_ops bloom_filter_map_ops = {
.map_meta_equal = bpf_map_meta_equal,
.map_alloc = bloom_map_alloc,
@@ -205,6 +206,5 @@ const struct bpf_map_ops bloom_filter_map_ops = {
.map_update_elem = bloom_map_update_elem,
.map_delete_elem = bloom_map_delete_elem,
.map_check_btf = bloom_map_check_btf,
- .map_btf_name = "bpf_bloom_filter",
- .map_btf_id = &bpf_bloom_map_btf_id,
+ .map_btf_id = &bpf_bloom_map_btf_ids[0],
};
diff --git a/kernel/bpf/bpf_inode_storage.c b/kernel/bpf/bpf_inode_storage.c
index 96be8d518885..5f7683b19199 100644
--- a/kernel/bpf/bpf_inode_storage.c
+++ b/kernel/bpf/bpf_inode_storage.c
@@ -90,7 +90,7 @@ void bpf_inode_storage_free(struct inode *inode)
*/
bpf_selem_unlink_map(selem);
free_inode_storage = bpf_selem_unlink_storage_nolock(
- local_storage, selem, false);
+ local_storage, selem, false, false);
}
raw_spin_unlock_bh(&local_storage->lock);
rcu_read_unlock();
@@ -149,7 +149,7 @@ static int inode_storage_delete(struct inode *inode, struct bpf_map *map)
if (!sdata)
return -ENOENT;
- bpf_selem_unlink(SELEM(sdata));
+ bpf_selem_unlink(SELEM(sdata), true);
return 0;
}
@@ -245,7 +245,8 @@ static void inode_storage_map_free(struct bpf_map *map)
bpf_local_storage_map_free(smap, NULL);
}
-static int inode_storage_map_btf_id;
+BTF_ID_LIST_SINGLE(inode_storage_map_btf_ids, struct,
+ bpf_local_storage_map)
const struct bpf_map_ops inode_storage_map_ops = {
.map_meta_equal = bpf_map_meta_equal,
.map_alloc_check = bpf_local_storage_map_alloc_check,
@@ -256,8 +257,7 @@ const struct bpf_map_ops inode_storage_map_ops = {
.map_update_elem = bpf_fd_inode_storage_update_elem,
.map_delete_elem = bpf_fd_inode_storage_delete_elem,
.map_check_btf = bpf_local_storage_map_check_btf,
- .map_btf_name = "bpf_local_storage_map",
- .map_btf_id = &inode_storage_map_btf_id,
+ .map_btf_id = &inode_storage_map_btf_ids[0],
.map_owner_storage_ptr = inode_storage_ptr,
};
diff --git a/kernel/bpf/bpf_iter.c b/kernel/bpf/bpf_iter.c
index 110029ede71e..d5d96ceca105 100644
--- a/kernel/bpf/bpf_iter.c
+++ b/kernel/bpf/bpf_iter.c
@@ -330,35 +330,34 @@ static void cache_btf_id(struct bpf_iter_target_info *tinfo,
bool bpf_iter_prog_supported(struct bpf_prog *prog)
{
const char *attach_fname = prog->aux->attach_func_name;
+ struct bpf_iter_target_info *tinfo = NULL, *iter;
u32 prog_btf_id = prog->aux->attach_btf_id;
const char *prefix = BPF_ITER_FUNC_PREFIX;
- struct bpf_iter_target_info *tinfo;
int prefix_len = strlen(prefix);
- bool supported = false;
if (strncmp(attach_fname, prefix, prefix_len))
return false;
mutex_lock(&targets_mutex);
- list_for_each_entry(tinfo, &targets, list) {
- if (tinfo->btf_id && tinfo->btf_id == prog_btf_id) {
- supported = true;
+ list_for_each_entry(iter, &targets, list) {
+ if (iter->btf_id && iter->btf_id == prog_btf_id) {
+ tinfo = iter;
break;
}
- if (!strcmp(attach_fname + prefix_len, tinfo->reg_info->target)) {
- cache_btf_id(tinfo, prog);
- supported = true;
+ if (!strcmp(attach_fname + prefix_len, iter->reg_info->target)) {
+ cache_btf_id(iter, prog);
+ tinfo = iter;
break;
}
}
mutex_unlock(&targets_mutex);
- if (supported) {
+ if (tinfo) {
prog->aux->ctx_arg_info_size = tinfo->reg_info->ctx_arg_info_size;
prog->aux->ctx_arg_info = tinfo->reg_info->ctx_arg_info;
}
- return supported;
+ return tinfo != NULL;
}
const struct bpf_func_proto *
@@ -499,12 +498,11 @@ bool bpf_link_is_iter(struct bpf_link *link)
int bpf_iter_link_attach(const union bpf_attr *attr, bpfptr_t uattr,
struct bpf_prog *prog)
{
+ struct bpf_iter_target_info *tinfo = NULL, *iter;
struct bpf_link_primer link_primer;
- struct bpf_iter_target_info *tinfo;
union bpf_iter_link_info linfo;
struct bpf_iter_link *link;
u32 prog_btf_id, linfo_len;
- bool existed = false;
bpfptr_t ulinfo;
int err;
@@ -530,14 +528,14 @@ int bpf_iter_link_attach(const union bpf_attr *attr, bpfptr_t uattr,
prog_btf_id = prog->aux->attach_btf_id;
mutex_lock(&targets_mutex);
- list_for_each_entry(tinfo, &targets, list) {
- if (tinfo->btf_id == prog_btf_id) {
- existed = true;
+ list_for_each_entry(iter, &targets, list) {
+ if (iter->btf_id == prog_btf_id) {
+ tinfo = iter;
break;
}
}
mutex_unlock(&targets_mutex);
- if (!existed)
+ if (!tinfo)
return -ENOENT;
link = kzalloc(sizeof(*link), GFP_USER | __GFP_NOWARN);
@@ -547,7 +545,7 @@ int bpf_iter_link_attach(const union bpf_attr *attr, bpfptr_t uattr,
bpf_link_init(&link->link, BPF_LINK_TYPE_ITER, &bpf_iter_link_lops, prog);
link->tinfo = tinfo;
- err = bpf_link_prime(&link->link, &link_primer);
+ err = bpf_link_prime(&link->link, &link_primer);
if (err) {
kfree(link);
return err;
diff --git a/kernel/bpf/bpf_local_storage.c b/kernel/bpf/bpf_local_storage.c
index 01aa2b51ec4d..8ce40fd869f6 100644
--- a/kernel/bpf/bpf_local_storage.c
+++ b/kernel/bpf/bpf_local_storage.c
@@ -106,7 +106,7 @@ static void bpf_selem_free_rcu(struct rcu_head *rcu)
*/
bool bpf_selem_unlink_storage_nolock(struct bpf_local_storage *local_storage,
struct bpf_local_storage_elem *selem,
- bool uncharge_mem)
+ bool uncharge_mem, bool use_trace_rcu)
{
struct bpf_local_storage_map *smap;
bool free_local_storage;
@@ -150,11 +150,16 @@ bool bpf_selem_unlink_storage_nolock(struct bpf_local_storage *local_storage,
SDATA(selem))
RCU_INIT_POINTER(local_storage->cache[smap->cache_idx], NULL);
- call_rcu_tasks_trace(&selem->rcu, bpf_selem_free_rcu);
+ if (use_trace_rcu)
+ call_rcu_tasks_trace(&selem->rcu, bpf_selem_free_rcu);
+ else
+ kfree_rcu(selem, rcu);
+
return free_local_storage;
}
-static void __bpf_selem_unlink_storage(struct bpf_local_storage_elem *selem)
+static void __bpf_selem_unlink_storage(struct bpf_local_storage_elem *selem,
+ bool use_trace_rcu)
{
struct bpf_local_storage *local_storage;
bool free_local_storage = false;
@@ -169,12 +174,16 @@ static void __bpf_selem_unlink_storage(struct bpf_local_storage_elem *selem)
raw_spin_lock_irqsave(&local_storage->lock, flags);
if (likely(selem_linked_to_storage(selem)))
free_local_storage = bpf_selem_unlink_storage_nolock(
- local_storage, selem, true);
+ local_storage, selem, true, use_trace_rcu);
raw_spin_unlock_irqrestore(&local_storage->lock, flags);
- if (free_local_storage)
- call_rcu_tasks_trace(&local_storage->rcu,
+ if (free_local_storage) {
+ if (use_trace_rcu)
+ call_rcu_tasks_trace(&local_storage->rcu,
bpf_local_storage_free_rcu);
+ else
+ kfree_rcu(local_storage, rcu);
+ }
}
void bpf_selem_link_storage_nolock(struct bpf_local_storage *local_storage,
@@ -214,14 +223,14 @@ void bpf_selem_link_map(struct bpf_local_storage_map *smap,
raw_spin_unlock_irqrestore(&b->lock, flags);
}
-void bpf_selem_unlink(struct bpf_local_storage_elem *selem)
+void bpf_selem_unlink(struct bpf_local_storage_elem *selem, bool use_trace_rcu)
{
/* Always unlink from map before unlinking from local_storage
* because selem will be freed after successfully unlinked from
* the local_storage.
*/
bpf_selem_unlink_map(selem);
- __bpf_selem_unlink_storage(selem);
+ __bpf_selem_unlink_storage(selem, use_trace_rcu);
}
struct bpf_local_storage_data *
@@ -466,7 +475,7 @@ bpf_local_storage_update(void *owner, struct bpf_local_storage_map *smap,
if (old_sdata) {
bpf_selem_unlink_map(SELEM(old_sdata));
bpf_selem_unlink_storage_nolock(local_storage, SELEM(old_sdata),
- false);
+ false, true);
}
unlock:
@@ -548,7 +557,7 @@ void bpf_local_storage_map_free(struct bpf_local_storage_map *smap,
migrate_disable();
__this_cpu_inc(*busy_counter);
}
- bpf_selem_unlink(selem);
+ bpf_selem_unlink(selem, false);
if (busy_counter) {
__this_cpu_dec(*busy_counter);
migrate_enable();
diff --git a/kernel/bpf/bpf_lru_list.h b/kernel/bpf/bpf_lru_list.h
index 6b12f06ee18c..4ea227c9c1ad 100644
--- a/kernel/bpf/bpf_lru_list.h
+++ b/kernel/bpf/bpf_lru_list.h
@@ -4,6 +4,7 @@
#ifndef __BPF_LRU_LIST_H_
#define __BPF_LRU_LIST_H_
+#include <linux/cache.h>
#include <linux/list.h>
#include <linux/spinlock_types.h>
diff --git a/kernel/bpf/bpf_lsm.c b/kernel/bpf/bpf_lsm.c
index 064eccba641d..c1351df9f7ee 100644
--- a/kernel/bpf/bpf_lsm.c
+++ b/kernel/bpf/bpf_lsm.c
@@ -117,6 +117,21 @@ static const struct bpf_func_proto bpf_ima_file_hash_proto = {
.allowed = bpf_ima_inode_hash_allowed,
};
+BPF_CALL_1(bpf_get_attach_cookie, void *, ctx)
+{
+ struct bpf_trace_run_ctx *run_ctx;
+
+ run_ctx = container_of(current->bpf_ctx, struct bpf_trace_run_ctx, run_ctx);
+ return run_ctx->bpf_cookie;
+}
+
+static const struct bpf_func_proto bpf_get_attach_cookie_proto = {
+ .func = bpf_get_attach_cookie,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_CTX,
+};
+
static const struct bpf_func_proto *
bpf_lsm_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
{
@@ -141,6 +156,8 @@ bpf_lsm_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
return prog->aux->sleepable ? &bpf_ima_inode_hash_proto : NULL;
case BPF_FUNC_ima_file_hash:
return prog->aux->sleepable ? &bpf_ima_file_hash_proto : NULL;
+ case BPF_FUNC_get_attach_cookie:
+ return bpf_prog_has_trampoline(prog) ? &bpf_get_attach_cookie_proto : NULL;
default:
return tracing_prog_func_proto(func_id, prog);
}
diff --git a/kernel/bpf/bpf_struct_ops.c b/kernel/bpf/bpf_struct_ops.c
index 21069dbe9138..d9a3c9207240 100644
--- a/kernel/bpf/bpf_struct_ops.c
+++ b/kernel/bpf/bpf_struct_ops.c
@@ -10,6 +10,7 @@
#include <linux/seq_file.h>
#include <linux/refcount.h>
#include <linux/mutex.h>
+#include <linux/btf_ids.h>
enum bpf_struct_ops_state {
BPF_STRUCT_OPS_STATE_INIT,
@@ -32,15 +33,15 @@ struct bpf_struct_ops_map {
const struct bpf_struct_ops *st_ops;
/* protect map_update */
struct mutex lock;
- /* progs has all the bpf_prog that is populated
+ /* link has all the bpf_links that is populated
* to the func ptr of the kernel's struct
* (in kvalue.data).
*/
- struct bpf_prog **progs;
+ struct bpf_link **links;
/* image is a page that has all the trampolines
* that stores the func args before calling the bpf_prog.
* A PAGE_SIZE "image" is enough to store all trampoline for
- * "progs[]".
+ * "links[]".
*/
void *image;
/* uvalue->data stores the kernel struct
@@ -263,7 +264,7 @@ int bpf_struct_ops_map_sys_lookup_elem(struct bpf_map *map, void *key,
/* No lock is needed. state and refcnt do not need
* to be updated together under atomic context.
*/
- uvalue = (struct bpf_struct_ops_value *)value;
+ uvalue = value;
memcpy(uvalue, st_map->uvalue, map->value_size);
uvalue->state = state;
refcount_set(&uvalue->refcnt, refcount_read(&kvalue->refcnt));
@@ -282,9 +283,9 @@ static void bpf_struct_ops_map_put_progs(struct bpf_struct_ops_map *st_map)
u32 i;
for (i = 0; i < btf_type_vlen(t); i++) {
- if (st_map->progs[i]) {
- bpf_prog_put(st_map->progs[i]);
- st_map->progs[i] = NULL;
+ if (st_map->links[i]) {
+ bpf_link_put(st_map->links[i]);
+ st_map->links[i] = NULL;
}
}
}
@@ -315,18 +316,34 @@ static int check_zero_holes(const struct btf_type *t, void *data)
return 0;
}
-int bpf_struct_ops_prepare_trampoline(struct bpf_tramp_progs *tprogs,
- struct bpf_prog *prog,
+static void bpf_struct_ops_link_release(struct bpf_link *link)
+{
+}
+
+static void bpf_struct_ops_link_dealloc(struct bpf_link *link)
+{
+ struct bpf_tramp_link *tlink = container_of(link, struct bpf_tramp_link, link);
+
+ kfree(tlink);
+}
+
+const struct bpf_link_ops bpf_struct_ops_link_lops = {
+ .release = bpf_struct_ops_link_release,
+ .dealloc = bpf_struct_ops_link_dealloc,
+};
+
+int bpf_struct_ops_prepare_trampoline(struct bpf_tramp_links *tlinks,
+ struct bpf_tramp_link *link,
const struct btf_func_model *model,
void *image, void *image_end)
{
u32 flags;
- tprogs[BPF_TRAMP_FENTRY].progs[0] = prog;
- tprogs[BPF_TRAMP_FENTRY].nr_progs = 1;
+ tlinks[BPF_TRAMP_FENTRY].links[0] = link;
+ tlinks[BPF_TRAMP_FENTRY].nr_links = 1;
flags = model->ret_size > 0 ? BPF_TRAMP_F_RET_FENTRY_RET : 0;
return arch_prepare_bpf_trampoline(NULL, image, image_end,
- model, flags, tprogs, NULL);
+ model, flags, tlinks, NULL);
}
static int bpf_struct_ops_map_update_elem(struct bpf_map *map, void *key,
@@ -337,7 +354,7 @@ static int bpf_struct_ops_map_update_elem(struct bpf_map *map, void *key,
struct bpf_struct_ops_value *uvalue, *kvalue;
const struct btf_member *member;
const struct btf_type *t = st_ops->type;
- struct bpf_tramp_progs *tprogs = NULL;
+ struct bpf_tramp_links *tlinks = NULL;
void *udata, *kdata;
int prog_fd, err = 0;
void *image, *image_end;
@@ -353,7 +370,7 @@ static int bpf_struct_ops_map_update_elem(struct bpf_map *map, void *key,
if (err)
return err;
- uvalue = (struct bpf_struct_ops_value *)value;
+ uvalue = value;
err = check_zero_holes(t, uvalue->data);
if (err)
return err;
@@ -361,8 +378,8 @@ static int bpf_struct_ops_map_update_elem(struct bpf_map *map, void *key,
if (uvalue->state || refcount_read(&uvalue->refcnt))
return -EINVAL;
- tprogs = kcalloc(BPF_TRAMP_MAX, sizeof(*tprogs), GFP_KERNEL);
- if (!tprogs)
+ tlinks = kcalloc(BPF_TRAMP_MAX, sizeof(*tlinks), GFP_KERNEL);
+ if (!tlinks)
return -ENOMEM;
uvalue = (struct bpf_struct_ops_value *)st_map->uvalue;
@@ -385,6 +402,7 @@ static int bpf_struct_ops_map_update_elem(struct bpf_map *map, void *key,
for_each_member(i, t, member) {
const struct btf_type *mtype, *ptype;
struct bpf_prog *prog;
+ struct bpf_tramp_link *link;
u32 moff;
moff = __btf_member_bit_offset(t, member) / 8;
@@ -438,16 +456,26 @@ static int bpf_struct_ops_map_update_elem(struct bpf_map *map, void *key,
err = PTR_ERR(prog);
goto reset_unlock;
}
- st_map->progs[i] = prog;
if (prog->type != BPF_PROG_TYPE_STRUCT_OPS ||
prog->aux->attach_btf_id != st_ops->type_id ||
prog->expected_attach_type != i) {
+ bpf_prog_put(prog);
err = -EINVAL;
goto reset_unlock;
}
- err = bpf_struct_ops_prepare_trampoline(tprogs, prog,
+ link = kzalloc(sizeof(*link), GFP_USER);
+ if (!link) {
+ bpf_prog_put(prog);
+ err = -ENOMEM;
+ goto reset_unlock;
+ }
+ bpf_link_init(&link->link, BPF_LINK_TYPE_STRUCT_OPS,
+ &bpf_struct_ops_link_lops, prog);
+ st_map->links[i] = &link->link;
+
+ err = bpf_struct_ops_prepare_trampoline(tlinks, link,
&st_ops->func_models[i],
image, image_end);
if (err < 0)
@@ -490,7 +518,7 @@ reset_unlock:
memset(uvalue, 0, map->value_size);
memset(kvalue, 0, map->value_size);
unlock:
- kfree(tprogs);
+ kfree(tlinks);
mutex_unlock(&st_map->lock);
return err;
}
@@ -545,9 +573,9 @@ static void bpf_struct_ops_map_free(struct bpf_map *map)
{
struct bpf_struct_ops_map *st_map = (struct bpf_struct_ops_map *)map;
- if (st_map->progs)
+ if (st_map->links)
bpf_struct_ops_map_put_progs(st_map);
- bpf_map_area_free(st_map->progs);
+ bpf_map_area_free(st_map->links);
bpf_jit_free_exec(st_map->image);
bpf_map_area_free(st_map->uvalue);
bpf_map_area_free(st_map);
@@ -596,11 +624,11 @@ static struct bpf_map *bpf_struct_ops_map_alloc(union bpf_attr *attr)
map = &st_map->map;
st_map->uvalue = bpf_map_area_alloc(vt->size, NUMA_NO_NODE);
- st_map->progs =
- bpf_map_area_alloc(btf_type_vlen(t) * sizeof(struct bpf_prog *),
+ st_map->links =
+ bpf_map_area_alloc(btf_type_vlen(t) * sizeof(struct bpf_links *),
NUMA_NO_NODE);
st_map->image = bpf_jit_alloc_exec(PAGE_SIZE);
- if (!st_map->uvalue || !st_map->progs || !st_map->image) {
+ if (!st_map->uvalue || !st_map->links || !st_map->image) {
bpf_struct_ops_map_free(map);
return ERR_PTR(-ENOMEM);
}
@@ -612,7 +640,7 @@ static struct bpf_map *bpf_struct_ops_map_alloc(union bpf_attr *attr)
return map;
}
-static int bpf_struct_ops_map_btf_id;
+BTF_ID_LIST_SINGLE(bpf_struct_ops_map_btf_ids, struct, bpf_struct_ops_map)
const struct bpf_map_ops bpf_struct_ops_map_ops = {
.map_alloc_check = bpf_struct_ops_map_alloc_check,
.map_alloc = bpf_struct_ops_map_alloc,
@@ -622,8 +650,7 @@ const struct bpf_map_ops bpf_struct_ops_map_ops = {
.map_delete_elem = bpf_struct_ops_map_delete_elem,
.map_update_elem = bpf_struct_ops_map_update_elem,
.map_seq_show_elem = bpf_struct_ops_map_seq_show_elem,
- .map_btf_name = "bpf_struct_ops_map",
- .map_btf_id = &bpf_struct_ops_map_btf_id,
+ .map_btf_id = &bpf_struct_ops_map_btf_ids[0],
};
/* "const void *" because some subsystem is
diff --git a/kernel/bpf/bpf_task_storage.c b/kernel/bpf/bpf_task_storage.c
index 6638a0ecc3d2..e9014dc62682 100644
--- a/kernel/bpf/bpf_task_storage.c
+++ b/kernel/bpf/bpf_task_storage.c
@@ -102,7 +102,7 @@ void bpf_task_storage_free(struct task_struct *task)
*/
bpf_selem_unlink_map(selem);
free_task_storage = bpf_selem_unlink_storage_nolock(
- local_storage, selem, false);
+ local_storage, selem, false, false);
}
raw_spin_unlock_irqrestore(&local_storage->lock, flags);
bpf_task_storage_unlock();
@@ -192,7 +192,7 @@ static int task_storage_delete(struct task_struct *task, struct bpf_map *map)
if (!sdata)
return -ENOENT;
- bpf_selem_unlink(SELEM(sdata));
+ bpf_selem_unlink(SELEM(sdata), true);
return 0;
}
@@ -307,7 +307,7 @@ static void task_storage_map_free(struct bpf_map *map)
bpf_local_storage_map_free(smap, &bpf_task_storage_busy);
}
-static int task_storage_map_btf_id;
+BTF_ID_LIST_SINGLE(task_storage_map_btf_ids, struct, bpf_local_storage_map)
const struct bpf_map_ops task_storage_map_ops = {
.map_meta_equal = bpf_map_meta_equal,
.map_alloc_check = bpf_local_storage_map_alloc_check,
@@ -318,8 +318,7 @@ const struct bpf_map_ops task_storage_map_ops = {
.map_update_elem = bpf_pid_task_storage_update_elem,
.map_delete_elem = bpf_pid_task_storage_delete_elem,
.map_check_btf = bpf_local_storage_map_check_btf,
- .map_btf_name = "bpf_local_storage_map",
- .map_btf_id = &task_storage_map_btf_id,
+ .map_btf_id = &task_storage_map_btf_ids[0],
.map_owner_storage_ptr = task_storage_ptr,
};
diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c
index 0918a39279f6..7bccaa4646e5 100644
--- a/kernel/bpf/btf.c
+++ b/kernel/bpf/btf.c
@@ -202,17 +202,25 @@ enum btf_kfunc_hook {
BTF_KFUNC_HOOK_XDP,
BTF_KFUNC_HOOK_TC,
BTF_KFUNC_HOOK_STRUCT_OPS,
+ BTF_KFUNC_HOOK_TRACING,
+ BTF_KFUNC_HOOK_SYSCALL,
BTF_KFUNC_HOOK_MAX,
};
enum {
BTF_KFUNC_SET_MAX_CNT = 32,
+ BTF_DTOR_KFUNC_MAX_CNT = 256,
};
struct btf_kfunc_set_tab {
struct btf_id_set *sets[BTF_KFUNC_HOOK_MAX][BTF_KFUNC_TYPE_MAX];
};
+struct btf_id_dtor_kfunc_tab {
+ u32 cnt;
+ struct btf_id_dtor_kfunc dtors[];
+};
+
struct btf {
void *data;
struct btf_type **types;
@@ -228,6 +236,7 @@ struct btf {
u32 id;
struct rcu_head rcu;
struct btf_kfunc_set_tab *kfunc_set_tab;
+ struct btf_id_dtor_kfunc_tab *dtor_kfunc_tab;
/* split BTF support */
struct btf *base_btf;
@@ -1616,8 +1625,19 @@ free_tab:
btf->kfunc_set_tab = NULL;
}
+static void btf_free_dtor_kfunc_tab(struct btf *btf)
+{
+ struct btf_id_dtor_kfunc_tab *tab = btf->dtor_kfunc_tab;
+
+ if (!tab)
+ return;
+ kfree(tab);
+ btf->dtor_kfunc_tab = NULL;
+}
+
static void btf_free(struct btf *btf)
{
+ btf_free_dtor_kfunc_tab(btf);
btf_free_kfunc_set_tab(btf);
kvfree(btf->types);
kvfree(btf->resolved_sizes);
@@ -3163,24 +3183,86 @@ static void btf_struct_log(struct btf_verifier_env *env,
btf_verifier_log(env, "size=%u vlen=%u", t->size, btf_type_vlen(t));
}
+enum btf_field_type {
+ BTF_FIELD_SPIN_LOCK,
+ BTF_FIELD_TIMER,
+ BTF_FIELD_KPTR,
+};
+
+enum {
+ BTF_FIELD_IGNORE = 0,
+ BTF_FIELD_FOUND = 1,
+};
+
+struct btf_field_info {
+ u32 type_id;
+ u32 off;
+ enum bpf_kptr_type type;
+};
+
+static int btf_find_struct(const struct btf *btf, const struct btf_type *t,
+ u32 off, int sz, struct btf_field_info *info)
+{
+ if (!__btf_type_is_struct(t))
+ return BTF_FIELD_IGNORE;
+ if (t->size != sz)
+ return BTF_FIELD_IGNORE;
+ info->off = off;
+ return BTF_FIELD_FOUND;
+}
+
+static int btf_find_kptr(const struct btf *btf, const struct btf_type *t,
+ u32 off, int sz, struct btf_field_info *info)
+{
+ enum bpf_kptr_type type;
+ u32 res_id;
+
+ /* For PTR, sz is always == 8 */
+ if (!btf_type_is_ptr(t))
+ return BTF_FIELD_IGNORE;
+ t = btf_type_by_id(btf, t->type);
+
+ if (!btf_type_is_type_tag(t))
+ return BTF_FIELD_IGNORE;
+ /* Reject extra tags */
+ if (btf_type_is_type_tag(btf_type_by_id(btf, t->type)))
+ return -EINVAL;
+ if (!strcmp("kptr", __btf_name_by_offset(btf, t->name_off)))
+ type = BPF_KPTR_UNREF;
+ else if (!strcmp("kptr_ref", __btf_name_by_offset(btf, t->name_off)))
+ type = BPF_KPTR_REF;
+ else
+ return -EINVAL;
+
+ /* Get the base type */
+ t = btf_type_skip_modifiers(btf, t->type, &res_id);
+ /* Only pointer to struct is allowed */
+ if (!__btf_type_is_struct(t))
+ return -EINVAL;
+
+ info->type_id = res_id;
+ info->off = off;
+ info->type = type;
+ return BTF_FIELD_FOUND;
+}
+
static int btf_find_struct_field(const struct btf *btf, const struct btf_type *t,
- const char *name, int sz, int align)
+ const char *name, int sz, int align,
+ enum btf_field_type field_type,
+ struct btf_field_info *info, int info_cnt)
{
const struct btf_member *member;
- u32 i, off = -ENOENT;
+ struct btf_field_info tmp;
+ int ret, idx = 0;
+ u32 i, off;
for_each_member(i, t, member) {
const struct btf_type *member_type = btf_type_by_id(btf,
member->type);
- if (!__btf_type_is_struct(member_type))
- continue;
- if (member_type->size != sz)
- continue;
- if (strcmp(__btf_name_by_offset(btf, member_type->name_off), name))
+
+ if (name && strcmp(__btf_name_by_offset(btf, member_type->name_off), name))
continue;
- if (off != -ENOENT)
- /* only one such field is allowed */
- return -E2BIG;
+
off = __btf_member_bit_offset(t, member);
if (off % 8)
/* valid C code cannot generate such BTF */
@@ -3188,46 +3270,115 @@ static int btf_find_struct_field(const struct btf *btf, const struct btf_type *t
off /= 8;
if (off % align)
return -EINVAL;
+
+ switch (field_type) {
+ case BTF_FIELD_SPIN_LOCK:
+ case BTF_FIELD_TIMER:
+ ret = btf_find_struct(btf, member_type, off, sz,
+ idx < info_cnt ? &info[idx] : &tmp);
+ if (ret < 0)
+ return ret;
+ break;
+ case BTF_FIELD_KPTR:
+ ret = btf_find_kptr(btf, member_type, off, sz,
+ idx < info_cnt ? &info[idx] : &tmp);
+ if (ret < 0)
+ return ret;
+ break;
+ default:
+ return -EFAULT;
+ }
+
+ if (ret == BTF_FIELD_IGNORE)
+ continue;
+ if (idx >= info_cnt)
+ return -E2BIG;
+ ++idx;
}
- return off;
+ return idx;
}
static int btf_find_datasec_var(const struct btf *btf, const struct btf_type *t,
- const char *name, int sz, int align)
+ const char *name, int sz, int align,
+ enum btf_field_type field_type,
+ struct btf_field_info *info, int info_cnt)
{
const struct btf_var_secinfo *vsi;
- u32 i, off = -ENOENT;
+ struct btf_field_info tmp;
+ int ret, idx = 0;
+ u32 i, off;
for_each_vsi(i, t, vsi) {
const struct btf_type *var = btf_type_by_id(btf, vsi->type);
const struct btf_type *var_type = btf_type_by_id(btf, var->type);
- if (!__btf_type_is_struct(var_type))
- continue;
- if (var_type->size != sz)
+ off = vsi->offset;
+
+ if (name && strcmp(__btf_name_by_offset(btf, var_type->name_off), name))
continue;
if (vsi->size != sz)
continue;
- if (strcmp(__btf_name_by_offset(btf, var_type->name_off), name))
- continue;
- if (off != -ENOENT)
- /* only one such field is allowed */
- return -E2BIG;
- off = vsi->offset;
if (off % align)
return -EINVAL;
+
+ switch (field_type) {
+ case BTF_FIELD_SPIN_LOCK:
+ case BTF_FIELD_TIMER:
+ ret = btf_find_struct(btf, var_type, off, sz,
+ idx < info_cnt ? &info[idx] : &tmp);
+ if (ret < 0)
+ return ret;
+ break;
+ case BTF_FIELD_KPTR:
+ ret = btf_find_kptr(btf, var_type, off, sz,
+ idx < info_cnt ? &info[idx] : &tmp);
+ if (ret < 0)
+ return ret;
+ break;
+ default:
+ return -EFAULT;
+ }
+
+ if (ret == BTF_FIELD_IGNORE)
+ continue;
+ if (idx >= info_cnt)
+ return -E2BIG;
+ ++idx;
}
- return off;
+ return idx;
}
static int btf_find_field(const struct btf *btf, const struct btf_type *t,
- const char *name, int sz, int align)
+ enum btf_field_type field_type,
+ struct btf_field_info *info, int info_cnt)
{
+ const char *name;
+ int sz, align;
+
+ switch (field_type) {
+ case BTF_FIELD_SPIN_LOCK:
+ name = "bpf_spin_lock";
+ sz = sizeof(struct bpf_spin_lock);
+ align = __alignof__(struct bpf_spin_lock);
+ break;
+ case BTF_FIELD_TIMER:
+ name = "bpf_timer";
+ sz = sizeof(struct bpf_timer);
+ align = __alignof__(struct bpf_timer);
+ break;
+ case BTF_FIELD_KPTR:
+ name = NULL;
+ sz = sizeof(u64);
+ align = 8;
+ break;
+ default:
+ return -EFAULT;
+ }
if (__btf_type_is_struct(t))
- return btf_find_struct_field(btf, t, name, sz, align);
+ return btf_find_struct_field(btf, t, name, sz, align, field_type, info, info_cnt);
else if (btf_type_is_datasec(t))
- return btf_find_datasec_var(btf, t, name, sz, align);
+ return btf_find_datasec_var(btf, t, name, sz, align, field_type, info, info_cnt);
return -EINVAL;
}
@@ -3237,16 +3388,130 @@ static int btf_find_field(const struct btf *btf, const struct btf_type *t,
*/
int btf_find_spin_lock(const struct btf *btf, const struct btf_type *t)
{
- return btf_find_field(btf, t, "bpf_spin_lock",
- sizeof(struct bpf_spin_lock),
- __alignof__(struct bpf_spin_lock));
+ struct btf_field_info info;
+ int ret;
+
+ ret = btf_find_field(btf, t, BTF_FIELD_SPIN_LOCK, &info, 1);
+ if (ret < 0)
+ return ret;
+ if (!ret)
+ return -ENOENT;
+ return info.off;
}
int btf_find_timer(const struct btf *btf, const struct btf_type *t)
{
- return btf_find_field(btf, t, "bpf_timer",
- sizeof(struct bpf_timer),
- __alignof__(struct bpf_timer));
+ struct btf_field_info info;
+ int ret;
+
+ ret = btf_find_field(btf, t, BTF_FIELD_TIMER, &info, 1);
+ if (ret < 0)
+ return ret;
+ if (!ret)
+ return -ENOENT;
+ return info.off;
+}
+
+struct bpf_map_value_off *btf_parse_kptrs(const struct btf *btf,
+ const struct btf_type *t)
+{
+ struct btf_field_info info_arr[BPF_MAP_VALUE_OFF_MAX];
+ struct bpf_map_value_off *tab;
+ struct btf *kernel_btf = NULL;
+ struct module *mod = NULL;
+ int ret, i, nr_off;
+
+ ret = btf_find_field(btf, t, BTF_FIELD_KPTR, info_arr, ARRAY_SIZE(info_arr));
+ if (ret < 0)
+ return ERR_PTR(ret);
+ if (!ret)
+ return NULL;
+
+ nr_off = ret;
+ tab = kzalloc(offsetof(struct bpf_map_value_off, off[nr_off]), GFP_KERNEL | __GFP_NOWARN);
+ if (!tab)
+ return ERR_PTR(-ENOMEM);
+
+ for (i = 0; i < nr_off; i++) {
+ const struct btf_type *t;
+ s32 id;
+
+ /* Find type in map BTF, and use it to look up the matching type
+ * in vmlinux or module BTFs, by name and kind.
+ */
+ t = btf_type_by_id(btf, info_arr[i].type_id);
+ id = bpf_find_btf_id(__btf_name_by_offset(btf, t->name_off), BTF_INFO_KIND(t->info),
+ &kernel_btf);
+ if (id < 0) {
+ ret = id;
+ goto end;
+ }
+
+ /* Find and stash the function pointer for the destruction function that
+ * needs to be eventually invoked from the map free path.
+ */
+ if (info_arr[i].type == BPF_KPTR_REF) {
+ const struct btf_type *dtor_func;
+ const char *dtor_func_name;
+ unsigned long addr;
+ s32 dtor_btf_id;
+
+ /* This call also serves as a whitelist of allowed objects that
+ * can be used as a referenced pointer and be stored in a map at
+ * the same time.
+ */
+ dtor_btf_id = btf_find_dtor_kfunc(kernel_btf, id);
+ if (dtor_btf_id < 0) {
+ ret = dtor_btf_id;
+ goto end_btf;
+ }
+
+ dtor_func = btf_type_by_id(kernel_btf, dtor_btf_id);
+ if (!dtor_func) {
+ ret = -ENOENT;
+ goto end_btf;
+ }
+
+ if (btf_is_module(kernel_btf)) {
+ mod = btf_try_get_module(kernel_btf);
+ if (!mod) {
+ ret = -ENXIO;
+ goto end_btf;
+ }
+ }
+
+ /* We already verified dtor_func to be btf_type_is_func
+ * in register_btf_id_dtor_kfuncs.
+ */
+ dtor_func_name = __btf_name_by_offset(kernel_btf, dtor_func->name_off);
+ addr = kallsyms_lookup_name(dtor_func_name);
+ if (!addr) {
+ ret = -EINVAL;
+ goto end_mod;
+ }
+ tab->off[i].kptr.dtor = (void *)addr;
+ }
+
+ tab->off[i].offset = info_arr[i].off;
+ tab->off[i].type = info_arr[i].type;
+ tab->off[i].kptr.btf_id = id;
+ tab->off[i].kptr.btf = kernel_btf;
+ tab->off[i].kptr.module = mod;
+ }
+ tab->nr_off = nr_off;
+ return tab;
+end_mod:
+ module_put(mod);
+end_btf:
+ btf_put(kernel_btf);
+end:
+ while (i--) {
+ btf_put(tab->off[i].kptr.btf);
+ if (tab->off[i].kptr.module)
+ module_put(tab->off[i].kptr.module);
+ }
+ kfree(tab);
+ return ERR_PTR(ret);
}
static void __btf_struct_show(const struct btf *btf, const struct btf_type *t,
@@ -4541,6 +4806,48 @@ static int btf_parse_hdr(struct btf_verifier_env *env)
return 0;
}
+static int btf_check_type_tags(struct btf_verifier_env *env,
+ struct btf *btf, int start_id)
+{
+ int i, n, good_id = start_id - 1;
+ bool in_tags;
+
+ n = btf_nr_types(btf);
+ for (i = start_id; i < n; i++) {
+ const struct btf_type *t;
+ u32 cur_id = i;
+
+ t = btf_type_by_id(btf, i);
+ if (!t)
+ return -EINVAL;
+ if (!btf_type_is_modifier(t))
+ continue;
+
+ cond_resched();
+
+ in_tags = btf_type_is_type_tag(t);
+ while (btf_type_is_modifier(t)) {
+ if (btf_type_is_type_tag(t)) {
+ if (!in_tags) {
+ btf_verifier_log(env, "Type tags don't precede modifiers");
+ return -EINVAL;
+ }
+ } else if (in_tags) {
+ in_tags = false;
+ }
+ if (cur_id <= good_id)
+ break;
+ /* Move to next type */
+ cur_id = t->type;
+ t = btf_type_by_id(btf, cur_id);
+ if (!t)
+ return -EINVAL;
+ }
+ good_id = i;
+ }
+ return 0;
+}
+
static struct btf *btf_parse(bpfptr_t btf_data, u32 btf_data_size,
u32 log_level, char __user *log_ubuf, u32 log_size)
{
@@ -4608,6 +4915,10 @@ static struct btf *btf_parse(bpfptr_t btf_data, u32 btf_data_size,
if (err)
goto errout;
+ err = btf_check_type_tags(env, btf, 1);
+ if (err)
+ goto errout;
+
if (log->level && bpf_verifier_log_full(log)) {
err = -ENOSPC;
goto errout;
@@ -4716,41 +5027,6 @@ btf_get_prog_ctx_type(struct bpf_verifier_log *log, const struct btf *btf,
return ctx_type;
}
-static const struct bpf_map_ops * const btf_vmlinux_map_ops[] = {
-#define BPF_PROG_TYPE(_id, _name, prog_ctx_type, kern_ctx_type)
-#define BPF_LINK_TYPE(_id, _name)
-#define BPF_MAP_TYPE(_id, _ops) \
- [_id] = &_ops,
-#include <linux/bpf_types.h>
-#undef BPF_PROG_TYPE
-#undef BPF_LINK_TYPE
-#undef BPF_MAP_TYPE
-};
-
-static int btf_vmlinux_map_ids_init(const struct btf *btf,
- struct bpf_verifier_log *log)
-{
- const struct bpf_map_ops *ops;
- int i, btf_id;
-
- for (i = 0; i < ARRAY_SIZE(btf_vmlinux_map_ops); ++i) {
- ops = btf_vmlinux_map_ops[i];
- if (!ops || (!ops->map_btf_name && !ops->map_btf_id))
- continue;
- if (!ops->map_btf_name || !ops->map_btf_id) {
- bpf_log(log, "map type %d is misconfigured\n", i);
- return -EINVAL;
- }
- btf_id = btf_find_by_name_kind(btf, ops->map_btf_name,
- BTF_KIND_STRUCT);
- if (btf_id < 0)
- return btf_id;
- *ops->map_btf_id = btf_id;
- }
-
- return 0;
-}
-
static int btf_translate_to_vmlinux(struct bpf_verifier_log *log,
struct btf *btf,
const struct btf_type *t,
@@ -4809,14 +5085,13 @@ struct btf *btf_parse_vmlinux(void)
if (err)
goto errout;
+ err = btf_check_type_tags(env, btf, 1);
+ if (err)
+ goto errout;
+
/* btf_parse_vmlinux() runs under bpf_verifier_lock */
bpf_ctx_convert.t = btf_type_by_id(btf, bpf_ctx_convert_btf_id[0]);
- /* find bpf map structs for map_ptr access checking */
- err = btf_vmlinux_map_ids_init(btf, log);
- if (err < 0)
- goto errout;
-
bpf_struct_ops_init(btf, log);
refcount_set(&btf->refcnt, 1);
@@ -4894,6 +5169,10 @@ static struct btf *btf_parse_module(const char *module_name, const void *data, u
if (err)
goto errout;
+ err = btf_check_type_tags(env, btf, btf_nr_types(base_btf));
+ if (err)
+ goto errout;
+
btf_verifier_env_free(env);
refcount_set(&btf->refcnt, 1);
return btf;
@@ -5429,7 +5708,8 @@ static bool btf_types_are_same(const struct btf *btf1, u32 id1,
bool btf_struct_ids_match(struct bpf_verifier_log *log,
const struct btf *btf, u32 id, int off,
- const struct btf *need_btf, u32 need_type_id)
+ const struct btf *need_btf, u32 need_type_id,
+ bool strict)
{
const struct btf_type *type;
enum bpf_type_flag flag;
@@ -5438,7 +5718,12 @@ bool btf_struct_ids_match(struct bpf_verifier_log *log,
/* Are we already done? */
if (off == 0 && btf_types_are_same(btf, id, need_btf, need_type_id))
return true;
-
+ /* In case of strict type match, we do not walk struct, the top level
+ * type match must succeed. When strict is true, off should have already
+ * been 0.
+ */
+ if (strict)
+ return false;
again:
type = btf_type_by_id(btf, id);
if (!type)
@@ -5772,11 +6057,11 @@ static int btf_check_func_arg_match(struct bpf_verifier_env *env,
struct bpf_verifier_log *log = &env->log;
u32 i, nargs, ref_id, ref_obj_id = 0;
bool is_kfunc = btf_is_kernel(btf);
+ bool rel = false, kptr_get = false;
const char *func_name, *ref_tname;
const struct btf_type *t, *ref_t;
const struct btf_param *args;
int ref_regno = 0, ret;
- bool rel = false;
t = btf_type_by_id(btf, func_id);
if (!t || !btf_type_is_func(t)) {
@@ -5802,14 +6087,19 @@ static int btf_check_func_arg_match(struct bpf_verifier_env *env,
return -EINVAL;
}
- /* Only kfunc can be release func */
- if (is_kfunc)
+ if (is_kfunc) {
+ /* Only kfunc can be release func */
rel = btf_kfunc_id_set_contains(btf, resolve_prog_type(env->prog),
BTF_KFUNC_TYPE_RELEASE, func_id);
+ kptr_get = btf_kfunc_id_set_contains(btf, resolve_prog_type(env->prog),
+ BTF_KFUNC_TYPE_KPTR_ACQUIRE, func_id);
+ }
+
/* check that BTF function arguments match actual types that the
* verifier sees.
*/
for (i = 0; i < nargs; i++) {
+ enum bpf_arg_type arg_type = ARG_DONTCARE;
u32 regno = i + 1;
struct bpf_reg_state *reg = &regs[regno];
@@ -5830,12 +6120,58 @@ static int btf_check_func_arg_match(struct bpf_verifier_env *env,
ref_t = btf_type_skip_modifiers(btf, t->type, &ref_id);
ref_tname = btf_name_by_offset(btf, ref_t->name_off);
- ret = check_func_arg_reg_off(env, reg, regno, ARG_DONTCARE, rel);
+ if (rel && reg->ref_obj_id)
+ arg_type |= OBJ_RELEASE;
+ ret = check_func_arg_reg_off(env, reg, regno, arg_type);
if (ret < 0)
return ret;
- if (btf_get_prog_ctx_type(log, btf, t,
- env->prog->type, i)) {
+ /* kptr_get is only true for kfunc */
+ if (i == 0 && kptr_get) {
+ struct bpf_map_value_off_desc *off_desc;
+
+ if (reg->type != PTR_TO_MAP_VALUE) {
+ bpf_log(log, "arg#0 expected pointer to map value\n");
+ return -EINVAL;
+ }
+
+ /* check_func_arg_reg_off allows var_off for
+ * PTR_TO_MAP_VALUE, but we need fixed offset to find
+ * off_desc.
+ */
+ if (!tnum_is_const(reg->var_off)) {
+ bpf_log(log, "arg#0 must have constant offset\n");
+ return -EINVAL;
+ }
+
+ off_desc = bpf_map_kptr_off_contains(reg->map_ptr, reg->off + reg->var_off.value);
+ if (!off_desc || off_desc->type != BPF_KPTR_REF) {
+ bpf_log(log, "arg#0 no referenced kptr at map value offset=%llu\n",
+ reg->off + reg->var_off.value);
+ return -EINVAL;
+ }
+
+ if (!btf_type_is_ptr(ref_t)) {
+ bpf_log(log, "arg#0 BTF type must be a double pointer\n");
+ return -EINVAL;
+ }
+
+ ref_t = btf_type_skip_modifiers(btf, ref_t->type, &ref_id);
+ ref_tname = btf_name_by_offset(btf, ref_t->name_off);
+
+ if (!btf_type_is_struct(ref_t)) {
+ bpf_log(log, "kernel function %s args#%d pointer type %s %s is not supported\n",
+ func_name, i, btf_type_str(ref_t), ref_tname);
+ return -EINVAL;
+ }
+ if (!btf_struct_ids_match(log, btf, ref_id, 0, off_desc->kptr.btf,
+ off_desc->kptr.btf_id, true)) {
+ bpf_log(log, "kernel function %s args#%d expected pointer to %s %s\n",
+ func_name, i, btf_type_str(ref_t), ref_tname);
+ return -EINVAL;
+ }
+ /* rest of the arguments can be anything, like normal kfunc */
+ } else if (btf_get_prog_ctx_type(log, btf, t, env->prog->type, i)) {
/* If function expects ctx type in BTF check that caller
* is passing PTR_TO_CTX.
*/
@@ -5862,11 +6198,7 @@ static int btf_check_func_arg_match(struct bpf_verifier_env *env,
if (reg->type == PTR_TO_BTF_ID) {
reg_btf = reg->btf;
reg_ref_id = reg->btf_id;
- /* Ensure only one argument is referenced
- * PTR_TO_BTF_ID, check_func_arg_reg_off relies
- * on only one referenced register being allowed
- * for kfuncs.
- */
+ /* Ensure only one argument is referenced PTR_TO_BTF_ID */
if (reg->ref_obj_id) {
if (ref_obj_id) {
bpf_log(log, "verifier internal error: more than one arg with ref_obj_id R%d %u %u\n",
@@ -5886,7 +6218,7 @@ static int btf_check_func_arg_match(struct bpf_verifier_env *env,
reg_ref_tname = btf_name_by_offset(reg_btf,
reg_ref_t->name_off);
if (!btf_struct_ids_match(log, reg_btf, reg_ref_id,
- reg->off, btf, ref_id)) {
+ reg->off, btf, ref_id, rel && reg->ref_obj_id)) {
bpf_log(log, "kernel function %s args#%d expected pointer to %s %s but R%d has a pointer to %s %s\n",
func_name, i,
btf_type_str(ref_t), ref_tname,
@@ -6780,6 +7112,10 @@ static int bpf_prog_type_to_kfunc_hook(enum bpf_prog_type prog_type)
return BTF_KFUNC_HOOK_TC;
case BPF_PROG_TYPE_STRUCT_OPS:
return BTF_KFUNC_HOOK_STRUCT_OPS;
+ case BPF_PROG_TYPE_TRACING:
+ return BTF_KFUNC_HOOK_TRACING;
+ case BPF_PROG_TYPE_SYSCALL:
+ return BTF_KFUNC_HOOK_SYSCALL;
default:
return BTF_KFUNC_HOOK_MAX;
}
@@ -6832,6 +7168,138 @@ int register_btf_kfunc_id_set(enum bpf_prog_type prog_type,
}
EXPORT_SYMBOL_GPL(register_btf_kfunc_id_set);
+s32 btf_find_dtor_kfunc(struct btf *btf, u32 btf_id)
+{
+ struct btf_id_dtor_kfunc_tab *tab = btf->dtor_kfunc_tab;
+ struct btf_id_dtor_kfunc *dtor;
+
+ if (!tab)
+ return -ENOENT;
+ /* Even though the size of tab->dtors[0] is > sizeof(u32), we only need
+ * to compare the first u32 with btf_id, so we can reuse btf_id_cmp_func.
+ */
+ BUILD_BUG_ON(offsetof(struct btf_id_dtor_kfunc, btf_id) != 0);
+ dtor = bsearch(&btf_id, tab->dtors, tab->cnt, sizeof(tab->dtors[0]), btf_id_cmp_func);
+ if (!dtor)
+ return -ENOENT;
+ return dtor->kfunc_btf_id;
+}
+
+static int btf_check_dtor_kfuncs(struct btf *btf, const struct btf_id_dtor_kfunc *dtors, u32 cnt)
+{
+ const struct btf_type *dtor_func, *dtor_func_proto, *t;
+ const struct btf_param *args;
+ s32 dtor_btf_id;
+ u32 nr_args, i;
+
+ for (i = 0; i < cnt; i++) {
+ dtor_btf_id = dtors[i].kfunc_btf_id;
+
+ dtor_func = btf_type_by_id(btf, dtor_btf_id);
+ if (!dtor_func || !btf_type_is_func(dtor_func))
+ return -EINVAL;
+
+ dtor_func_proto = btf_type_by_id(btf, dtor_func->type);
+ if (!dtor_func_proto || !btf_type_is_func_proto(dtor_func_proto))
+ return -EINVAL;
+
+ /* Make sure the prototype of the destructor kfunc is 'void func(type *)' */
+ t = btf_type_by_id(btf, dtor_func_proto->type);
+ if (!t || !btf_type_is_void(t))
+ return -EINVAL;
+
+ nr_args = btf_type_vlen(dtor_func_proto);
+ if (nr_args != 1)
+ return -EINVAL;
+ args = btf_params(dtor_func_proto);
+ t = btf_type_by_id(btf, args[0].type);
+ /* Allow any pointer type, as width on targets Linux supports
+ * will be same for all pointer types (i.e. sizeof(void *))
+ */
+ if (!t || !btf_type_is_ptr(t))
+ return -EINVAL;
+ }
+ return 0;
+}
+
+/* This function must be invoked only from initcalls/module init functions */
+int register_btf_id_dtor_kfuncs(const struct btf_id_dtor_kfunc *dtors, u32 add_cnt,
+ struct module *owner)
+{
+ struct btf_id_dtor_kfunc_tab *tab;
+ struct btf *btf;
+ u32 tab_cnt;
+ int ret;
+
+ btf = btf_get_module_btf(owner);
+ if (!btf) {
+ if (!owner && IS_ENABLED(CONFIG_DEBUG_INFO_BTF)) {
+ pr_err("missing vmlinux BTF, cannot register dtor kfuncs\n");
+ return -ENOENT;
+ }
+ if (owner && IS_ENABLED(CONFIG_DEBUG_INFO_BTF_MODULES)) {
+ pr_err("missing module BTF, cannot register dtor kfuncs\n");
+ return -ENOENT;
+ }
+ return 0;
+ }
+ if (IS_ERR(btf))
+ return PTR_ERR(btf);
+
+ if (add_cnt >= BTF_DTOR_KFUNC_MAX_CNT) {
+ pr_err("cannot register more than %d kfunc destructors\n", BTF_DTOR_KFUNC_MAX_CNT);
+ ret = -E2BIG;
+ goto end;
+ }
+
+ /* Ensure that the prototype of dtor kfuncs being registered is sane */
+ ret = btf_check_dtor_kfuncs(btf, dtors, add_cnt);
+ if (ret < 0)
+ goto end;
+
+ tab = btf->dtor_kfunc_tab;
+ /* Only one call allowed for modules */
+ if (WARN_ON_ONCE(tab && btf_is_module(btf))) {
+ ret = -EINVAL;
+ goto end;
+ }
+
+ tab_cnt = tab ? tab->cnt : 0;
+ if (tab_cnt > U32_MAX - add_cnt) {
+ ret = -EOVERFLOW;
+ goto end;
+ }
+ if (tab_cnt + add_cnt >= BTF_DTOR_KFUNC_MAX_CNT) {
+ pr_err("cannot register more than %d kfunc destructors\n", BTF_DTOR_KFUNC_MAX_CNT);
+ ret = -E2BIG;
+ goto end;
+ }
+
+ tab = krealloc(btf->dtor_kfunc_tab,
+ offsetof(struct btf_id_dtor_kfunc_tab, dtors[tab_cnt + add_cnt]),
+ GFP_KERNEL | __GFP_NOWARN);
+ if (!tab) {
+ ret = -ENOMEM;
+ goto end;
+ }
+
+ if (!btf->dtor_kfunc_tab)
+ tab->cnt = 0;
+ btf->dtor_kfunc_tab = tab;
+
+ memcpy(tab->dtors + tab->cnt, dtors, add_cnt * sizeof(tab->dtors[0]));
+ tab->cnt += add_cnt;
+
+ sort(tab->dtors, tab->cnt, sizeof(tab->dtors[0]), btf_id_cmp_func, NULL);
+
+ return 0;
+end:
+ btf_free_dtor_kfunc_tab(btf);
+ btf_put(btf);
+ return ret;
+}
+EXPORT_SYMBOL_GPL(register_btf_id_dtor_kfuncs);
+
#define MAX_TYPES_ARE_COMPAT_DEPTH 2
static
diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c
index 128028efda64..afb414b26d01 100644
--- a/kernel/bpf/cgroup.c
+++ b/kernel/bpf/cgroup.c
@@ -22,6 +22,45 @@
DEFINE_STATIC_KEY_ARRAY_FALSE(cgroup_bpf_enabled_key, MAX_CGROUP_BPF_ATTACH_TYPE);
EXPORT_SYMBOL(cgroup_bpf_enabled_key);
+/* __always_inline is necessary to prevent indirect call through run_prog
+ * function pointer.
+ */
+static __always_inline int
+bpf_prog_run_array_cg(const struct cgroup_bpf *cgrp,
+ enum cgroup_bpf_attach_type atype,
+ const void *ctx, bpf_prog_run_fn run_prog,
+ int retval, u32 *ret_flags)
+{
+ const struct bpf_prog_array_item *item;
+ const struct bpf_prog *prog;
+ const struct bpf_prog_array *array;
+ struct bpf_run_ctx *old_run_ctx;
+ struct bpf_cg_run_ctx run_ctx;
+ u32 func_ret;
+
+ run_ctx.retval = retval;
+ migrate_disable();
+ rcu_read_lock();
+ array = rcu_dereference(cgrp->effective[atype]);
+ item = &array->items[0];
+ old_run_ctx = bpf_set_run_ctx(&run_ctx.run_ctx);
+ while ((prog = READ_ONCE(item->prog))) {
+ run_ctx.prog_item = item;
+ func_ret = run_prog(prog, ctx);
+ if (ret_flags) {
+ *(ret_flags) |= (func_ret >> 1);
+ func_ret &= 1;
+ }
+ if (!func_ret && !IS_ERR_VALUE((long)run_ctx.retval))
+ run_ctx.retval = -EPERM;
+ item++;
+ }
+ bpf_reset_run_ctx(old_run_ctx);
+ rcu_read_unlock();
+ migrate_enable();
+ return run_ctx.retval;
+}
+
void cgroup_bpf_offline(struct cgroup *cgrp)
{
cgroup_get(cgrp);
@@ -1075,11 +1114,38 @@ int __cgroup_bpf_run_filter_skb(struct sock *sk,
bpf_compute_and_save_data_end(skb, &saved_data_end);
if (atype == CGROUP_INET_EGRESS) {
- ret = BPF_PROG_CGROUP_INET_EGRESS_RUN_ARRAY(
- cgrp->bpf.effective[atype], skb, __bpf_prog_run_save_cb);
+ u32 flags = 0;
+ bool cn;
+
+ ret = bpf_prog_run_array_cg(&cgrp->bpf, atype, skb,
+ __bpf_prog_run_save_cb, 0, &flags);
+
+ /* Return values of CGROUP EGRESS BPF programs are:
+ * 0: drop packet
+ * 1: keep packet
+ * 2: drop packet and cn
+ * 3: keep packet and cn
+ *
+ * The returned value is then converted to one of the NET_XMIT
+ * or an error code that is then interpreted as drop packet
+ * (and no cn):
+ * 0: NET_XMIT_SUCCESS skb should be transmitted
+ * 1: NET_XMIT_DROP skb should be dropped and cn
+ * 2: NET_XMIT_CN skb should be transmitted and cn
+ * 3: -err skb should be dropped
+ */
+
+ cn = flags & BPF_RET_SET_CN;
+ if (ret && !IS_ERR_VALUE((long)ret))
+ ret = -EFAULT;
+ if (!ret)
+ ret = (cn ? NET_XMIT_CN : NET_XMIT_SUCCESS);
+ else
+ ret = (cn ? NET_XMIT_DROP : ret);
} else {
- ret = BPF_PROG_RUN_ARRAY_CG(cgrp->bpf.effective[atype], skb,
- __bpf_prog_run_save_cb, 0);
+ ret = bpf_prog_run_array_cg(&cgrp->bpf, atype,
+ skb, __bpf_prog_run_save_cb, 0,
+ NULL);
if (ret && !IS_ERR_VALUE((long)ret))
ret = -EFAULT;
}
@@ -1109,8 +1175,8 @@ int __cgroup_bpf_run_filter_sk(struct sock *sk,
{
struct cgroup *cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data);
- return BPF_PROG_RUN_ARRAY_CG(cgrp->bpf.effective[atype], sk,
- bpf_prog_run, 0);
+ return bpf_prog_run_array_cg(&cgrp->bpf, atype, sk, bpf_prog_run, 0,
+ NULL);
}
EXPORT_SYMBOL(__cgroup_bpf_run_filter_sk);
@@ -1155,8 +1221,8 @@ int __cgroup_bpf_run_filter_sock_addr(struct sock *sk,
}
cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data);
- return BPF_PROG_RUN_ARRAY_CG_FLAGS(cgrp->bpf.effective[atype], &ctx,
- bpf_prog_run, 0, flags);
+ return bpf_prog_run_array_cg(&cgrp->bpf, atype, &ctx, bpf_prog_run,
+ 0, flags);
}
EXPORT_SYMBOL(__cgroup_bpf_run_filter_sock_addr);
@@ -1182,8 +1248,8 @@ int __cgroup_bpf_run_filter_sock_ops(struct sock *sk,
{
struct cgroup *cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data);
- return BPF_PROG_RUN_ARRAY_CG(cgrp->bpf.effective[atype], sock_ops,
- bpf_prog_run, 0);
+ return bpf_prog_run_array_cg(&cgrp->bpf, atype, sock_ops, bpf_prog_run,
+ 0, NULL);
}
EXPORT_SYMBOL(__cgroup_bpf_run_filter_sock_ops);
@@ -1200,8 +1266,8 @@ int __cgroup_bpf_check_dev_permission(short dev_type, u32 major, u32 minor,
rcu_read_lock();
cgrp = task_dfl_cgroup(current);
- ret = BPF_PROG_RUN_ARRAY_CG(cgrp->bpf.effective[atype], &ctx,
- bpf_prog_run, 0);
+ ret = bpf_prog_run_array_cg(&cgrp->bpf, atype, &ctx, bpf_prog_run, 0,
+ NULL);
rcu_read_unlock();
return ret;
@@ -1366,8 +1432,8 @@ int __cgroup_bpf_run_filter_sysctl(struct ctl_table_header *head,
rcu_read_lock();
cgrp = task_dfl_cgroup(current);
- ret = BPF_PROG_RUN_ARRAY_CG(cgrp->bpf.effective[atype], &ctx,
- bpf_prog_run, 0);
+ ret = bpf_prog_run_array_cg(&cgrp->bpf, atype, &ctx, bpf_prog_run, 0,
+ NULL);
rcu_read_unlock();
kfree(ctx.cur_val);
@@ -1459,8 +1525,8 @@ int __cgroup_bpf_run_filter_setsockopt(struct sock *sk, int *level,
}
lock_sock(sk);
- ret = BPF_PROG_RUN_ARRAY_CG(cgrp->bpf.effective[CGROUP_SETSOCKOPT],
- &ctx, bpf_prog_run, 0);
+ ret = bpf_prog_run_array_cg(&cgrp->bpf, CGROUP_SETSOCKOPT,
+ &ctx, bpf_prog_run, 0, NULL);
release_sock(sk);
if (ret)
@@ -1559,8 +1625,8 @@ int __cgroup_bpf_run_filter_getsockopt(struct sock *sk, int level,
}
lock_sock(sk);
- ret = BPF_PROG_RUN_ARRAY_CG(cgrp->bpf.effective[CGROUP_GETSOCKOPT],
- &ctx, bpf_prog_run, retval);
+ ret = bpf_prog_run_array_cg(&cgrp->bpf, CGROUP_GETSOCKOPT,
+ &ctx, bpf_prog_run, retval, NULL);
release_sock(sk);
if (ret < 0)
@@ -1608,8 +1674,8 @@ int __cgroup_bpf_run_filter_getsockopt_kern(struct sock *sk, int level,
* be called if that data shouldn't be "exported".
*/
- ret = BPF_PROG_RUN_ARRAY_CG(cgrp->bpf.effective[CGROUP_GETSOCKOPT],
- &ctx, bpf_prog_run, retval);
+ ret = bpf_prog_run_array_cg(&cgrp->bpf, CGROUP_GETSOCKOPT,
+ &ctx, bpf_prog_run, retval, NULL);
if (ret < 0)
return ret;
diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index 13e9dbeeedf3..5f6f3f829b36 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -873,7 +873,7 @@ static size_t select_bpf_prog_pack_size(void)
return size;
}
-static struct bpf_prog_pack *alloc_new_pack(void)
+static struct bpf_prog_pack *alloc_new_pack(bpf_jit_fill_hole_t bpf_fill_ill_insns)
{
struct bpf_prog_pack *pack;
@@ -886,6 +886,7 @@ static struct bpf_prog_pack *alloc_new_pack(void)
kfree(pack);
return NULL;
}
+ bpf_fill_ill_insns(pack->ptr, bpf_prog_pack_size);
bitmap_zero(pack->bitmap, bpf_prog_pack_size / BPF_PROG_CHUNK_SIZE);
list_add_tail(&pack->list, &pack_list);
@@ -895,7 +896,7 @@ static struct bpf_prog_pack *alloc_new_pack(void)
return pack;
}
-static void *bpf_prog_pack_alloc(u32 size)
+static void *bpf_prog_pack_alloc(u32 size, bpf_jit_fill_hole_t bpf_fill_ill_insns)
{
unsigned int nbits = BPF_PROG_SIZE_TO_NBITS(size);
struct bpf_prog_pack *pack;
@@ -910,6 +911,7 @@ static void *bpf_prog_pack_alloc(u32 size)
size = round_up(size, PAGE_SIZE);
ptr = module_alloc(size);
if (ptr) {
+ bpf_fill_ill_insns(ptr, size);
set_vm_flush_reset_perms(ptr);
set_memory_ro((unsigned long)ptr, size / PAGE_SIZE);
set_memory_x((unsigned long)ptr, size / PAGE_SIZE);
@@ -923,7 +925,7 @@ static void *bpf_prog_pack_alloc(u32 size)
goto found_free_area;
}
- pack = alloc_new_pack();
+ pack = alloc_new_pack(bpf_fill_ill_insns);
if (!pack)
goto out;
@@ -966,6 +968,9 @@ static void bpf_prog_pack_free(struct bpf_binary_header *hdr)
nbits = BPF_PROG_SIZE_TO_NBITS(hdr->size);
pos = ((unsigned long)hdr - (unsigned long)pack_ptr) >> BPF_PROG_CHUNK_SHIFT;
+ WARN_ONCE(bpf_arch_text_invalidate(hdr, hdr->size),
+ "bpf_prog_pack bug: missing bpf_arch_text_invalidate?\n");
+
bitmap_clear(pack->bitmap, pos, nbits);
if (bitmap_find_next_zero_area(pack->bitmap, bpf_prog_chunk_count(), 0,
bpf_prog_chunk_count(), 0) == 0) {
@@ -1102,7 +1107,7 @@ bpf_jit_binary_pack_alloc(unsigned int proglen, u8 **image_ptr,
if (bpf_jit_charge_modmem(size))
return NULL;
- ro_header = bpf_prog_pack_alloc(size);
+ ro_header = bpf_prog_pack_alloc(size, bpf_fill_ill_insns);
if (!ro_header) {
bpf_jit_uncharge_modmem(size);
return NULL;
@@ -1434,6 +1439,16 @@ struct bpf_prog *bpf_jit_blind_constants(struct bpf_prog *prog)
insn = clone->insnsi;
for (i = 0; i < insn_cnt; i++, insn++) {
+ if (bpf_pseudo_func(insn)) {
+ /* ld_imm64 with an address of bpf subprog is not
+ * a user controlled constant. Don't randomize it,
+ * since it will conflict with jit_subprogs() logic.
+ */
+ insn++;
+ i++;
+ continue;
+ }
+
/* We temporarily need to hold the original ld64 insn
* so that we can still access the first part in the
* second blinding run.
@@ -1938,6 +1953,11 @@ out:
CONT; \
LDX_MEM_##SIZEOP: \
DST = *(SIZE *)(unsigned long) (SRC + insn->off); \
+ CONT; \
+ LDX_PROBE_MEM_##SIZEOP: \
+ bpf_probe_read_kernel(&DST, sizeof(SIZE), \
+ (const void *)(long) (SRC + insn->off)); \
+ DST = *((SIZE *)&DST); \
CONT;
LDST(B, u8)
@@ -1945,15 +1965,6 @@ out:
LDST(W, u32)
LDST(DW, u64)
#undef LDST
-#define LDX_PROBE(SIZEOP, SIZE) \
- LDX_PROBE_MEM_##SIZEOP: \
- bpf_probe_read_kernel(&DST, SIZE, (const void *)(long) (SRC + insn->off)); \
- CONT;
- LDX_PROBE(B, 1)
- LDX_PROBE(H, 2)
- LDX_PROBE(W, 4)
- LDX_PROBE(DW, 8)
-#undef LDX_PROBE
#define ATOMIC_ALU_OP(BOP, KOP) \
case BOP: \
@@ -2619,6 +2630,7 @@ const struct bpf_func_proto bpf_map_delete_elem_proto __weak;
const struct bpf_func_proto bpf_map_push_elem_proto __weak;
const struct bpf_func_proto bpf_map_pop_elem_proto __weak;
const struct bpf_func_proto bpf_map_peek_elem_proto __weak;
+const struct bpf_func_proto bpf_map_lookup_percpu_elem_proto __weak;
const struct bpf_func_proto bpf_spin_lock_proto __weak;
const struct bpf_func_proto bpf_spin_unlock_proto __weak;
const struct bpf_func_proto bpf_jiffies64_proto __weak;
@@ -2727,6 +2739,11 @@ void * __weak bpf_arch_text_copy(void *dst, void *src, size_t len)
return ERR_PTR(-ENOTSUPP);
}
+int __weak bpf_arch_text_invalidate(void *dst, size_t len)
+{
+ return -ENOTSUPP;
+}
+
DEFINE_STATIC_KEY_FALSE(bpf_stats_enabled_key);
EXPORT_SYMBOL(bpf_stats_enabled_key);
diff --git a/kernel/bpf/cpumap.c b/kernel/bpf/cpumap.c
index 650e5d21f90d..f4860ac756cd 100644
--- a/kernel/bpf/cpumap.c
+++ b/kernel/bpf/cpumap.c
@@ -27,6 +27,7 @@
#include <linux/kthread.h>
#include <linux/capability.h>
#include <trace/events/xdp.h>
+#include <linux/btf_ids.h>
#include <linux/netdevice.h> /* netif_receive_skb_list */
#include <linux/etherdevice.h> /* eth_type_trans */
@@ -673,7 +674,7 @@ static int cpu_map_redirect(struct bpf_map *map, u32 ifindex, u64 flags)
__cpu_map_lookup_elem);
}
-static int cpu_map_btf_id;
+BTF_ID_LIST_SINGLE(cpu_map_btf_ids, struct, bpf_cpu_map)
const struct bpf_map_ops cpu_map_ops = {
.map_meta_equal = bpf_map_meta_equal,
.map_alloc = cpu_map_alloc,
@@ -683,8 +684,7 @@ const struct bpf_map_ops cpu_map_ops = {
.map_lookup_elem = cpu_map_lookup_elem,
.map_get_next_key = cpu_map_get_next_key,
.map_check_btf = map_check_no_btf,
- .map_btf_name = "bpf_cpu_map",
- .map_btf_id = &cpu_map_btf_id,
+ .map_btf_id = &cpu_map_btf_ids[0],
.map_redirect = cpu_map_redirect,
};
diff --git a/kernel/bpf/devmap.c b/kernel/bpf/devmap.c
index 038f6d7a83e4..c2867068e5bd 100644
--- a/kernel/bpf/devmap.c
+++ b/kernel/bpf/devmap.c
@@ -48,6 +48,7 @@
#include <net/xdp.h>
#include <linux/filter.h>
#include <trace/events/xdp.h>
+#include <linux/btf_ids.h>
#define DEV_CREATE_FLAG_MASK \
(BPF_F_NUMA_NODE | BPF_F_RDONLY | BPF_F_WRONLY)
@@ -1005,7 +1006,7 @@ static int dev_hash_map_redirect(struct bpf_map *map, u32 ifindex, u64 flags)
__dev_map_hash_lookup_elem);
}
-static int dev_map_btf_id;
+BTF_ID_LIST_SINGLE(dev_map_btf_ids, struct, bpf_dtab)
const struct bpf_map_ops dev_map_ops = {
.map_meta_equal = bpf_map_meta_equal,
.map_alloc = dev_map_alloc,
@@ -1015,12 +1016,10 @@ const struct bpf_map_ops dev_map_ops = {
.map_update_elem = dev_map_update_elem,
.map_delete_elem = dev_map_delete_elem,
.map_check_btf = map_check_no_btf,
- .map_btf_name = "bpf_dtab",
- .map_btf_id = &dev_map_btf_id,
+ .map_btf_id = &dev_map_btf_ids[0],
.map_redirect = dev_map_redirect,
};
-static int dev_map_hash_map_btf_id;
const struct bpf_map_ops dev_map_hash_ops = {
.map_meta_equal = bpf_map_meta_equal,
.map_alloc = dev_map_alloc,
@@ -1030,8 +1029,7 @@ const struct bpf_map_ops dev_map_hash_ops = {
.map_update_elem = dev_map_hash_update_elem,
.map_delete_elem = dev_map_hash_delete_elem,
.map_check_btf = map_check_no_btf,
- .map_btf_name = "bpf_dtab",
- .map_btf_id = &dev_map_hash_map_btf_id,
+ .map_btf_id = &dev_map_btf_ids[0],
.map_redirect = dev_hash_map_redirect,
};
diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c
index 65877967f414..17fb69c0e0dc 100644
--- a/kernel/bpf/hashtab.c
+++ b/kernel/bpf/hashtab.c
@@ -10,6 +10,7 @@
#include <linux/random.h>
#include <uapi/linux/btf.h>
#include <linux/rcupdate_trace.h>
+#include <linux/btf_ids.h>
#include "percpu_freelist.h"
#include "bpf_lru_list.h"
#include "map_in_map.h"
@@ -139,7 +140,7 @@ static inline bool htab_use_raw_lock(const struct bpf_htab *htab)
static void htab_init_buckets(struct bpf_htab *htab)
{
- unsigned i;
+ unsigned int i;
for (i = 0; i < htab->n_buckets; i++) {
INIT_HLIST_NULLS_HEAD(&htab->buckets[i].head, i);
@@ -238,7 +239,7 @@ static void htab_free_prealloced_timers(struct bpf_htab *htab)
u32 num_entries = htab->map.max_entries;
int i;
- if (likely(!map_value_has_timer(&htab->map)))
+ if (!map_value_has_timer(&htab->map))
return;
if (htab_has_extra_elems(htab))
num_entries += num_possible_cpus();
@@ -254,6 +255,25 @@ static void htab_free_prealloced_timers(struct bpf_htab *htab)
}
}
+static void htab_free_prealloced_kptrs(struct bpf_htab *htab)
+{
+ u32 num_entries = htab->map.max_entries;
+ int i;
+
+ if (!map_value_has_kptrs(&htab->map))
+ return;
+ if (htab_has_extra_elems(htab))
+ num_entries += num_possible_cpus();
+
+ for (i = 0; i < num_entries; i++) {
+ struct htab_elem *elem;
+
+ elem = get_htab_elem(htab, i);
+ bpf_map_free_kptrs(&htab->map, elem->key + round_up(htab->map.key_size, 8));
+ cond_resched();
+ }
+}
+
static void htab_free_elems(struct bpf_htab *htab)
{
int i;
@@ -725,12 +745,15 @@ static int htab_lru_map_gen_lookup(struct bpf_map *map,
return insn - insn_buf;
}
-static void check_and_free_timer(struct bpf_htab *htab, struct htab_elem *elem)
+static void check_and_free_fields(struct bpf_htab *htab,
+ struct htab_elem *elem)
{
- if (unlikely(map_value_has_timer(&htab->map)))
- bpf_timer_cancel_and_free(elem->key +
- round_up(htab->map.key_size, 8) +
- htab->map.timer_off);
+ void *map_value = elem->key + round_up(htab->map.key_size, 8);
+
+ if (map_value_has_timer(&htab->map))
+ bpf_timer_cancel_and_free(map_value + htab->map.timer_off);
+ if (map_value_has_kptrs(&htab->map))
+ bpf_map_free_kptrs(&htab->map, map_value);
}
/* It is called from the bpf_lru_list when the LRU needs to delete
@@ -738,7 +761,7 @@ static void check_and_free_timer(struct bpf_htab *htab, struct htab_elem *elem)
*/
static bool htab_lru_map_delete_node(void *arg, struct bpf_lru_node *node)
{
- struct bpf_htab *htab = (struct bpf_htab *)arg;
+ struct bpf_htab *htab = arg;
struct htab_elem *l = NULL, *tgt_l;
struct hlist_nulls_head *head;
struct hlist_nulls_node *n;
@@ -757,7 +780,7 @@ static bool htab_lru_map_delete_node(void *arg, struct bpf_lru_node *node)
hlist_nulls_for_each_entry_rcu(l, n, head, hash_node)
if (l == tgt_l) {
hlist_nulls_del_rcu(&l->hash_node);
- check_and_free_timer(htab, l);
+ check_and_free_fields(htab, l);
break;
}
@@ -829,7 +852,7 @@ static void htab_elem_free(struct bpf_htab *htab, struct htab_elem *l)
{
if (htab->map.map_type == BPF_MAP_TYPE_PERCPU_HASH)
free_percpu(htab_elem_get_ptr(l, htab->map.key_size));
- check_and_free_timer(htab, l);
+ check_and_free_fields(htab, l);
kfree(l);
}
@@ -857,7 +880,7 @@ static void free_htab_elem(struct bpf_htab *htab, struct htab_elem *l)
htab_put_fd_value(htab, l);
if (htab_is_prealloc(htab)) {
- check_and_free_timer(htab, l);
+ check_and_free_fields(htab, l);
__pcpu_freelist_push(&htab->freelist, &l->fnode);
} else {
atomic_dec(&htab->count);
@@ -1104,7 +1127,7 @@ static int htab_map_update_elem(struct bpf_map *map, void *key, void *value,
if (!htab_is_prealloc(htab))
free_htab_elem(htab, l_old);
else
- check_and_free_timer(htab, l_old);
+ check_and_free_fields(htab, l_old);
}
ret = 0;
err:
@@ -1114,7 +1137,7 @@ err:
static void htab_lru_push_free(struct bpf_htab *htab, struct htab_elem *elem)
{
- check_and_free_timer(htab, elem);
+ check_and_free_fields(htab, elem);
bpf_lru_push_free(&htab->lru, &elem->lru_node);
}
@@ -1419,8 +1442,14 @@ static void htab_free_malloced_timers(struct bpf_htab *htab)
struct hlist_nulls_node *n;
struct htab_elem *l;
- hlist_nulls_for_each_entry(l, n, head, hash_node)
- check_and_free_timer(htab, l);
+ hlist_nulls_for_each_entry(l, n, head, hash_node) {
+ /* We don't reset or free kptr on uref dropping to zero,
+ * hence just free timer.
+ */
+ bpf_timer_cancel_and_free(l->key +
+ round_up(htab->map.key_size, 8) +
+ htab->map.timer_off);
+ }
cond_resched_rcu();
}
rcu_read_unlock();
@@ -1430,7 +1459,8 @@ static void htab_map_free_timers(struct bpf_map *map)
{
struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
- if (likely(!map_value_has_timer(&htab->map)))
+ /* We don't reset or free kptr on uref dropping to zero. */
+ if (!map_value_has_timer(&htab->map))
return;
if (!htab_is_prealloc(htab))
htab_free_malloced_timers(htab);
@@ -1453,11 +1483,14 @@ static void htab_map_free(struct bpf_map *map)
* not have executed. Wait for them.
*/
rcu_barrier();
- if (!htab_is_prealloc(htab))
+ if (!htab_is_prealloc(htab)) {
delete_all_elements(htab);
- else
+ } else {
+ htab_free_prealloced_kptrs(htab);
prealloc_destroy(htab);
+ }
+ bpf_map_free_kptr_off_tab(map);
free_percpu(htab->extra_elems);
bpf_map_area_free(htab->buckets);
for (i = 0; i < HASHTAB_MAP_LOCK_COUNT; i++)
@@ -1594,7 +1627,7 @@ __htab_map_lookup_and_delete_batch(struct bpf_map *map,
void __user *uvalues = u64_to_user_ptr(attr->batch.values);
void __user *ukeys = u64_to_user_ptr(attr->batch.keys);
void __user *ubatch = u64_to_user_ptr(attr->batch.in_batch);
- u32 batch, max_count, size, bucket_size;
+ u32 batch, max_count, size, bucket_size, map_id;
struct htab_elem *node_to_free = NULL;
u64 elem_map_flags, map_flags;
struct hlist_nulls_head *head;
@@ -1719,6 +1752,14 @@ again_nocopy:
}
} else {
value = l->key + roundup_key_size;
+ if (map->map_type == BPF_MAP_TYPE_HASH_OF_MAPS) {
+ struct bpf_map **inner_map = value;
+
+ /* Actual value is the id of the inner map */
+ map_id = map->ops->map_fd_sys_lookup_elem(*inner_map);
+ value = &map_id;
+ }
+
if (elem_map_flags & BPF_F_LOCK)
copy_map_value_locked(map, dst_val, value,
true);
@@ -2105,7 +2146,7 @@ out:
return num_elems;
}
-static int htab_map_btf_id;
+BTF_ID_LIST_SINGLE(htab_map_btf_ids, struct, bpf_htab)
const struct bpf_map_ops htab_map_ops = {
.map_meta_equal = bpf_map_meta_equal,
.map_alloc_check = htab_map_alloc_check,
@@ -2122,12 +2163,10 @@ const struct bpf_map_ops htab_map_ops = {
.map_set_for_each_callback_args = map_set_for_each_callback_args,
.map_for_each_callback = bpf_for_each_hash_elem,
BATCH_OPS(htab),
- .map_btf_name = "bpf_htab",
- .map_btf_id = &htab_map_btf_id,
+ .map_btf_id = &htab_map_btf_ids[0],
.iter_seq_info = &iter_seq_info,
};
-static int htab_lru_map_btf_id;
const struct bpf_map_ops htab_lru_map_ops = {
.map_meta_equal = bpf_map_meta_equal,
.map_alloc_check = htab_map_alloc_check,
@@ -2145,8 +2184,7 @@ const struct bpf_map_ops htab_lru_map_ops = {
.map_set_for_each_callback_args = map_set_for_each_callback_args,
.map_for_each_callback = bpf_for_each_hash_elem,
BATCH_OPS(htab_lru),
- .map_btf_name = "bpf_htab",
- .map_btf_id = &htab_lru_map_btf_id,
+ .map_btf_id = &htab_map_btf_ids[0],
.iter_seq_info = &iter_seq_info,
};
@@ -2161,6 +2199,20 @@ static void *htab_percpu_map_lookup_elem(struct bpf_map *map, void *key)
return NULL;
}
+static void *htab_percpu_map_lookup_percpu_elem(struct bpf_map *map, void *key, u32 cpu)
+{
+ struct htab_elem *l;
+
+ if (cpu >= nr_cpu_ids)
+ return NULL;
+
+ l = __htab_map_lookup_elem(map, key);
+ if (l)
+ return per_cpu_ptr(htab_elem_get_ptr(l, map->key_size), cpu);
+ else
+ return NULL;
+}
+
static void *htab_lru_percpu_map_lookup_elem(struct bpf_map *map, void *key)
{
struct htab_elem *l = __htab_map_lookup_elem(map, key);
@@ -2173,6 +2225,22 @@ static void *htab_lru_percpu_map_lookup_elem(struct bpf_map *map, void *key)
return NULL;
}
+static void *htab_lru_percpu_map_lookup_percpu_elem(struct bpf_map *map, void *key, u32 cpu)
+{
+ struct htab_elem *l;
+
+ if (cpu >= nr_cpu_ids)
+ return NULL;
+
+ l = __htab_map_lookup_elem(map, key);
+ if (l) {
+ bpf_lru_node_set_ref(&l->lru_node);
+ return per_cpu_ptr(htab_elem_get_ptr(l, map->key_size), cpu);
+ }
+
+ return NULL;
+}
+
int bpf_percpu_hash_copy(struct bpf_map *map, void *key, void *value)
{
struct htab_elem *l;
@@ -2252,7 +2320,6 @@ static void htab_percpu_map_seq_show_elem(struct bpf_map *map, void *key,
rcu_read_unlock();
}
-static int htab_percpu_map_btf_id;
const struct bpf_map_ops htab_percpu_map_ops = {
.map_meta_equal = bpf_map_meta_equal,
.map_alloc_check = htab_map_alloc_check,
@@ -2263,16 +2330,15 @@ const struct bpf_map_ops htab_percpu_map_ops = {
.map_lookup_and_delete_elem = htab_percpu_map_lookup_and_delete_elem,
.map_update_elem = htab_percpu_map_update_elem,
.map_delete_elem = htab_map_delete_elem,
+ .map_lookup_percpu_elem = htab_percpu_map_lookup_percpu_elem,
.map_seq_show_elem = htab_percpu_map_seq_show_elem,
.map_set_for_each_callback_args = map_set_for_each_callback_args,
.map_for_each_callback = bpf_for_each_hash_elem,
BATCH_OPS(htab_percpu),
- .map_btf_name = "bpf_htab",
- .map_btf_id = &htab_percpu_map_btf_id,
+ .map_btf_id = &htab_map_btf_ids[0],
.iter_seq_info = &iter_seq_info,
};
-static int htab_lru_percpu_map_btf_id;
const struct bpf_map_ops htab_lru_percpu_map_ops = {
.map_meta_equal = bpf_map_meta_equal,
.map_alloc_check = htab_map_alloc_check,
@@ -2283,12 +2349,12 @@ const struct bpf_map_ops htab_lru_percpu_map_ops = {
.map_lookup_and_delete_elem = htab_lru_percpu_map_lookup_and_delete_elem,
.map_update_elem = htab_lru_percpu_map_update_elem,
.map_delete_elem = htab_lru_map_delete_elem,
+ .map_lookup_percpu_elem = htab_lru_percpu_map_lookup_percpu_elem,
.map_seq_show_elem = htab_percpu_map_seq_show_elem,
.map_set_for_each_callback_args = map_set_for_each_callback_args,
.map_for_each_callback = bpf_for_each_hash_elem,
BATCH_OPS(htab_lru_percpu),
- .map_btf_name = "bpf_htab",
- .map_btf_id = &htab_lru_percpu_map_btf_id,
+ .map_btf_id = &htab_map_btf_ids[0],
.iter_seq_info = &iter_seq_info,
};
@@ -2412,7 +2478,6 @@ static void htab_of_map_free(struct bpf_map *map)
fd_htab_map_free(map);
}
-static int htab_of_maps_map_btf_id;
const struct bpf_map_ops htab_of_maps_map_ops = {
.map_alloc_check = fd_htab_map_alloc_check,
.map_alloc = htab_of_map_alloc,
@@ -2425,6 +2490,6 @@ const struct bpf_map_ops htab_of_maps_map_ops = {
.map_fd_sys_lookup_elem = bpf_map_fd_sys_lookup_elem,
.map_gen_lookup = htab_of_map_gen_lookup,
.map_check_btf = map_check_no_btf,
- .map_btf_name = "bpf_htab",
- .map_btf_id = &htab_of_maps_map_btf_id,
+ BATCH_OPS(htab),
+ .map_btf_id = &htab_map_btf_ids[0],
};
diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c
index 315053ef6a75..225806a02efb 100644
--- a/kernel/bpf/helpers.c
+++ b/kernel/bpf/helpers.c
@@ -103,7 +103,7 @@ const struct bpf_func_proto bpf_map_pop_elem_proto = {
.gpl_only = false,
.ret_type = RET_INTEGER,
.arg1_type = ARG_CONST_MAP_PTR,
- .arg2_type = ARG_PTR_TO_UNINIT_MAP_VALUE,
+ .arg2_type = ARG_PTR_TO_MAP_VALUE | MEM_UNINIT,
};
BPF_CALL_2(bpf_map_peek_elem, struct bpf_map *, map, void *, value)
@@ -116,7 +116,23 @@ const struct bpf_func_proto bpf_map_peek_elem_proto = {
.gpl_only = false,
.ret_type = RET_INTEGER,
.arg1_type = ARG_CONST_MAP_PTR,
- .arg2_type = ARG_PTR_TO_UNINIT_MAP_VALUE,
+ .arg2_type = ARG_PTR_TO_MAP_VALUE | MEM_UNINIT,
+};
+
+BPF_CALL_3(bpf_map_lookup_percpu_elem, struct bpf_map *, map, void *, key, u32, cpu)
+{
+ WARN_ON_ONCE(!rcu_read_lock_held() && !rcu_read_lock_bh_held());
+ return (unsigned long) map->ops->map_lookup_percpu_elem(map, key, cpu);
+}
+
+const struct bpf_func_proto bpf_map_lookup_percpu_elem_proto = {
+ .func = bpf_map_lookup_percpu_elem,
+ .gpl_only = false,
+ .pkt_access = true,
+ .ret_type = RET_PTR_TO_MAP_VALUE_OR_NULL,
+ .arg1_type = ARG_CONST_MAP_PTR,
+ .arg2_type = ARG_PTR_TO_MAP_KEY,
+ .arg3_type = ARG_ANYTHING,
};
const struct bpf_func_proto bpf_get_prandom_u32_proto = {
@@ -1374,6 +1390,191 @@ out:
kfree(t);
}
+BPF_CALL_2(bpf_kptr_xchg, void *, map_value, void *, ptr)
+{
+ unsigned long *kptr = map_value;
+
+ return xchg(kptr, (unsigned long)ptr);
+}
+
+/* Unlike other PTR_TO_BTF_ID helpers the btf_id in bpf_kptr_xchg()
+ * helper is determined dynamically by the verifier.
+ */
+#define BPF_PTR_POISON ((void *)((0xeB9FUL << 2) + POISON_POINTER_DELTA))
+
+const struct bpf_func_proto bpf_kptr_xchg_proto = {
+ .func = bpf_kptr_xchg,
+ .gpl_only = false,
+ .ret_type = RET_PTR_TO_BTF_ID_OR_NULL,
+ .ret_btf_id = BPF_PTR_POISON,
+ .arg1_type = ARG_PTR_TO_KPTR,
+ .arg2_type = ARG_PTR_TO_BTF_ID_OR_NULL | OBJ_RELEASE,
+ .arg2_btf_id = BPF_PTR_POISON,
+};
+
+/* Since the upper 8 bits of dynptr->size is reserved, the
+ * maximum supported size is 2^24 - 1.
+ */
+#define DYNPTR_MAX_SIZE ((1UL << 24) - 1)
+#define DYNPTR_TYPE_SHIFT 28
+#define DYNPTR_SIZE_MASK 0xFFFFFF
+#define DYNPTR_RDONLY_BIT BIT(31)
+
+static bool bpf_dynptr_is_rdonly(struct bpf_dynptr_kern *ptr)
+{
+ return ptr->size & DYNPTR_RDONLY_BIT;
+}
+
+static void bpf_dynptr_set_type(struct bpf_dynptr_kern *ptr, enum bpf_dynptr_type type)
+{
+ ptr->size |= type << DYNPTR_TYPE_SHIFT;
+}
+
+static u32 bpf_dynptr_get_size(struct bpf_dynptr_kern *ptr)
+{
+ return ptr->size & DYNPTR_SIZE_MASK;
+}
+
+int bpf_dynptr_check_size(u32 size)
+{
+ return size > DYNPTR_MAX_SIZE ? -E2BIG : 0;
+}
+
+void bpf_dynptr_init(struct bpf_dynptr_kern *ptr, void *data,
+ enum bpf_dynptr_type type, u32 offset, u32 size)
+{
+ ptr->data = data;
+ ptr->offset = offset;
+ ptr->size = size;
+ bpf_dynptr_set_type(ptr, type);
+}
+
+void bpf_dynptr_set_null(struct bpf_dynptr_kern *ptr)
+{
+ memset(ptr, 0, sizeof(*ptr));
+}
+
+static int bpf_dynptr_check_off_len(struct bpf_dynptr_kern *ptr, u32 offset, u32 len)
+{
+ u32 size = bpf_dynptr_get_size(ptr);
+
+ if (len > size || offset > size - len)
+ return -E2BIG;
+
+ return 0;
+}
+
+BPF_CALL_4(bpf_dynptr_from_mem, void *, data, u32, size, u64, flags, struct bpf_dynptr_kern *, ptr)
+{
+ int err;
+
+ err = bpf_dynptr_check_size(size);
+ if (err)
+ goto error;
+
+ /* flags is currently unsupported */
+ if (flags) {
+ err = -EINVAL;
+ goto error;
+ }
+
+ bpf_dynptr_init(ptr, data, BPF_DYNPTR_TYPE_LOCAL, 0, size);
+
+ return 0;
+
+error:
+ bpf_dynptr_set_null(ptr);
+ return err;
+}
+
+const struct bpf_func_proto bpf_dynptr_from_mem_proto = {
+ .func = bpf_dynptr_from_mem,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_UNINIT_MEM,
+ .arg2_type = ARG_CONST_SIZE_OR_ZERO,
+ .arg3_type = ARG_ANYTHING,
+ .arg4_type = ARG_PTR_TO_DYNPTR | DYNPTR_TYPE_LOCAL | MEM_UNINIT,
+};
+
+BPF_CALL_4(bpf_dynptr_read, void *, dst, u32, len, struct bpf_dynptr_kern *, src, u32, offset)
+{
+ int err;
+
+ if (!src->data)
+ return -EINVAL;
+
+ err = bpf_dynptr_check_off_len(src, offset, len);
+ if (err)
+ return err;
+
+ memcpy(dst, src->data + src->offset + offset, len);
+
+ return 0;
+}
+
+const struct bpf_func_proto bpf_dynptr_read_proto = {
+ .func = bpf_dynptr_read,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_UNINIT_MEM,
+ .arg2_type = ARG_CONST_SIZE_OR_ZERO,
+ .arg3_type = ARG_PTR_TO_DYNPTR,
+ .arg4_type = ARG_ANYTHING,
+};
+
+BPF_CALL_4(bpf_dynptr_write, struct bpf_dynptr_kern *, dst, u32, offset, void *, src, u32, len)
+{
+ int err;
+
+ if (!dst->data || bpf_dynptr_is_rdonly(dst))
+ return -EINVAL;
+
+ err = bpf_dynptr_check_off_len(dst, offset, len);
+ if (err)
+ return err;
+
+ memcpy(dst->data + dst->offset + offset, src, len);
+
+ return 0;
+}
+
+const struct bpf_func_proto bpf_dynptr_write_proto = {
+ .func = bpf_dynptr_write,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_DYNPTR,
+ .arg2_type = ARG_ANYTHING,
+ .arg3_type = ARG_PTR_TO_MEM | MEM_RDONLY,
+ .arg4_type = ARG_CONST_SIZE_OR_ZERO,
+};
+
+BPF_CALL_3(bpf_dynptr_data, struct bpf_dynptr_kern *, ptr, u32, offset, u32, len)
+{
+ int err;
+
+ if (!ptr->data)
+ return 0;
+
+ err = bpf_dynptr_check_off_len(ptr, offset, len);
+ if (err)
+ return 0;
+
+ if (bpf_dynptr_is_rdonly(ptr))
+ return 0;
+
+ return (unsigned long)(ptr->data + ptr->offset + offset);
+}
+
+const struct bpf_func_proto bpf_dynptr_data_proto = {
+ .func = bpf_dynptr_data,
+ .gpl_only = false,
+ .ret_type = RET_PTR_TO_DYNPTR_MEM_OR_NULL,
+ .arg1_type = ARG_PTR_TO_DYNPTR,
+ .arg2_type = ARG_ANYTHING,
+ .arg3_type = ARG_CONST_ALLOC_SIZE_OR_ZERO,
+};
+
const struct bpf_func_proto bpf_get_current_task_proto __weak;
const struct bpf_func_proto bpf_get_current_task_btf_proto __weak;
const struct bpf_func_proto bpf_probe_read_user_proto __weak;
@@ -1398,6 +1599,8 @@ bpf_base_func_proto(enum bpf_func_id func_id)
return &bpf_map_pop_elem_proto;
case BPF_FUNC_map_peek_elem:
return &bpf_map_peek_elem_proto;
+ case BPF_FUNC_map_lookup_percpu_elem:
+ return &bpf_map_lookup_percpu_elem_proto;
case BPF_FUNC_get_prandom_u32:
return &bpf_get_prandom_u32_proto;
case BPF_FUNC_get_smp_processor_id:
@@ -1420,12 +1623,26 @@ bpf_base_func_proto(enum bpf_func_id func_id)
return &bpf_ringbuf_discard_proto;
case BPF_FUNC_ringbuf_query:
return &bpf_ringbuf_query_proto;
+ case BPF_FUNC_ringbuf_reserve_dynptr:
+ return &bpf_ringbuf_reserve_dynptr_proto;
+ case BPF_FUNC_ringbuf_submit_dynptr:
+ return &bpf_ringbuf_submit_dynptr_proto;
+ case BPF_FUNC_ringbuf_discard_dynptr:
+ return &bpf_ringbuf_discard_dynptr_proto;
case BPF_FUNC_for_each_map_elem:
return &bpf_for_each_map_elem_proto;
case BPF_FUNC_loop:
return &bpf_loop_proto;
case BPF_FUNC_strncmp:
return &bpf_strncmp_proto;
+ case BPF_FUNC_dynptr_from_mem:
+ return &bpf_dynptr_from_mem_proto;
+ case BPF_FUNC_dynptr_read:
+ return &bpf_dynptr_read_proto;
+ case BPF_FUNC_dynptr_write:
+ return &bpf_dynptr_write_proto;
+ case BPF_FUNC_dynptr_data:
+ return &bpf_dynptr_data_proto;
default:
break;
}
@@ -1452,6 +1669,8 @@ bpf_base_func_proto(enum bpf_func_id func_id)
return &bpf_timer_start_proto;
case BPF_FUNC_timer_cancel:
return &bpf_timer_cancel_proto;
+ case BPF_FUNC_kptr_xchg:
+ return &bpf_kptr_xchg_proto;
default:
break;
}
diff --git a/kernel/bpf/link_iter.c b/kernel/bpf/link_iter.c
new file mode 100644
index 000000000000..fec8005a121c
--- /dev/null
+++ b/kernel/bpf/link_iter.c
@@ -0,0 +1,107 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/* Copyright (c) 2022 Red Hat, Inc. */
+#include <linux/bpf.h>
+#include <linux/fs.h>
+#include <linux/filter.h>
+#include <linux/kernel.h>
+#include <linux/btf_ids.h>
+
+struct bpf_iter_seq_link_info {
+ u32 link_id;
+};
+
+static void *bpf_link_seq_start(struct seq_file *seq, loff_t *pos)
+{
+ struct bpf_iter_seq_link_info *info = seq->private;
+ struct bpf_link *link;
+
+ link = bpf_link_get_curr_or_next(&info->link_id);
+ if (!link)
+ return NULL;
+
+ if (*pos == 0)
+ ++*pos;
+ return link;
+}
+
+static void *bpf_link_seq_next(struct seq_file *seq, void *v, loff_t *pos)
+{
+ struct bpf_iter_seq_link_info *info = seq->private;
+
+ ++*pos;
+ ++info->link_id;
+ bpf_link_put((struct bpf_link *)v);
+ return bpf_link_get_curr_or_next(&info->link_id);
+}
+
+struct bpf_iter__bpf_link {
+ __bpf_md_ptr(struct bpf_iter_meta *, meta);
+ __bpf_md_ptr(struct bpf_link *, link);
+};
+
+DEFINE_BPF_ITER_FUNC(bpf_link, struct bpf_iter_meta *meta, struct bpf_link *link)
+
+static int __bpf_link_seq_show(struct seq_file *seq, void *v, bool in_stop)
+{
+ struct bpf_iter__bpf_link ctx;
+ struct bpf_iter_meta meta;
+ struct bpf_prog *prog;
+ int ret = 0;
+
+ ctx.meta = &meta;
+ ctx.link = v;
+ meta.seq = seq;
+ prog = bpf_iter_get_info(&meta, in_stop);
+ if (prog)
+ ret = bpf_iter_run_prog(prog, &ctx);
+
+ return ret;
+}
+
+static int bpf_link_seq_show(struct seq_file *seq, void *v)
+{
+ return __bpf_link_seq_show(seq, v, false);
+}
+
+static void bpf_link_seq_stop(struct seq_file *seq, void *v)
+{
+ if (!v)
+ (void)__bpf_link_seq_show(seq, v, true);
+ else
+ bpf_link_put((struct bpf_link *)v);
+}
+
+static const struct seq_operations bpf_link_seq_ops = {
+ .start = bpf_link_seq_start,
+ .next = bpf_link_seq_next,
+ .stop = bpf_link_seq_stop,
+ .show = bpf_link_seq_show,
+};
+
+BTF_ID_LIST(btf_bpf_link_id)
+BTF_ID(struct, bpf_link)
+
+static const struct bpf_iter_seq_info bpf_link_seq_info = {
+ .seq_ops = &bpf_link_seq_ops,
+ .init_seq_private = NULL,
+ .fini_seq_private = NULL,
+ .seq_priv_size = sizeof(struct bpf_iter_seq_link_info),
+};
+
+static struct bpf_iter_reg bpf_link_reg_info = {
+ .target = "bpf_link",
+ .ctx_arg_info_size = 1,
+ .ctx_arg_info = {
+ { offsetof(struct bpf_iter__bpf_link, link),
+ PTR_TO_BTF_ID_OR_NULL },
+ },
+ .seq_info = &bpf_link_seq_info,
+};
+
+static int __init bpf_link_iter_init(void)
+{
+ bpf_link_reg_info.ctx_arg_info[0].btf_id = *btf_bpf_link_id;
+ return bpf_iter_reg_target(&bpf_link_reg_info);
+}
+
+late_initcall(bpf_link_iter_init);
diff --git a/kernel/bpf/local_storage.c b/kernel/bpf/local_storage.c
index 497916060ac7..8654fc97f5fe 100644
--- a/kernel/bpf/local_storage.c
+++ b/kernel/bpf/local_storage.c
@@ -9,6 +9,7 @@
#include <linux/rbtree.h>
#include <linux/slab.h>
#include <uapi/linux/btf.h>
+#include <linux/btf_ids.h>
#ifdef CONFIG_CGROUP_BPF
@@ -446,7 +447,8 @@ static void cgroup_storage_seq_show_elem(struct bpf_map *map, void *key,
rcu_read_unlock();
}
-static int cgroup_storage_map_btf_id;
+BTF_ID_LIST_SINGLE(cgroup_storage_map_btf_ids, struct,
+ bpf_cgroup_storage_map)
const struct bpf_map_ops cgroup_storage_map_ops = {
.map_alloc = cgroup_storage_map_alloc,
.map_free = cgroup_storage_map_free,
@@ -456,8 +458,7 @@ const struct bpf_map_ops cgroup_storage_map_ops = {
.map_delete_elem = cgroup_storage_delete_elem,
.map_check_btf = cgroup_storage_check_btf,
.map_seq_show_elem = cgroup_storage_seq_show_elem,
- .map_btf_name = "bpf_cgroup_storage_map",
- .map_btf_id = &cgroup_storage_map_btf_id,
+ .map_btf_id = &cgroup_storage_map_btf_ids[0],
};
int bpf_cgroup_storage_assign(struct bpf_prog_aux *aux, struct bpf_map *_map)
diff --git a/kernel/bpf/lpm_trie.c b/kernel/bpf/lpm_trie.c
index 5763cc7ac4f1..f0d05a3cc4b9 100644
--- a/kernel/bpf/lpm_trie.c
+++ b/kernel/bpf/lpm_trie.c
@@ -14,6 +14,7 @@
#include <linux/vmalloc.h>
#include <net/ipv6.h>
#include <uapi/linux/btf.h>
+#include <linux/btf_ids.h>
/* Intermediate node */
#define LPM_TREE_NODE_FLAG_IM BIT(0)
@@ -719,7 +720,7 @@ static int trie_check_btf(const struct bpf_map *map,
-EINVAL : 0;
}
-static int trie_map_btf_id;
+BTF_ID_LIST_SINGLE(trie_map_btf_ids, struct, lpm_trie)
const struct bpf_map_ops trie_map_ops = {
.map_meta_equal = bpf_map_meta_equal,
.map_alloc = trie_alloc,
@@ -732,6 +733,5 @@ const struct bpf_map_ops trie_map_ops = {
.map_update_batch = generic_map_update_batch,
.map_delete_batch = generic_map_delete_batch,
.map_check_btf = trie_check_btf,
- .map_btf_name = "lpm_trie",
- .map_btf_id = &trie_map_btf_id,
+ .map_btf_id = &trie_map_btf_ids[0],
};
diff --git a/kernel/bpf/map_in_map.c b/kernel/bpf/map_in_map.c
index 5cd8f5277279..135205d0d560 100644
--- a/kernel/bpf/map_in_map.c
+++ b/kernel/bpf/map_in_map.c
@@ -52,6 +52,7 @@ struct bpf_map *bpf_map_meta_alloc(int inner_map_ufd)
inner_map_meta->max_entries = inner_map->max_entries;
inner_map_meta->spin_lock_off = inner_map->spin_lock_off;
inner_map_meta->timer_off = inner_map->timer_off;
+ inner_map_meta->kptr_off_tab = bpf_map_copy_kptr_off_tab(inner_map);
if (inner_map->btf) {
btf_get(inner_map->btf);
inner_map_meta->btf = inner_map->btf;
@@ -71,6 +72,7 @@ struct bpf_map *bpf_map_meta_alloc(int inner_map_ufd)
void bpf_map_meta_free(struct bpf_map *map_meta)
{
+ bpf_map_free_kptr_off_tab(map_meta);
btf_put(map_meta->btf);
kfree(map_meta);
}
@@ -83,7 +85,8 @@ bool bpf_map_meta_equal(const struct bpf_map *meta0,
meta0->key_size == meta1->key_size &&
meta0->value_size == meta1->value_size &&
meta0->timer_off == meta1->timer_off &&
- meta0->map_flags == meta1->map_flags;
+ meta0->map_flags == meta1->map_flags &&
+ bpf_map_equal_kptr_off_tab(meta0, meta1);
}
void *bpf_map_fd_get_ptr(struct bpf_map *map,
diff --git a/kernel/bpf/queue_stack_maps.c b/kernel/bpf/queue_stack_maps.c
index f9c734aaa990..a1c0794ae49d 100644
--- a/kernel/bpf/queue_stack_maps.c
+++ b/kernel/bpf/queue_stack_maps.c
@@ -8,6 +8,7 @@
#include <linux/list.h>
#include <linux/slab.h>
#include <linux/capability.h>
+#include <linux/btf_ids.h>
#include "percpu_freelist.h"
#define QUEUE_STACK_CREATE_FLAG_MASK \
@@ -247,7 +248,7 @@ static int queue_stack_map_get_next_key(struct bpf_map *map, void *key,
return -EINVAL;
}
-static int queue_map_btf_id;
+BTF_ID_LIST_SINGLE(queue_map_btf_ids, struct, bpf_queue_stack)
const struct bpf_map_ops queue_map_ops = {
.map_meta_equal = bpf_map_meta_equal,
.map_alloc_check = queue_stack_map_alloc_check,
@@ -260,11 +261,9 @@ const struct bpf_map_ops queue_map_ops = {
.map_pop_elem = queue_map_pop_elem,
.map_peek_elem = queue_map_peek_elem,
.map_get_next_key = queue_stack_map_get_next_key,
- .map_btf_name = "bpf_queue_stack",
- .map_btf_id = &queue_map_btf_id,
+ .map_btf_id = &queue_map_btf_ids[0],
};
-static int stack_map_btf_id;
const struct bpf_map_ops stack_map_ops = {
.map_meta_equal = bpf_map_meta_equal,
.map_alloc_check = queue_stack_map_alloc_check,
@@ -277,6 +276,5 @@ const struct bpf_map_ops stack_map_ops = {
.map_pop_elem = stack_map_pop_elem,
.map_peek_elem = stack_map_peek_elem,
.map_get_next_key = queue_stack_map_get_next_key,
- .map_btf_name = "bpf_queue_stack",
- .map_btf_id = &stack_map_btf_id,
+ .map_btf_id = &queue_map_btf_ids[0],
};
diff --git a/kernel/bpf/reuseport_array.c b/kernel/bpf/reuseport_array.c
index 8251243022a2..e2618fb5870e 100644
--- a/kernel/bpf/reuseport_array.c
+++ b/kernel/bpf/reuseport_array.c
@@ -6,6 +6,7 @@
#include <linux/err.h>
#include <linux/sock_diag.h>
#include <net/sock_reuseport.h>
+#include <linux/btf_ids.h>
struct reuseport_array {
struct bpf_map map;
@@ -337,7 +338,7 @@ static int reuseport_array_get_next_key(struct bpf_map *map, void *key,
return 0;
}
-static int reuseport_array_map_btf_id;
+BTF_ID_LIST_SINGLE(reuseport_array_map_btf_ids, struct, reuseport_array)
const struct bpf_map_ops reuseport_array_ops = {
.map_meta_equal = bpf_map_meta_equal,
.map_alloc_check = reuseport_array_alloc_check,
@@ -346,6 +347,5 @@ const struct bpf_map_ops reuseport_array_ops = {
.map_lookup_elem = reuseport_array_lookup_elem,
.map_get_next_key = reuseport_array_get_next_key,
.map_delete_elem = reuseport_array_delete_elem,
- .map_btf_name = "reuseport_array",
- .map_btf_id = &reuseport_array_map_btf_id,
+ .map_btf_id = &reuseport_array_map_btf_ids[0],
};
diff --git a/kernel/bpf/ringbuf.c b/kernel/bpf/ringbuf.c
index 710ba9de12ce..ded4faeca192 100644
--- a/kernel/bpf/ringbuf.c
+++ b/kernel/bpf/ringbuf.c
@@ -10,6 +10,7 @@
#include <linux/poll.h>
#include <linux/kmemleak.h>
#include <uapi/linux/btf.h>
+#include <linux/btf_ids.h>
#define RINGBUF_CREATE_FLAG_MASK (BPF_F_NUMA_NODE)
@@ -263,7 +264,7 @@ static __poll_t ringbuf_map_poll(struct bpf_map *map, struct file *filp,
return 0;
}
-static int ringbuf_map_btf_id;
+BTF_ID_LIST_SINGLE(ringbuf_map_btf_ids, struct, bpf_ringbuf_map)
const struct bpf_map_ops ringbuf_map_ops = {
.map_meta_equal = bpf_map_meta_equal,
.map_alloc = ringbuf_map_alloc,
@@ -274,8 +275,7 @@ const struct bpf_map_ops ringbuf_map_ops = {
.map_update_elem = ringbuf_map_update_elem,
.map_delete_elem = ringbuf_map_delete_elem,
.map_get_next_key = ringbuf_map_get_next_key,
- .map_btf_name = "bpf_ringbuf_map",
- .map_btf_id = &ringbuf_map_btf_id,
+ .map_btf_id = &ringbuf_map_btf_ids[0],
};
/* Given pointer to ring buffer record metadata and struct bpf_ringbuf itself,
@@ -404,7 +404,7 @@ BPF_CALL_2(bpf_ringbuf_submit, void *, sample, u64, flags)
const struct bpf_func_proto bpf_ringbuf_submit_proto = {
.func = bpf_ringbuf_submit,
.ret_type = RET_VOID,
- .arg1_type = ARG_PTR_TO_ALLOC_MEM,
+ .arg1_type = ARG_PTR_TO_ALLOC_MEM | OBJ_RELEASE,
.arg2_type = ARG_ANYTHING,
};
@@ -417,7 +417,7 @@ BPF_CALL_2(bpf_ringbuf_discard, void *, sample, u64, flags)
const struct bpf_func_proto bpf_ringbuf_discard_proto = {
.func = bpf_ringbuf_discard,
.ret_type = RET_VOID,
- .arg1_type = ARG_PTR_TO_ALLOC_MEM,
+ .arg1_type = ARG_PTR_TO_ALLOC_MEM | OBJ_RELEASE,
.arg2_type = ARG_ANYTHING,
};
@@ -475,3 +475,81 @@ const struct bpf_func_proto bpf_ringbuf_query_proto = {
.arg1_type = ARG_CONST_MAP_PTR,
.arg2_type = ARG_ANYTHING,
};
+
+BPF_CALL_4(bpf_ringbuf_reserve_dynptr, struct bpf_map *, map, u32, size, u64, flags,
+ struct bpf_dynptr_kern *, ptr)
+{
+ struct bpf_ringbuf_map *rb_map;
+ void *sample;
+ int err;
+
+ if (unlikely(flags)) {
+ bpf_dynptr_set_null(ptr);
+ return -EINVAL;
+ }
+
+ err = bpf_dynptr_check_size(size);
+ if (err) {
+ bpf_dynptr_set_null(ptr);
+ return err;
+ }
+
+ rb_map = container_of(map, struct bpf_ringbuf_map, map);
+
+ sample = __bpf_ringbuf_reserve(rb_map->rb, size);
+ if (!sample) {
+ bpf_dynptr_set_null(ptr);
+ return -EINVAL;
+ }
+
+ bpf_dynptr_init(ptr, sample, BPF_DYNPTR_TYPE_RINGBUF, 0, size);
+
+ return 0;
+}
+
+const struct bpf_func_proto bpf_ringbuf_reserve_dynptr_proto = {
+ .func = bpf_ringbuf_reserve_dynptr,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_CONST_MAP_PTR,
+ .arg2_type = ARG_ANYTHING,
+ .arg3_type = ARG_ANYTHING,
+ .arg4_type = ARG_PTR_TO_DYNPTR | DYNPTR_TYPE_RINGBUF | MEM_UNINIT,
+};
+
+BPF_CALL_2(bpf_ringbuf_submit_dynptr, struct bpf_dynptr_kern *, ptr, u64, flags)
+{
+ if (!ptr->data)
+ return 0;
+
+ bpf_ringbuf_commit(ptr->data, flags, false /* discard */);
+
+ bpf_dynptr_set_null(ptr);
+
+ return 0;
+}
+
+const struct bpf_func_proto bpf_ringbuf_submit_dynptr_proto = {
+ .func = bpf_ringbuf_submit_dynptr,
+ .ret_type = RET_VOID,
+ .arg1_type = ARG_PTR_TO_DYNPTR | DYNPTR_TYPE_RINGBUF | OBJ_RELEASE,
+ .arg2_type = ARG_ANYTHING,
+};
+
+BPF_CALL_2(bpf_ringbuf_discard_dynptr, struct bpf_dynptr_kern *, ptr, u64, flags)
+{
+ if (!ptr->data)
+ return 0;
+
+ bpf_ringbuf_commit(ptr->data, flags, true /* discard */);
+
+ bpf_dynptr_set_null(ptr);
+
+ return 0;
+}
+
+const struct bpf_func_proto bpf_ringbuf_discard_dynptr_proto = {
+ .func = bpf_ringbuf_discard_dynptr,
+ .ret_type = RET_VOID,
+ .arg1_type = ARG_PTR_TO_DYNPTR | DYNPTR_TYPE_RINGBUF | OBJ_RELEASE,
+ .arg2_type = ARG_ANYTHING,
+};
diff --git a/kernel/bpf/stackmap.c b/kernel/bpf/stackmap.c
index 34725bfa1e97..1adbe67cdb95 100644
--- a/kernel/bpf/stackmap.c
+++ b/kernel/bpf/stackmap.c
@@ -100,13 +100,11 @@ static struct bpf_map *stack_map_alloc(union bpf_attr *attr)
return ERR_PTR(-E2BIG);
cost = n_buckets * sizeof(struct stack_map_bucket *) + sizeof(*smap);
- cost += n_buckets * (value_size + sizeof(struct stack_map_bucket));
smap = bpf_map_area_alloc(cost, bpf_map_attr_numa_node(attr));
if (!smap)
return ERR_PTR(-ENOMEM);
bpf_map_init_from_attr(&smap->map, attr);
- smap->map.value_size = value_size;
smap->n_buckets = n_buckets;
err = get_callchain_buffers(sysctl_perf_event_max_stack);
@@ -656,7 +654,7 @@ static void stack_map_free(struct bpf_map *map)
put_callchain_buffers();
}
-static int stack_trace_map_btf_id;
+BTF_ID_LIST_SINGLE(stack_trace_map_btf_ids, struct, bpf_stack_map)
const struct bpf_map_ops stack_trace_map_ops = {
.map_meta_equal = bpf_map_meta_equal,
.map_alloc = stack_map_alloc,
@@ -666,6 +664,5 @@ const struct bpf_map_ops stack_trace_map_ops = {
.map_update_elem = stack_map_update_elem,
.map_delete_elem = stack_map_delete_elem,
.map_check_btf = map_check_no_btf,
- .map_btf_name = "bpf_stack_map",
- .map_btf_id = &stack_trace_map_btf_id,
+ .map_btf_id = &stack_trace_map_btf_ids[0],
};
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index cdaa1152436a..2b69306d3c6e 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -6,6 +6,7 @@
#include <linux/bpf_trace.h>
#include <linux/bpf_lirc.h>
#include <linux/bpf_verifier.h>
+#include <linux/bsearch.h>
#include <linux/btf.h>
#include <linux/syscalls.h>
#include <linux/slab.h>
@@ -29,6 +30,7 @@
#include <linux/pgtable.h>
#include <linux/bpf_lsm.h>
#include <linux/poll.h>
+#include <linux/sort.h>
#include <linux/bpf-netns.h>
#include <linux/rcupdate_trace.h>
#include <linux/memcontrol.h>
@@ -473,14 +475,128 @@ static void bpf_map_release_memcg(struct bpf_map *map)
}
#endif
+static int bpf_map_kptr_off_cmp(const void *a, const void *b)
+{
+ const struct bpf_map_value_off_desc *off_desc1 = a, *off_desc2 = b;
+
+ if (off_desc1->offset < off_desc2->offset)
+ return -1;
+ else if (off_desc1->offset > off_desc2->offset)
+ return 1;
+ return 0;
+}
+
+struct bpf_map_value_off_desc *bpf_map_kptr_off_contains(struct bpf_map *map, u32 offset)
+{
+ /* Since members are iterated in btf_find_field in increasing order,
+ * offsets appended to kptr_off_tab are in increasing order, so we can
+ * do bsearch to find exact match.
+ */
+ struct bpf_map_value_off *tab;
+
+ if (!map_value_has_kptrs(map))
+ return NULL;
+ tab = map->kptr_off_tab;
+ return bsearch(&offset, tab->off, tab->nr_off, sizeof(tab->off[0]), bpf_map_kptr_off_cmp);
+}
+
+void bpf_map_free_kptr_off_tab(struct bpf_map *map)
+{
+ struct bpf_map_value_off *tab = map->kptr_off_tab;
+ int i;
+
+ if (!map_value_has_kptrs(map))
+ return;
+ for (i = 0; i < tab->nr_off; i++) {
+ if (tab->off[i].kptr.module)
+ module_put(tab->off[i].kptr.module);
+ btf_put(tab->off[i].kptr.btf);
+ }
+ kfree(tab);
+ map->kptr_off_tab = NULL;
+}
+
+struct bpf_map_value_off *bpf_map_copy_kptr_off_tab(const struct bpf_map *map)
+{
+ struct bpf_map_value_off *tab = map->kptr_off_tab, *new_tab;
+ int size, i;
+
+ if (!map_value_has_kptrs(map))
+ return ERR_PTR(-ENOENT);
+ size = offsetof(struct bpf_map_value_off, off[tab->nr_off]);
+ new_tab = kmemdup(tab, size, GFP_KERNEL | __GFP_NOWARN);
+ if (!new_tab)
+ return ERR_PTR(-ENOMEM);
+ /* Do a deep copy of the kptr_off_tab */
+ for (i = 0; i < tab->nr_off; i++) {
+ btf_get(tab->off[i].kptr.btf);
+ if (tab->off[i].kptr.module && !try_module_get(tab->off[i].kptr.module)) {
+ while (i--) {
+ if (tab->off[i].kptr.module)
+ module_put(tab->off[i].kptr.module);
+ btf_put(tab->off[i].kptr.btf);
+ }
+ kfree(new_tab);
+ return ERR_PTR(-ENXIO);
+ }
+ }
+ return new_tab;
+}
+
+bool bpf_map_equal_kptr_off_tab(const struct bpf_map *map_a, const struct bpf_map *map_b)
+{
+ struct bpf_map_value_off *tab_a = map_a->kptr_off_tab, *tab_b = map_b->kptr_off_tab;
+ bool a_has_kptr = map_value_has_kptrs(map_a), b_has_kptr = map_value_has_kptrs(map_b);
+ int size;
+
+ if (!a_has_kptr && !b_has_kptr)
+ return true;
+ if (a_has_kptr != b_has_kptr)
+ return false;
+ if (tab_a->nr_off != tab_b->nr_off)
+ return false;
+ size = offsetof(struct bpf_map_value_off, off[tab_a->nr_off]);
+ return !memcmp(tab_a, tab_b, size);
+}
+
+/* Caller must ensure map_value_has_kptrs is true. Note that this function can
+ * be called on a map value while the map_value is visible to BPF programs, as
+ * it ensures the correct synchronization, and we already enforce the same using
+ * the bpf_kptr_xchg helper on the BPF program side for referenced kptrs.
+ */
+void bpf_map_free_kptrs(struct bpf_map *map, void *map_value)
+{
+ struct bpf_map_value_off *tab = map->kptr_off_tab;
+ unsigned long *btf_id_ptr;
+ int i;
+
+ for (i = 0; i < tab->nr_off; i++) {
+ struct bpf_map_value_off_desc *off_desc = &tab->off[i];
+ unsigned long old_ptr;
+
+ btf_id_ptr = map_value + off_desc->offset;
+ if (off_desc->type == BPF_KPTR_UNREF) {
+ u64 *p = (u64 *)btf_id_ptr;
+
+ WRITE_ONCE(p, 0);
+ continue;
+ }
+ old_ptr = xchg(btf_id_ptr, 0);
+ off_desc->kptr.dtor((void *)old_ptr);
+ }
+}
+
/* called from workqueue */
static void bpf_map_free_deferred(struct work_struct *work)
{
struct bpf_map *map = container_of(work, struct bpf_map, work);
security_bpf_map_free(map);
+ kfree(map->off_arr);
bpf_map_release_memcg(map);
- /* implementation dependent freeing */
+ /* implementation dependent freeing, map_free callback also does
+ * bpf_map_free_kptr_off_tab, if needed.
+ */
map->ops->map_free(map);
}
@@ -640,7 +756,7 @@ static int bpf_map_mmap(struct file *filp, struct vm_area_struct *vma)
int err;
if (!map->ops->map_mmap || map_value_has_spin_lock(map) ||
- map_value_has_timer(map))
+ map_value_has_timer(map) || map_value_has_kptrs(map))
return -ENOTSUPP;
if (!(vma->vm_flags & VM_SHARED))
@@ -767,6 +883,84 @@ int map_check_no_btf(const struct bpf_map *map,
return -ENOTSUPP;
}
+static int map_off_arr_cmp(const void *_a, const void *_b, const void *priv)
+{
+ const u32 a = *(const u32 *)_a;
+ const u32 b = *(const u32 *)_b;
+
+ if (a < b)
+ return -1;
+ else if (a > b)
+ return 1;
+ return 0;
+}
+
+static void map_off_arr_swap(void *_a, void *_b, int size, const void *priv)
+{
+ struct bpf_map *map = (struct bpf_map *)priv;
+ u32 *off_base = map->off_arr->field_off;
+ u32 *a = _a, *b = _b;
+ u8 *sz_a, *sz_b;
+
+ sz_a = map->off_arr->field_sz + (a - off_base);
+ sz_b = map->off_arr->field_sz + (b - off_base);
+
+ swap(*a, *b);
+ swap(*sz_a, *sz_b);
+}
+
+static int bpf_map_alloc_off_arr(struct bpf_map *map)
+{
+ bool has_spin_lock = map_value_has_spin_lock(map);
+ bool has_timer = map_value_has_timer(map);
+ bool has_kptrs = map_value_has_kptrs(map);
+ struct bpf_map_off_arr *off_arr;
+ u32 i;
+
+ if (!has_spin_lock && !has_timer && !has_kptrs) {
+ map->off_arr = NULL;
+ return 0;
+ }
+
+ off_arr = kmalloc(sizeof(*map->off_arr), GFP_KERNEL | __GFP_NOWARN);
+ if (!off_arr)
+ return -ENOMEM;
+ map->off_arr = off_arr;
+
+ off_arr->cnt = 0;
+ if (has_spin_lock) {
+ i = off_arr->cnt;
+
+ off_arr->field_off[i] = map->spin_lock_off;
+ off_arr->field_sz[i] = sizeof(struct bpf_spin_lock);
+ off_arr->cnt++;
+ }
+ if (has_timer) {
+ i = off_arr->cnt;
+
+ off_arr->field_off[i] = map->timer_off;
+ off_arr->field_sz[i] = sizeof(struct bpf_timer);
+ off_arr->cnt++;
+ }
+ if (has_kptrs) {
+ struct bpf_map_value_off *tab = map->kptr_off_tab;
+ u32 *off = &off_arr->field_off[off_arr->cnt];
+ u8 *sz = &off_arr->field_sz[off_arr->cnt];
+
+ for (i = 0; i < tab->nr_off; i++) {
+ *off++ = tab->off[i].offset;
+ *sz++ = sizeof(u64);
+ }
+ off_arr->cnt += tab->nr_off;
+ }
+
+ if (off_arr->cnt == 1)
+ return 0;
+ sort_r(off_arr->field_off, off_arr->cnt, sizeof(off_arr->field_off[0]),
+ map_off_arr_cmp, map_off_arr_swap, map);
+ return 0;
+}
+
static int map_check_btf(struct bpf_map *map, const struct btf *btf,
u32 btf_key_id, u32 btf_value_id)
{
@@ -820,10 +1014,34 @@ static int map_check_btf(struct bpf_map *map, const struct btf *btf,
return -EOPNOTSUPP;
}
- if (map->ops->map_check_btf)
+ map->kptr_off_tab = btf_parse_kptrs(btf, value_type);
+ if (map_value_has_kptrs(map)) {
+ if (!bpf_capable()) {
+ ret = -EPERM;
+ goto free_map_tab;
+ }
+ if (map->map_flags & (BPF_F_RDONLY_PROG | BPF_F_WRONLY_PROG)) {
+ ret = -EACCES;
+ goto free_map_tab;
+ }
+ if (map->map_type != BPF_MAP_TYPE_HASH &&
+ map->map_type != BPF_MAP_TYPE_LRU_HASH &&
+ map->map_type != BPF_MAP_TYPE_ARRAY) {
+ ret = -EOPNOTSUPP;
+ goto free_map_tab;
+ }
+ }
+
+ if (map->ops->map_check_btf) {
ret = map->ops->map_check_btf(map, btf, key_type, value_type);
+ if (ret < 0)
+ goto free_map_tab;
+ }
return ret;
+free_map_tab:
+ bpf_map_free_kptr_off_tab(map);
+ return ret;
}
#define BPF_MAP_CREATE_LAST_FIELD map_extra
@@ -912,10 +1130,14 @@ static int map_create(union bpf_attr *attr)
attr->btf_vmlinux_value_type_id;
}
- err = security_bpf_map_alloc(map);
+ err = bpf_map_alloc_off_arr(map);
if (err)
goto free_map;
+ err = security_bpf_map_alloc(map);
+ if (err)
+ goto free_map_off_arr;
+
err = bpf_map_alloc_id(map);
if (err)
goto free_map_sec;
@@ -938,6 +1160,8 @@ static int map_create(union bpf_attr *attr)
free_map_sec:
security_bpf_map_free(map);
+free_map_off_arr:
+ kfree(map->off_arr);
free_map:
btf_put(map->btf);
map->ops->map_free(map);
@@ -1639,7 +1863,7 @@ static int map_freeze(const union bpf_attr *attr)
return PTR_ERR(map);
if (map->map_type == BPF_MAP_TYPE_STRUCT_OPS ||
- map_value_has_timer(map)) {
+ map_value_has_timer(map) || map_value_has_kptrs(map)) {
fdput(f);
return -ENOTSUPP;
}
@@ -2640,19 +2864,12 @@ struct bpf_link *bpf_link_get_from_fd(u32 ufd)
}
EXPORT_SYMBOL(bpf_link_get_from_fd);
-struct bpf_tracing_link {
- struct bpf_link link;
- enum bpf_attach_type attach_type;
- struct bpf_trampoline *trampoline;
- struct bpf_prog *tgt_prog;
-};
-
static void bpf_tracing_link_release(struct bpf_link *link)
{
struct bpf_tracing_link *tr_link =
- container_of(link, struct bpf_tracing_link, link);
+ container_of(link, struct bpf_tracing_link, link.link);
- WARN_ON_ONCE(bpf_trampoline_unlink_prog(link->prog,
+ WARN_ON_ONCE(bpf_trampoline_unlink_prog(&tr_link->link,
tr_link->trampoline));
bpf_trampoline_put(tr_link->trampoline);
@@ -2665,7 +2882,7 @@ static void bpf_tracing_link_release(struct bpf_link *link)
static void bpf_tracing_link_dealloc(struct bpf_link *link)
{
struct bpf_tracing_link *tr_link =
- container_of(link, struct bpf_tracing_link, link);
+ container_of(link, struct bpf_tracing_link, link.link);
kfree(tr_link);
}
@@ -2674,7 +2891,7 @@ static void bpf_tracing_link_show_fdinfo(const struct bpf_link *link,
struct seq_file *seq)
{
struct bpf_tracing_link *tr_link =
- container_of(link, struct bpf_tracing_link, link);
+ container_of(link, struct bpf_tracing_link, link.link);
seq_printf(seq,
"attach_type:\t%d\n",
@@ -2685,7 +2902,7 @@ static int bpf_tracing_link_fill_link_info(const struct bpf_link *link,
struct bpf_link_info *info)
{
struct bpf_tracing_link *tr_link =
- container_of(link, struct bpf_tracing_link, link);
+ container_of(link, struct bpf_tracing_link, link.link);
info->tracing.attach_type = tr_link->attach_type;
bpf_trampoline_unpack_key(tr_link->trampoline->key,
@@ -2704,7 +2921,8 @@ static const struct bpf_link_ops bpf_tracing_link_lops = {
static int bpf_tracing_prog_attach(struct bpf_prog *prog,
int tgt_prog_fd,
- u32 btf_id)
+ u32 btf_id,
+ u64 bpf_cookie)
{
struct bpf_link_primer link_primer;
struct bpf_prog *tgt_prog = NULL;
@@ -2766,9 +2984,10 @@ static int bpf_tracing_prog_attach(struct bpf_prog *prog,
err = -ENOMEM;
goto out_put_prog;
}
- bpf_link_init(&link->link, BPF_LINK_TYPE_TRACING,
+ bpf_link_init(&link->link.link, BPF_LINK_TYPE_TRACING,
&bpf_tracing_link_lops, prog);
link->attach_type = prog->expected_attach_type;
+ link->link.cookie = bpf_cookie;
mutex_lock(&prog->aux->dst_mutex);
@@ -2836,11 +3055,11 @@ static int bpf_tracing_prog_attach(struct bpf_prog *prog,
tgt_prog = prog->aux->dst_prog;
}
- err = bpf_link_prime(&link->link, &link_primer);
+ err = bpf_link_prime(&link->link.link, &link_primer);
if (err)
goto out_unlock;
- err = bpf_trampoline_link_prog(prog, tr);
+ err = bpf_trampoline_link_prog(&link->link, tr);
if (err) {
bpf_link_cleanup(&link_primer);
link = NULL;
@@ -3030,66 +3249,45 @@ static int bpf_perf_link_attach(const union bpf_attr *attr, struct bpf_prog *pro
}
#endif /* CONFIG_PERF_EVENTS */
-#define BPF_RAW_TRACEPOINT_OPEN_LAST_FIELD raw_tracepoint.prog_fd
-
-static int bpf_raw_tracepoint_open(const union bpf_attr *attr)
+static int bpf_raw_tp_link_attach(struct bpf_prog *prog,
+ const char __user *user_tp_name)
{
struct bpf_link_primer link_primer;
struct bpf_raw_tp_link *link;
struct bpf_raw_event_map *btp;
- struct bpf_prog *prog;
const char *tp_name;
char buf[128];
int err;
- if (CHECK_ATTR(BPF_RAW_TRACEPOINT_OPEN))
- return -EINVAL;
-
- prog = bpf_prog_get(attr->raw_tracepoint.prog_fd);
- if (IS_ERR(prog))
- return PTR_ERR(prog);
-
switch (prog->type) {
case BPF_PROG_TYPE_TRACING:
case BPF_PROG_TYPE_EXT:
case BPF_PROG_TYPE_LSM:
- if (attr->raw_tracepoint.name) {
+ if (user_tp_name)
/* The attach point for this category of programs
* should be specified via btf_id during program load.
*/
- err = -EINVAL;
- goto out_put_prog;
- }
+ return -EINVAL;
if (prog->type == BPF_PROG_TYPE_TRACING &&
prog->expected_attach_type == BPF_TRACE_RAW_TP) {
tp_name = prog->aux->attach_func_name;
break;
}
- err = bpf_tracing_prog_attach(prog, 0, 0);
- if (err >= 0)
- return err;
- goto out_put_prog;
+ return bpf_tracing_prog_attach(prog, 0, 0, 0);
case BPF_PROG_TYPE_RAW_TRACEPOINT:
case BPF_PROG_TYPE_RAW_TRACEPOINT_WRITABLE:
- if (strncpy_from_user(buf,
- u64_to_user_ptr(attr->raw_tracepoint.name),
- sizeof(buf) - 1) < 0) {
- err = -EFAULT;
- goto out_put_prog;
- }
+ if (strncpy_from_user(buf, user_tp_name, sizeof(buf) - 1) < 0)
+ return -EFAULT;
buf[sizeof(buf) - 1] = 0;
tp_name = buf;
break;
default:
- err = -EINVAL;
- goto out_put_prog;
+ return -EINVAL;
}
btp = bpf_get_raw_tracepoint(tp_name);
- if (!btp) {
- err = -ENOENT;
- goto out_put_prog;
- }
+ if (!btp)
+ return -ENOENT;
link = kzalloc(sizeof(*link), GFP_USER);
if (!link) {
@@ -3116,11 +3314,29 @@ static int bpf_raw_tracepoint_open(const union bpf_attr *attr)
out_put_btp:
bpf_put_raw_tracepoint(btp);
-out_put_prog:
- bpf_prog_put(prog);
return err;
}
+#define BPF_RAW_TRACEPOINT_OPEN_LAST_FIELD raw_tracepoint.prog_fd
+
+static int bpf_raw_tracepoint_open(const union bpf_attr *attr)
+{
+ struct bpf_prog *prog;
+ int fd;
+
+ if (CHECK_ATTR(BPF_RAW_TRACEPOINT_OPEN))
+ return -EINVAL;
+
+ prog = bpf_prog_get(attr->raw_tracepoint.prog_fd);
+ if (IS_ERR(prog))
+ return PTR_ERR(prog);
+
+ fd = bpf_raw_tp_link_attach(prog, u64_to_user_ptr(attr->raw_tracepoint.name));
+ if (fd < 0)
+ bpf_prog_put(prog);
+ return fd;
+}
+
static int bpf_prog_attach_check_attach_type(const struct bpf_prog *prog,
enum bpf_attach_type attach_type)
{
@@ -3189,7 +3405,13 @@ attach_type_to_prog_type(enum bpf_attach_type attach_type)
case BPF_CGROUP_SETSOCKOPT:
return BPF_PROG_TYPE_CGROUP_SOCKOPT;
case BPF_TRACE_ITER:
+ case BPF_TRACE_RAW_TP:
+ case BPF_TRACE_FENTRY:
+ case BPF_TRACE_FEXIT:
+ case BPF_MODIFY_RETURN:
return BPF_PROG_TYPE_TRACING;
+ case BPF_LSM_MAC:
+ return BPF_PROG_TYPE_LSM;
case BPF_SK_LOOKUP:
return BPF_PROG_TYPE_SK_LOOKUP;
case BPF_XDP:
@@ -4246,21 +4468,6 @@ err_put:
return err;
}
-static int tracing_bpf_link_attach(const union bpf_attr *attr, bpfptr_t uattr,
- struct bpf_prog *prog)
-{
- if (attr->link_create.attach_type != prog->expected_attach_type)
- return -EINVAL;
-
- if (prog->expected_attach_type == BPF_TRACE_ITER)
- return bpf_iter_link_attach(attr, uattr, prog);
- else if (prog->type == BPF_PROG_TYPE_EXT)
- return bpf_tracing_prog_attach(prog,
- attr->link_create.target_fd,
- attr->link_create.target_btf_id);
- return -EINVAL;
-}
-
#define BPF_LINK_CREATE_LAST_FIELD link_create.kprobe_multi.cookies
static int link_create(union bpf_attr *attr, bpfptr_t uattr)
{
@@ -4282,15 +4489,13 @@ static int link_create(union bpf_attr *attr, bpfptr_t uattr)
switch (prog->type) {
case BPF_PROG_TYPE_EXT:
- ret = tracing_bpf_link_attach(attr, uattr, prog);
- goto out;
+ break;
case BPF_PROG_TYPE_PERF_EVENT:
case BPF_PROG_TYPE_TRACEPOINT:
if (attr->link_create.attach_type != BPF_PERF_EVENT) {
ret = -EINVAL;
goto out;
}
- ptype = prog->type;
break;
case BPF_PROG_TYPE_KPROBE:
if (attr->link_create.attach_type != BPF_PERF_EVENT &&
@@ -4298,7 +4503,6 @@ static int link_create(union bpf_attr *attr, bpfptr_t uattr)
ret = -EINVAL;
goto out;
}
- ptype = prog->type;
break;
default:
ptype = attach_type_to_prog_type(attr->link_create.attach_type);
@@ -4309,7 +4513,7 @@ static int link_create(union bpf_attr *attr, bpfptr_t uattr)
break;
}
- switch (ptype) {
+ switch (prog->type) {
case BPF_PROG_TYPE_CGROUP_SKB:
case BPF_PROG_TYPE_CGROUP_SOCK:
case BPF_PROG_TYPE_CGROUP_SOCK_ADDR:
@@ -4319,8 +4523,27 @@ static int link_create(union bpf_attr *attr, bpfptr_t uattr)
case BPF_PROG_TYPE_CGROUP_SOCKOPT:
ret = cgroup_bpf_link_attach(attr, prog);
break;
+ case BPF_PROG_TYPE_EXT:
+ ret = bpf_tracing_prog_attach(prog,
+ attr->link_create.target_fd,
+ attr->link_create.target_btf_id,
+ attr->link_create.tracing.cookie);
+ break;
+ case BPF_PROG_TYPE_LSM:
case BPF_PROG_TYPE_TRACING:
- ret = tracing_bpf_link_attach(attr, uattr, prog);
+ if (attr->link_create.attach_type != prog->expected_attach_type) {
+ ret = -EINVAL;
+ goto out;
+ }
+ if (prog->expected_attach_type == BPF_TRACE_RAW_TP)
+ ret = bpf_raw_tp_link_attach(prog, NULL);
+ else if (prog->expected_attach_type == BPF_TRACE_ITER)
+ ret = bpf_iter_link_attach(attr, uattr, prog);
+ else
+ ret = bpf_tracing_prog_attach(prog,
+ attr->link_create.target_fd,
+ attr->link_create.target_btf_id,
+ attr->link_create.tracing.cookie);
break;
case BPF_PROG_TYPE_FLOW_DISSECTOR:
case BPF_PROG_TYPE_SK_LOOKUP:
@@ -4454,6 +4677,25 @@ struct bpf_link *bpf_link_by_id(u32 id)
return link;
}
+struct bpf_link *bpf_link_get_curr_or_next(u32 *id)
+{
+ struct bpf_link *link;
+
+ spin_lock_bh(&link_idr_lock);
+again:
+ link = idr_get_next(&link_idr, id);
+ if (link) {
+ link = bpf_link_inc_not_zero(link);
+ if (IS_ERR(link)) {
+ (*id)++;
+ goto again;
+ }
+ }
+ spin_unlock_bh(&link_idr_lock);
+
+ return link;
+}
+
#define BPF_LINK_GET_FD_BY_ID_LAST_FIELD link_id
static int bpf_link_get_fd_by_id(const union bpf_attr *attr)
@@ -4621,9 +4863,21 @@ out_prog_put:
static int __sys_bpf(int cmd, bpfptr_t uattr, unsigned int size)
{
union bpf_attr attr;
+ bool capable;
int err;
- if (sysctl_unprivileged_bpf_disabled && !bpf_capable())
+ capable = bpf_capable() || !sysctl_unprivileged_bpf_disabled;
+
+ /* Intent here is for unprivileged_bpf_disabled to block key object
+ * creation commands for unprivileged users; other actions depend
+ * of fd availability and access to bpffs, so are dependent on
+ * object creation success. Capabilities are later verified for
+ * operations such as load and map create, so even with unprivileged
+ * BPF disabled, capability checks are still carried out for these
+ * and other operations.
+ */
+ if (!capable &&
+ (cmd == BPF_MAP_CREATE || cmd == BPF_PROG_LOAD))
return -EPERM;
err = bpf_check_uarg_tail_zero(uattr, sizeof(attr), size);
@@ -4782,6 +5036,7 @@ static bool syscall_prog_is_valid_access(int off, int size,
BPF_CALL_3(bpf_sys_bpf, int, cmd, union bpf_attr *, attr, u32, attr_size)
{
struct bpf_prog * __maybe_unused prog;
+ struct bpf_tramp_run_ctx __maybe_unused run_ctx;
switch (cmd) {
case BPF_MAP_CREATE:
@@ -4809,13 +5064,15 @@ BPF_CALL_3(bpf_sys_bpf, int, cmd, union bpf_attr *, attr, u32, attr_size)
return -EINVAL;
}
- if (!__bpf_prog_enter_sleepable(prog)) {
+ run_ctx.bpf_cookie = 0;
+ run_ctx.saved_run_ctx = NULL;
+ if (!__bpf_prog_enter_sleepable(prog, &run_ctx)) {
/* recursion detected */
bpf_prog_put(prog);
return -EBUSY;
}
attr->test.retval = bpf_prog_run(prog, (void *) (long) attr->test.ctx_in);
- __bpf_prog_exit_sleepable(prog, 0 /* bpf_prog_run does runtime stats */);
+ __bpf_prog_exit_sleepable(prog, 0 /* bpf_prog_run does runtime stats */, &run_ctx);
bpf_prog_put(prog);
return 0;
#endif
@@ -4908,3 +5165,90 @@ const struct bpf_verifier_ops bpf_syscall_verifier_ops = {
const struct bpf_prog_ops bpf_syscall_prog_ops = {
.test_run = bpf_prog_test_run_syscall,
};
+
+#ifdef CONFIG_SYSCTL
+static int bpf_stats_handler(struct ctl_table *table, int write,
+ void *buffer, size_t *lenp, loff_t *ppos)
+{
+ struct static_key *key = (struct static_key *)table->data;
+ static int saved_val;
+ int val, ret;
+ struct ctl_table tmp = {
+ .data = &val,
+ .maxlen = sizeof(val),
+ .mode = table->mode,
+ .extra1 = SYSCTL_ZERO,
+ .extra2 = SYSCTL_ONE,
+ };
+
+ if (write && !capable(CAP_SYS_ADMIN))
+ return -EPERM;
+
+ mutex_lock(&bpf_stats_enabled_mutex);
+ val = saved_val;
+ ret = proc_dointvec_minmax(&tmp, write, buffer, lenp, ppos);
+ if (write && !ret && val != saved_val) {
+ if (val)
+ static_key_slow_inc(key);
+ else
+ static_key_slow_dec(key);
+ saved_val = val;
+ }
+ mutex_unlock(&bpf_stats_enabled_mutex);
+ return ret;
+}
+
+void __weak unpriv_ebpf_notify(int new_state)
+{
+}
+
+static int bpf_unpriv_handler(struct ctl_table *table, int write,
+ void *buffer, size_t *lenp, loff_t *ppos)
+{
+ int ret, unpriv_enable = *(int *)table->data;
+ bool locked_state = unpriv_enable == 1;
+ struct ctl_table tmp = *table;
+
+ if (write && !capable(CAP_SYS_ADMIN))
+ return -EPERM;
+
+ tmp.data = &unpriv_enable;
+ ret = proc_dointvec_minmax(&tmp, write, buffer, lenp, ppos);
+ if (write && !ret) {
+ if (locked_state && unpriv_enable != 1)
+ return -EPERM;
+ *(int *)table->data = unpriv_enable;
+ }
+
+ unpriv_ebpf_notify(unpriv_enable);
+
+ return ret;
+}
+
+static struct ctl_table bpf_syscall_table[] = {
+ {
+ .procname = "unprivileged_bpf_disabled",
+ .data = &sysctl_unprivileged_bpf_disabled,
+ .maxlen = sizeof(sysctl_unprivileged_bpf_disabled),
+ .mode = 0644,
+ .proc_handler = bpf_unpriv_handler,
+ .extra1 = SYSCTL_ZERO,
+ .extra2 = SYSCTL_TWO,
+ },
+ {
+ .procname = "bpf_stats_enabled",
+ .data = &bpf_stats_enabled_key.key,
+ .maxlen = sizeof(bpf_stats_enabled_key),
+ .mode = 0644,
+ .proc_handler = bpf_stats_handler,
+ },
+ { }
+};
+
+static int __init bpf_syscall_sysctl_init(void)
+{
+ register_sysctl_init("kernel", bpf_syscall_table);
+ return 0;
+}
+late_initcall(bpf_syscall_sysctl_init);
+#endif /* CONFIG_SYSCTL */
diff --git a/kernel/bpf/task_iter.c b/kernel/bpf/task_iter.c
index d94696198ef8..8c921799def4 100644
--- a/kernel/bpf/task_iter.c
+++ b/kernel/bpf/task_iter.c
@@ -99,7 +99,6 @@ static int __task_seq_show(struct seq_file *seq, struct task_struct *task,
if (!prog)
return 0;
- meta.seq = seq;
ctx.meta = &meta;
ctx.task = task;
return bpf_iter_run_prog(prog, &ctx);
diff --git a/kernel/bpf/trampoline.c b/kernel/bpf/trampoline.c
index ada97751ae1b..93c7675f0c9e 100644
--- a/kernel/bpf/trampoline.c
+++ b/kernel/bpf/trampoline.c
@@ -30,9 +30,12 @@ static DEFINE_MUTEX(trampoline_mutex);
bool bpf_prog_has_trampoline(const struct bpf_prog *prog)
{
enum bpf_attach_type eatype = prog->expected_attach_type;
+ enum bpf_prog_type ptype = prog->type;
- return eatype == BPF_TRACE_FENTRY || eatype == BPF_TRACE_FEXIT ||
- eatype == BPF_MODIFY_RETURN;
+ return (ptype == BPF_PROG_TYPE_TRACING &&
+ (eatype == BPF_TRACE_FENTRY || eatype == BPF_TRACE_FEXIT ||
+ eatype == BPF_MODIFY_RETURN)) ||
+ (ptype == BPF_PROG_TYPE_LSM && eatype == BPF_LSM_MAC);
}
void *bpf_jit_alloc_exec_page(void)
@@ -168,30 +171,30 @@ static int register_fentry(struct bpf_trampoline *tr, void *new_addr)
return ret;
}
-static struct bpf_tramp_progs *
+static struct bpf_tramp_links *
bpf_trampoline_get_progs(const struct bpf_trampoline *tr, int *total, bool *ip_arg)
{
- const struct bpf_prog_aux *aux;
- struct bpf_tramp_progs *tprogs;
- struct bpf_prog **progs;
+ struct bpf_tramp_link *link;
+ struct bpf_tramp_links *tlinks;
+ struct bpf_tramp_link **links;
int kind;
*total = 0;
- tprogs = kcalloc(BPF_TRAMP_MAX, sizeof(*tprogs), GFP_KERNEL);
- if (!tprogs)
+ tlinks = kcalloc(BPF_TRAMP_MAX, sizeof(*tlinks), GFP_KERNEL);
+ if (!tlinks)
return ERR_PTR(-ENOMEM);
for (kind = 0; kind < BPF_TRAMP_MAX; kind++) {
- tprogs[kind].nr_progs = tr->progs_cnt[kind];
+ tlinks[kind].nr_links = tr->progs_cnt[kind];
*total += tr->progs_cnt[kind];
- progs = tprogs[kind].progs;
+ links = tlinks[kind].links;
- hlist_for_each_entry(aux, &tr->progs_hlist[kind], tramp_hlist) {
- *ip_arg |= aux->prog->call_get_func_ip;
- *progs++ = aux->prog;
+ hlist_for_each_entry(link, &tr->progs_hlist[kind], tramp_hlist) {
+ *ip_arg |= link->link.prog->call_get_func_ip;
+ *links++ = link;
}
}
- return tprogs;
+ return tlinks;
}
static void __bpf_tramp_image_put_deferred(struct work_struct *work)
@@ -330,14 +333,14 @@ out:
static int bpf_trampoline_update(struct bpf_trampoline *tr)
{
struct bpf_tramp_image *im;
- struct bpf_tramp_progs *tprogs;
+ struct bpf_tramp_links *tlinks;
u32 flags = BPF_TRAMP_F_RESTORE_REGS;
bool ip_arg = false;
int err, total;
- tprogs = bpf_trampoline_get_progs(tr, &total, &ip_arg);
- if (IS_ERR(tprogs))
- return PTR_ERR(tprogs);
+ tlinks = bpf_trampoline_get_progs(tr, &total, &ip_arg);
+ if (IS_ERR(tlinks))
+ return PTR_ERR(tlinks);
if (total == 0) {
err = unregister_fentry(tr, tr->cur_image->image);
@@ -353,15 +356,15 @@ static int bpf_trampoline_update(struct bpf_trampoline *tr)
goto out;
}
- if (tprogs[BPF_TRAMP_FEXIT].nr_progs ||
- tprogs[BPF_TRAMP_MODIFY_RETURN].nr_progs)
+ if (tlinks[BPF_TRAMP_FEXIT].nr_links ||
+ tlinks[BPF_TRAMP_MODIFY_RETURN].nr_links)
flags = BPF_TRAMP_F_CALL_ORIG | BPF_TRAMP_F_SKIP_FRAME;
if (ip_arg)
flags |= BPF_TRAMP_F_IP_ARG;
err = arch_prepare_bpf_trampoline(im, im->image, im->image + PAGE_SIZE,
- &tr->func.model, flags, tprogs,
+ &tr->func.model, flags, tlinks,
tr->func.addr);
if (err < 0)
goto out;
@@ -381,7 +384,7 @@ static int bpf_trampoline_update(struct bpf_trampoline *tr)
tr->cur_image = im;
tr->selector++;
out:
- kfree(tprogs);
+ kfree(tlinks);
return err;
}
@@ -407,13 +410,14 @@ static enum bpf_tramp_prog_type bpf_attach_type_to_tramp(struct bpf_prog *prog)
}
}
-int bpf_trampoline_link_prog(struct bpf_prog *prog, struct bpf_trampoline *tr)
+int bpf_trampoline_link_prog(struct bpf_tramp_link *link, struct bpf_trampoline *tr)
{
enum bpf_tramp_prog_type kind;
+ struct bpf_tramp_link *link_exiting;
int err = 0;
- int cnt;
+ int cnt = 0, i;
- kind = bpf_attach_type_to_tramp(prog);
+ kind = bpf_attach_type_to_tramp(link->link.prog);
mutex_lock(&tr->mutex);
if (tr->extension_prog) {
/* cannot attach fentry/fexit if extension prog is attached.
@@ -422,32 +426,43 @@ int bpf_trampoline_link_prog(struct bpf_prog *prog, struct bpf_trampoline *tr)
err = -EBUSY;
goto out;
}
- cnt = tr->progs_cnt[BPF_TRAMP_FENTRY] + tr->progs_cnt[BPF_TRAMP_FEXIT];
+
+ for (i = 0; i < BPF_TRAMP_MAX; i++)
+ cnt += tr->progs_cnt[i];
+
if (kind == BPF_TRAMP_REPLACE) {
/* Cannot attach extension if fentry/fexit are in use. */
if (cnt) {
err = -EBUSY;
goto out;
}
- tr->extension_prog = prog;
+ tr->extension_prog = link->link.prog;
err = bpf_arch_text_poke(tr->func.addr, BPF_MOD_JUMP, NULL,
- prog->bpf_func);
+ link->link.prog->bpf_func);
goto out;
}
- if (cnt >= BPF_MAX_TRAMP_PROGS) {
+ if (cnt >= BPF_MAX_TRAMP_LINKS) {
err = -E2BIG;
goto out;
}
- if (!hlist_unhashed(&prog->aux->tramp_hlist)) {
+ if (!hlist_unhashed(&link->tramp_hlist)) {
+ /* prog already linked */
+ err = -EBUSY;
+ goto out;
+ }
+ hlist_for_each_entry(link_exiting, &tr->progs_hlist[kind], tramp_hlist) {
+ if (link_exiting->link.prog != link->link.prog)
+ continue;
/* prog already linked */
err = -EBUSY;
goto out;
}
- hlist_add_head(&prog->aux->tramp_hlist, &tr->progs_hlist[kind]);
+
+ hlist_add_head(&link->tramp_hlist, &tr->progs_hlist[kind]);
tr->progs_cnt[kind]++;
err = bpf_trampoline_update(tr);
if (err) {
- hlist_del_init(&prog->aux->tramp_hlist);
+ hlist_del_init(&link->tramp_hlist);
tr->progs_cnt[kind]--;
}
out:
@@ -456,12 +471,12 @@ out:
}
/* bpf_trampoline_unlink_prog() should never fail. */
-int bpf_trampoline_unlink_prog(struct bpf_prog *prog, struct bpf_trampoline *tr)
+int bpf_trampoline_unlink_prog(struct bpf_tramp_link *link, struct bpf_trampoline *tr)
{
enum bpf_tramp_prog_type kind;
int err;
- kind = bpf_attach_type_to_tramp(prog);
+ kind = bpf_attach_type_to_tramp(link->link.prog);
mutex_lock(&tr->mutex);
if (kind == BPF_TRAMP_REPLACE) {
WARN_ON_ONCE(!tr->extension_prog);
@@ -470,7 +485,7 @@ int bpf_trampoline_unlink_prog(struct bpf_prog *prog, struct bpf_trampoline *tr)
tr->extension_prog = NULL;
goto out;
}
- hlist_del_init(&prog->aux->tramp_hlist);
+ hlist_del_init(&link->tramp_hlist);
tr->progs_cnt[kind]--;
err = bpf_trampoline_update(tr);
out:
@@ -500,16 +515,19 @@ out:
void bpf_trampoline_put(struct bpf_trampoline *tr)
{
+ int i;
+
if (!tr)
return;
mutex_lock(&trampoline_mutex);
if (!refcount_dec_and_test(&tr->refcnt))
goto out;
WARN_ON_ONCE(mutex_is_locked(&tr->mutex));
- if (WARN_ON_ONCE(!hlist_empty(&tr->progs_hlist[BPF_TRAMP_FENTRY])))
- goto out;
- if (WARN_ON_ONCE(!hlist_empty(&tr->progs_hlist[BPF_TRAMP_FEXIT])))
- goto out;
+
+ for (i = 0; i < BPF_TRAMP_MAX; i++)
+ if (WARN_ON_ONCE(!hlist_empty(&tr->progs_hlist[i])))
+ goto out;
+
/* This code will be executed even when the last bpf_tramp_image
* is alive. All progs are detached from the trampoline and the
* trampoline image is patched with jmp into epilogue to skip
@@ -559,11 +577,14 @@ static void notrace inc_misses_counter(struct bpf_prog *prog)
* [2..MAX_U64] - execute bpf prog and record execution time.
* This is start time.
*/
-u64 notrace __bpf_prog_enter(struct bpf_prog *prog)
+u64 notrace __bpf_prog_enter(struct bpf_prog *prog, struct bpf_tramp_run_ctx *run_ctx)
__acquires(RCU)
{
rcu_read_lock();
migrate_disable();
+
+ run_ctx->saved_run_ctx = bpf_set_run_ctx(&run_ctx->run_ctx);
+
if (unlikely(__this_cpu_inc_return(*(prog->active)) != 1)) {
inc_misses_counter(prog);
return 0;
@@ -593,29 +614,38 @@ static void notrace update_prog_stats(struct bpf_prog *prog,
}
}
-void notrace __bpf_prog_exit(struct bpf_prog *prog, u64 start)
+void notrace __bpf_prog_exit(struct bpf_prog *prog, u64 start, struct bpf_tramp_run_ctx *run_ctx)
__releases(RCU)
{
+ bpf_reset_run_ctx(run_ctx->saved_run_ctx);
+
update_prog_stats(prog, start);
__this_cpu_dec(*(prog->active));
migrate_enable();
rcu_read_unlock();
}
-u64 notrace __bpf_prog_enter_sleepable(struct bpf_prog *prog)
+u64 notrace __bpf_prog_enter_sleepable(struct bpf_prog *prog, struct bpf_tramp_run_ctx *run_ctx)
{
rcu_read_lock_trace();
migrate_disable();
might_fault();
+
if (unlikely(__this_cpu_inc_return(*(prog->active)) != 1)) {
inc_misses_counter(prog);
return 0;
}
+
+ run_ctx->saved_run_ctx = bpf_set_run_ctx(&run_ctx->run_ctx);
+
return bpf_prog_start_time();
}
-void notrace __bpf_prog_exit_sleepable(struct bpf_prog *prog, u64 start)
+void notrace __bpf_prog_exit_sleepable(struct bpf_prog *prog, u64 start,
+ struct bpf_tramp_run_ctx *run_ctx)
{
+ bpf_reset_run_ctx(run_ctx->saved_run_ctx);
+
update_prog_stats(prog, start);
__this_cpu_dec(*(prog->active));
migrate_enable();
@@ -635,7 +665,7 @@ void notrace __bpf_tramp_exit(struct bpf_tramp_image *tr)
int __weak
arch_prepare_bpf_trampoline(struct bpf_tramp_image *tr, void *image, void *image_end,
const struct btf_func_model *m, u32 flags,
- struct bpf_tramp_progs *tprogs,
+ struct bpf_tramp_links *tlinks,
void *orig_call)
{
return -ENOTSUPP;
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index d175b70067b3..aedac2ac02b9 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -187,6 +187,9 @@ struct bpf_verifier_stack_elem {
POISON_POINTER_DELTA))
#define BPF_MAP_PTR(X) ((struct bpf_map *)((X) & ~BPF_MAP_PTR_UNPRIV))
+static int acquire_reference_state(struct bpf_verifier_env *env, int insn_idx);
+static int release_reference(struct bpf_verifier_env *env, int ref_obj_id);
+
static bool bpf_map_ptr_poisoned(const struct bpf_insn_aux_data *aux)
{
return BPF_MAP_PTR(aux->map_ptr_state) == BPF_MAP_PTR_POISON;
@@ -245,6 +248,7 @@ struct bpf_call_arg_meta {
struct bpf_map *map_ptr;
bool raw_mode;
bool pkt_access;
+ u8 release_regno;
int regno;
int access_size;
int mem_size;
@@ -257,6 +261,8 @@ struct bpf_call_arg_meta {
struct btf *ret_btf;
u32 ret_btf_id;
u32 subprogno;
+ struct bpf_map_value_off_desc *kptr_off_desc;
+ u8 uninit_dynptr_regno;
};
struct btf *btf_vmlinux;
@@ -471,17 +477,6 @@ static bool type_may_be_null(u32 type)
return type & PTR_MAYBE_NULL;
}
-/* Determine whether the function releases some resources allocated by another
- * function call. The first reference type argument will be assumed to be
- * released by release_reference().
- */
-static bool is_release_function(enum bpf_func_id func_id)
-{
- return func_id == BPF_FUNC_sk_release ||
- func_id == BPF_FUNC_ringbuf_submit ||
- func_id == BPF_FUNC_ringbuf_discard;
-}
-
static bool may_be_acquire_function(enum bpf_func_id func_id)
{
return func_id == BPF_FUNC_sk_lookup_tcp ||
@@ -499,7 +494,8 @@ static bool is_acquire_function(enum bpf_func_id func_id,
if (func_id == BPF_FUNC_sk_lookup_tcp ||
func_id == BPF_FUNC_sk_lookup_udp ||
func_id == BPF_FUNC_skc_lookup_tcp ||
- func_id == BPF_FUNC_ringbuf_reserve)
+ func_id == BPF_FUNC_ringbuf_reserve ||
+ func_id == BPF_FUNC_kptr_xchg)
return true;
if (func_id == BPF_FUNC_map_lookup_elem &&
@@ -517,6 +513,7 @@ static bool is_ptr_cast_function(enum bpf_func_id func_id)
func_id == BPF_FUNC_skc_to_tcp_sock ||
func_id == BPF_FUNC_skc_to_tcp6_sock ||
func_id == BPF_FUNC_skc_to_udp6_sock ||
+ func_id == BPF_FUNC_skc_to_mptcp_sock ||
func_id == BPF_FUNC_skc_to_tcp_timewait_sock ||
func_id == BPF_FUNC_skc_to_tcp_request_sock;
}
@@ -575,6 +572,8 @@ static const char *reg_type_str(struct bpf_verifier_env *env,
strncpy(prefix, "user_", 32);
if (type & MEM_PERCPU)
strncpy(prefix, "percpu_", 32);
+ if (type & PTR_UNTRUSTED)
+ strncpy(prefix, "untrusted_", 32);
snprintf(env->type_str_buf, TYPE_STR_BUF_LEN, "%s%s%s",
prefix, str[base_type(type)], postfix);
@@ -586,6 +585,7 @@ static char slot_type_char[] = {
[STACK_SPILL] = 'r',
[STACK_MISC] = 'm',
[STACK_ZERO] = '0',
+ [STACK_DYNPTR] = 'd',
};
static void print_liveness(struct bpf_verifier_env *env,
@@ -601,6 +601,25 @@ static void print_liveness(struct bpf_verifier_env *env,
verbose(env, "D");
}
+static int get_spi(s32 off)
+{
+ return (-off - 1) / BPF_REG_SIZE;
+}
+
+static bool is_spi_bounds_valid(struct bpf_func_state *state, int spi, int nr_slots)
+{
+ int allocated_slots = state->allocated_stack / BPF_REG_SIZE;
+
+ /* We need to check that slots between [spi - nr_slots + 1, spi] are
+ * within [0, allocated_stack).
+ *
+ * Please note that the spi grows downwards. For example, a dynptr
+ * takes the size of two stack slots; the first slot will be at
+ * spi and the second slot will be at spi - 1.
+ */
+ return spi - nr_slots + 1 >= 0 && spi < allocated_slots;
+}
+
static struct bpf_func_state *func(struct bpf_verifier_env *env,
const struct bpf_reg_state *reg)
{
@@ -652,6 +671,132 @@ static void mark_verifier_state_scratched(struct bpf_verifier_env *env)
env->scratched_stack_slots = ~0ULL;
}
+static enum bpf_dynptr_type arg_to_dynptr_type(enum bpf_arg_type arg_type)
+{
+ switch (arg_type & DYNPTR_TYPE_FLAG_MASK) {
+ case DYNPTR_TYPE_LOCAL:
+ return BPF_DYNPTR_TYPE_LOCAL;
+ case DYNPTR_TYPE_RINGBUF:
+ return BPF_DYNPTR_TYPE_RINGBUF;
+ default:
+ return BPF_DYNPTR_TYPE_INVALID;
+ }
+}
+
+static bool dynptr_type_refcounted(enum bpf_dynptr_type type)
+{
+ return type == BPF_DYNPTR_TYPE_RINGBUF;
+}
+
+static int mark_stack_slots_dynptr(struct bpf_verifier_env *env, struct bpf_reg_state *reg,
+ enum bpf_arg_type arg_type, int insn_idx)
+{
+ struct bpf_func_state *state = func(env, reg);
+ enum bpf_dynptr_type type;
+ int spi, i, id;
+
+ spi = get_spi(reg->off);
+
+ if (!is_spi_bounds_valid(state, spi, BPF_DYNPTR_NR_SLOTS))
+ return -EINVAL;
+
+ for (i = 0; i < BPF_REG_SIZE; i++) {
+ state->stack[spi].slot_type[i] = STACK_DYNPTR;
+ state->stack[spi - 1].slot_type[i] = STACK_DYNPTR;
+ }
+
+ type = arg_to_dynptr_type(arg_type);
+ if (type == BPF_DYNPTR_TYPE_INVALID)
+ return -EINVAL;
+
+ state->stack[spi].spilled_ptr.dynptr.first_slot = true;
+ state->stack[spi].spilled_ptr.dynptr.type = type;
+ state->stack[spi - 1].spilled_ptr.dynptr.type = type;
+
+ if (dynptr_type_refcounted(type)) {
+ /* The id is used to track proper releasing */
+ id = acquire_reference_state(env, insn_idx);
+ if (id < 0)
+ return id;
+
+ state->stack[spi].spilled_ptr.id = id;
+ state->stack[spi - 1].spilled_ptr.id = id;
+ }
+
+ return 0;
+}
+
+static int unmark_stack_slots_dynptr(struct bpf_verifier_env *env, struct bpf_reg_state *reg)
+{
+ struct bpf_func_state *state = func(env, reg);
+ int spi, i;
+
+ spi = get_spi(reg->off);
+
+ if (!is_spi_bounds_valid(state, spi, BPF_DYNPTR_NR_SLOTS))
+ return -EINVAL;
+
+ for (i = 0; i < BPF_REG_SIZE; i++) {
+ state->stack[spi].slot_type[i] = STACK_INVALID;
+ state->stack[spi - 1].slot_type[i] = STACK_INVALID;
+ }
+
+ /* Invalidate any slices associated with this dynptr */
+ if (dynptr_type_refcounted(state->stack[spi].spilled_ptr.dynptr.type)) {
+ release_reference(env, state->stack[spi].spilled_ptr.id);
+ state->stack[spi].spilled_ptr.id = 0;
+ state->stack[spi - 1].spilled_ptr.id = 0;
+ }
+
+ state->stack[spi].spilled_ptr.dynptr.first_slot = false;
+ state->stack[spi].spilled_ptr.dynptr.type = 0;
+ state->stack[spi - 1].spilled_ptr.dynptr.type = 0;
+
+ return 0;
+}
+
+static bool is_dynptr_reg_valid_uninit(struct bpf_verifier_env *env, struct bpf_reg_state *reg)
+{
+ struct bpf_func_state *state = func(env, reg);
+ int spi = get_spi(reg->off);
+ int i;
+
+ if (!is_spi_bounds_valid(state, spi, BPF_DYNPTR_NR_SLOTS))
+ return true;
+
+ for (i = 0; i < BPF_REG_SIZE; i++) {
+ if (state->stack[spi].slot_type[i] == STACK_DYNPTR ||
+ state->stack[spi - 1].slot_type[i] == STACK_DYNPTR)
+ return false;
+ }
+
+ return true;
+}
+
+static bool is_dynptr_reg_valid_init(struct bpf_verifier_env *env, struct bpf_reg_state *reg,
+ enum bpf_arg_type arg_type)
+{
+ struct bpf_func_state *state = func(env, reg);
+ int spi = get_spi(reg->off);
+ int i;
+
+ if (!is_spi_bounds_valid(state, spi, BPF_DYNPTR_NR_SLOTS) ||
+ !state->stack[spi].spilled_ptr.dynptr.first_slot)
+ return false;
+
+ for (i = 0; i < BPF_REG_SIZE; i++) {
+ if (state->stack[spi].slot_type[i] != STACK_DYNPTR ||
+ state->stack[spi - 1].slot_type[i] != STACK_DYNPTR)
+ return false;
+ }
+
+ /* ARG_PTR_TO_DYNPTR takes any type of dynptr */
+ if (arg_type == ARG_PTR_TO_DYNPTR)
+ return true;
+
+ return state->stack[spi].spilled_ptr.dynptr.type == arg_to_dynptr_type(arg_type);
+}
+
/* The reg state of a pointer or a bounded scalar was saved when
* it was spilled to the stack.
*/
@@ -1821,8 +1966,7 @@ void bpf_free_kfunc_btf_tab(struct bpf_kfunc_btf_tab *tab)
kfree(tab);
}
-static struct btf *find_kfunc_desc_btf(struct bpf_verifier_env *env,
- u32 func_id, s16 offset)
+static struct btf *find_kfunc_desc_btf(struct bpf_verifier_env *env, s16 offset)
{
if (offset) {
if (offset < 0) {
@@ -1897,7 +2041,7 @@ static int add_kfunc_call(struct bpf_verifier_env *env, u32 func_id, s16 offset)
prog_aux->kfunc_btf_tab = btf_tab;
}
- desc_btf = find_kfunc_desc_btf(env, func_id, offset);
+ desc_btf = find_kfunc_desc_btf(env, offset);
if (IS_ERR(desc_btf)) {
verbose(env, "failed to find BTF for kernel function\n");
return PTR_ERR(desc_btf);
@@ -2366,7 +2510,7 @@ static const char *disasm_kfunc_name(void *data, const struct bpf_insn *insn)
if (insn->src_reg != BPF_PSEUDO_KFUNC_CALL)
return NULL;
- desc_btf = find_kfunc_desc_btf(data, insn->imm, insn->off);
+ desc_btf = find_kfunc_desc_btf(data, insn->off);
if (IS_ERR(desc_btf))
return "<error>";
@@ -3211,7 +3355,7 @@ static int check_stack_read_fixed_off(struct bpf_verifier_env *env,
return 0;
}
-enum stack_access_src {
+enum bpf_access_src {
ACCESS_DIRECT = 1, /* the access is performed by an instruction */
ACCESS_HELPER = 2, /* the access is performed by a helper */
};
@@ -3219,7 +3363,7 @@ enum stack_access_src {
static int check_stack_range_initialized(struct bpf_verifier_env *env,
int regno, int off, int access_size,
bool zero_size_allowed,
- enum stack_access_src type,
+ enum bpf_access_src type,
struct bpf_call_arg_meta *meta);
static struct bpf_reg_state *reg_state(struct bpf_verifier_env *env, int regno)
@@ -3469,9 +3613,175 @@ static int check_mem_region_access(struct bpf_verifier_env *env, u32 regno,
return 0;
}
+static int __check_ptr_off_reg(struct bpf_verifier_env *env,
+ const struct bpf_reg_state *reg, int regno,
+ bool fixed_off_ok)
+{
+ /* Access to this pointer-typed register or passing it to a helper
+ * is only allowed in its original, unmodified form.
+ */
+
+ if (reg->off < 0) {
+ verbose(env, "negative offset %s ptr R%d off=%d disallowed\n",
+ reg_type_str(env, reg->type), regno, reg->off);
+ return -EACCES;
+ }
+
+ if (!fixed_off_ok && reg->off) {
+ verbose(env, "dereference of modified %s ptr R%d off=%d disallowed\n",
+ reg_type_str(env, reg->type), regno, reg->off);
+ return -EACCES;
+ }
+
+ if (!tnum_is_const(reg->var_off) || reg->var_off.value) {
+ char tn_buf[48];
+
+ tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off);
+ verbose(env, "variable %s access var_off=%s disallowed\n",
+ reg_type_str(env, reg->type), tn_buf);
+ return -EACCES;
+ }
+
+ return 0;
+}
+
+int check_ptr_off_reg(struct bpf_verifier_env *env,
+ const struct bpf_reg_state *reg, int regno)
+{
+ return __check_ptr_off_reg(env, reg, regno, false);
+}
+
+static int map_kptr_match_type(struct bpf_verifier_env *env,
+ struct bpf_map_value_off_desc *off_desc,
+ struct bpf_reg_state *reg, u32 regno)
+{
+ const char *targ_name = kernel_type_name(off_desc->kptr.btf, off_desc->kptr.btf_id);
+ int perm_flags = PTR_MAYBE_NULL;
+ const char *reg_name = "";
+
+ /* Only unreferenced case accepts untrusted pointers */
+ if (off_desc->type == BPF_KPTR_UNREF)
+ perm_flags |= PTR_UNTRUSTED;
+
+ if (base_type(reg->type) != PTR_TO_BTF_ID || (type_flag(reg->type) & ~perm_flags))
+ goto bad_type;
+
+ if (!btf_is_kernel(reg->btf)) {
+ verbose(env, "R%d must point to kernel BTF\n", regno);
+ return -EINVAL;
+ }
+ /* We need to verify reg->type and reg->btf, before accessing reg->btf */
+ reg_name = kernel_type_name(reg->btf, reg->btf_id);
+
+ /* For ref_ptr case, release function check should ensure we get one
+ * referenced PTR_TO_BTF_ID, and that its fixed offset is 0. For the
+ * normal store of unreferenced kptr, we must ensure var_off is zero.
+ * Since ref_ptr cannot be accessed directly by BPF insns, checks for
+ * reg->off and reg->ref_obj_id are not needed here.
+ */
+ if (__check_ptr_off_reg(env, reg, regno, true))
+ return -EACCES;
+
+ /* A full type match is needed, as BTF can be vmlinux or module BTF, and
+ * we also need to take into account the reg->off.
+ *
+ * We want to support cases like:
+ *
+ * struct foo {
+ * struct bar br;
+ * struct baz bz;
+ * };
+ *
+ * struct foo *v;
+ * v = func(); // PTR_TO_BTF_ID
+ * val->foo = v; // reg->off is zero, btf and btf_id match type
+ * val->bar = &v->br; // reg->off is still zero, but we need to retry with
+ * // first member type of struct after comparison fails
+ * val->baz = &v->bz; // reg->off is non-zero, so struct needs to be walked
+ * // to match type
+ *
+ * In the kptr_ref case, check_func_arg_reg_off already ensures reg->off
+ * is zero. We must also ensure that btf_struct_ids_match does not walk
+ * the struct to match type against first member of struct, i.e. reject
+ * second case from above. Hence, when type is BPF_KPTR_REF, we set
+ * strict mode to true for type match.
+ */
+ if (!btf_struct_ids_match(&env->log, reg->btf, reg->btf_id, reg->off,
+ off_desc->kptr.btf, off_desc->kptr.btf_id,
+ off_desc->type == BPF_KPTR_REF))
+ goto bad_type;
+ return 0;
+bad_type:
+ verbose(env, "invalid kptr access, R%d type=%s%s ", regno,
+ reg_type_str(env, reg->type), reg_name);
+ verbose(env, "expected=%s%s", reg_type_str(env, PTR_TO_BTF_ID), targ_name);
+ if (off_desc->type == BPF_KPTR_UNREF)
+ verbose(env, " or %s%s\n", reg_type_str(env, PTR_TO_BTF_ID | PTR_UNTRUSTED),
+ targ_name);
+ else
+ verbose(env, "\n");
+ return -EINVAL;
+}
+
+static int check_map_kptr_access(struct bpf_verifier_env *env, u32 regno,
+ int value_regno, int insn_idx,
+ struct bpf_map_value_off_desc *off_desc)
+{
+ struct bpf_insn *insn = &env->prog->insnsi[insn_idx];
+ int class = BPF_CLASS(insn->code);
+ struct bpf_reg_state *val_reg;
+
+ /* Things we already checked for in check_map_access and caller:
+ * - Reject cases where variable offset may touch kptr
+ * - size of access (must be BPF_DW)
+ * - tnum_is_const(reg->var_off)
+ * - off_desc->offset == off + reg->var_off.value
+ */
+ /* Only BPF_[LDX,STX,ST] | BPF_MEM | BPF_DW is supported */
+ if (BPF_MODE(insn->code) != BPF_MEM) {
+ verbose(env, "kptr in map can only be accessed using BPF_MEM instruction mode\n");
+ return -EACCES;
+ }
+
+ /* We only allow loading referenced kptr, since it will be marked as
+ * untrusted, similar to unreferenced kptr.
+ */
+ if (class != BPF_LDX && off_desc->type == BPF_KPTR_REF) {
+ verbose(env, "store to referenced kptr disallowed\n");
+ return -EACCES;
+ }
+
+ if (class == BPF_LDX) {
+ val_reg = reg_state(env, value_regno);
+ /* We can simply mark the value_regno receiving the pointer
+ * value from map as PTR_TO_BTF_ID, with the correct type.
+ */
+ mark_btf_ld_reg(env, cur_regs(env), value_regno, PTR_TO_BTF_ID, off_desc->kptr.btf,
+ off_desc->kptr.btf_id, PTR_MAYBE_NULL | PTR_UNTRUSTED);
+ /* For mark_ptr_or_null_reg */
+ val_reg->id = ++env->id_gen;
+ } else if (class == BPF_STX) {
+ val_reg = reg_state(env, value_regno);
+ if (!register_is_null(val_reg) &&
+ map_kptr_match_type(env, off_desc, val_reg, value_regno))
+ return -EACCES;
+ } else if (class == BPF_ST) {
+ if (insn->imm) {
+ verbose(env, "BPF_ST imm must be 0 when storing to kptr at off=%u\n",
+ off_desc->offset);
+ return -EACCES;
+ }
+ } else {
+ verbose(env, "kptr in map can only be accessed using BPF_LDX/BPF_STX/BPF_ST\n");
+ return -EACCES;
+ }
+ return 0;
+}
+
/* check read/write into a map element with possible variable offset */
static int check_map_access(struct bpf_verifier_env *env, u32 regno,
- int off, int size, bool zero_size_allowed)
+ int off, int size, bool zero_size_allowed,
+ enum bpf_access_src src)
{
struct bpf_verifier_state *vstate = env->cur_state;
struct bpf_func_state *state = vstate->frame[vstate->curframe];
@@ -3507,6 +3817,36 @@ static int check_map_access(struct bpf_verifier_env *env, u32 regno,
return -EACCES;
}
}
+ if (map_value_has_kptrs(map)) {
+ struct bpf_map_value_off *tab = map->kptr_off_tab;
+ int i;
+
+ for (i = 0; i < tab->nr_off; i++) {
+ u32 p = tab->off[i].offset;
+
+ if (reg->smin_value + off < p + sizeof(u64) &&
+ p < reg->umax_value + off + size) {
+ if (src != ACCESS_DIRECT) {
+ verbose(env, "kptr cannot be accessed indirectly by helper\n");
+ return -EACCES;
+ }
+ if (!tnum_is_const(reg->var_off)) {
+ verbose(env, "kptr access cannot have variable offset\n");
+ return -EACCES;
+ }
+ if (p != off + reg->var_off.value) {
+ verbose(env, "kptr access misaligned expected=%u off=%llu\n",
+ p, off + reg->var_off.value);
+ return -EACCES;
+ }
+ if (size != bpf_size_to_bytes(BPF_DW)) {
+ verbose(env, "kptr access size must be BPF_DW\n");
+ return -EACCES;
+ }
+ break;
+ }
+ }
+ }
return err;
}
@@ -3980,44 +4320,6 @@ static int get_callee_stack_depth(struct bpf_verifier_env *env,
}
#endif
-static int __check_ptr_off_reg(struct bpf_verifier_env *env,
- const struct bpf_reg_state *reg, int regno,
- bool fixed_off_ok)
-{
- /* Access to this pointer-typed register or passing it to a helper
- * is only allowed in its original, unmodified form.
- */
-
- if (reg->off < 0) {
- verbose(env, "negative offset %s ptr R%d off=%d disallowed\n",
- reg_type_str(env, reg->type), regno, reg->off);
- return -EACCES;
- }
-
- if (!fixed_off_ok && reg->off) {
- verbose(env, "dereference of modified %s ptr R%d off=%d disallowed\n",
- reg_type_str(env, reg->type), regno, reg->off);
- return -EACCES;
- }
-
- if (!tnum_is_const(reg->var_off) || reg->var_off.value) {
- char tn_buf[48];
-
- tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off);
- verbose(env, "variable %s access var_off=%s disallowed\n",
- reg_type_str(env, reg->type), tn_buf);
- return -EACCES;
- }
-
- return 0;
-}
-
-int check_ptr_off_reg(struct bpf_verifier_env *env,
- const struct bpf_reg_state *reg, int regno)
-{
- return __check_ptr_off_reg(env, reg, regno, false);
-}
-
static int __check_buffer_access(struct bpf_verifier_env *env,
const char *buf_info,
const struct bpf_reg_state *reg,
@@ -4224,6 +4526,12 @@ static int check_ptr_to_btf_access(struct bpf_verifier_env *env,
if (ret < 0)
return ret;
+ /* If this is an untrusted pointer, all pointers formed by walking it
+ * also inherit the untrusted flag.
+ */
+ if (type_flag(reg->type) & PTR_UNTRUSTED)
+ flag |= PTR_UNTRUSTED;
+
if (atype == BPF_READ && value_regno >= 0)
mark_btf_ld_reg(env, regs, value_regno, ret, reg->btf, btf_id, flag);
@@ -4316,7 +4624,7 @@ static int check_stack_slot_within_bounds(int off,
static int check_stack_access_within_bounds(
struct bpf_verifier_env *env,
int regno, int off, int access_size,
- enum stack_access_src src, enum bpf_access_type type)
+ enum bpf_access_src src, enum bpf_access_type type)
{
struct bpf_reg_state *regs = cur_regs(env);
struct bpf_reg_state *reg = regs + regno;
@@ -4412,6 +4720,8 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn
if (value_regno >= 0)
mark_reg_unknown(env, regs, value_regno);
} else if (reg->type == PTR_TO_MAP_VALUE) {
+ struct bpf_map_value_off_desc *kptr_off_desc = NULL;
+
if (t == BPF_WRITE && value_regno >= 0 &&
is_pointer_value(env, value_regno)) {
verbose(env, "R%d leaks addr into map\n", value_regno);
@@ -4420,8 +4730,16 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn
err = check_map_access_type(env, regno, off, size, t);
if (err)
return err;
- err = check_map_access(env, regno, off, size, false);
- if (!err && t == BPF_READ && value_regno >= 0) {
+ err = check_map_access(env, regno, off, size, false, ACCESS_DIRECT);
+ if (err)
+ return err;
+ if (tnum_is_const(reg->var_off))
+ kptr_off_desc = bpf_map_kptr_off_contains(reg->map_ptr,
+ off + reg->var_off.value);
+ if (kptr_off_desc) {
+ err = check_map_kptr_access(env, regno, value_regno, insn_idx,
+ kptr_off_desc);
+ } else if (t == BPF_READ && value_regno >= 0) {
struct bpf_map *map = reg->map_ptr;
/* if map is read-only, track its contents as scalars */
@@ -4724,7 +5042,7 @@ static int check_atomic(struct bpf_verifier_env *env, int insn_idx, struct bpf_i
static int check_stack_range_initialized(
struct bpf_verifier_env *env, int regno, int off,
int access_size, bool zero_size_allowed,
- enum stack_access_src type, struct bpf_call_arg_meta *meta)
+ enum bpf_access_src type, struct bpf_call_arg_meta *meta)
{
struct bpf_reg_state *reg = reg_state(env, regno);
struct bpf_func_state *state = func(env, reg);
@@ -4861,6 +5179,11 @@ static int check_helper_mem_access(struct bpf_verifier_env *env, int regno,
return check_packet_access(env, regno, reg->off, access_size,
zero_size_allowed);
case PTR_TO_MAP_KEY:
+ if (meta && meta->raw_mode) {
+ verbose(env, "R%d cannot write into %s\n", regno,
+ reg_type_str(env, reg->type));
+ return -EACCES;
+ }
return check_mem_region_access(env, regno, reg->off, access_size,
reg->map_ptr->key_size, false);
case PTR_TO_MAP_VALUE:
@@ -4869,15 +5192,25 @@ static int check_helper_mem_access(struct bpf_verifier_env *env, int regno,
BPF_READ))
return -EACCES;
return check_map_access(env, regno, reg->off, access_size,
- zero_size_allowed);
+ zero_size_allowed, ACCESS_HELPER);
case PTR_TO_MEM:
+ if (type_is_rdonly_mem(reg->type)) {
+ if (meta && meta->raw_mode) {
+ verbose(env, "R%d cannot write into %s\n", regno,
+ reg_type_str(env, reg->type));
+ return -EACCES;
+ }
+ }
return check_mem_region_access(env, regno, reg->off,
access_size, reg->mem_size,
zero_size_allowed);
case PTR_TO_BUF:
if (type_is_rdonly_mem(reg->type)) {
- if (meta && meta->raw_mode)
+ if (meta && meta->raw_mode) {
+ verbose(env, "R%d cannot write into %s\n", regno,
+ reg_type_str(env, reg->type));
return -EACCES;
+ }
max_access = &env->prog->aux->max_rdonly_access;
} else {
@@ -4919,8 +5252,7 @@ static int check_mem_size_reg(struct bpf_verifier_env *env,
* out. Only upper bounds can be learned because retval is an
* int type and negative retvals are allowed.
*/
- if (meta)
- meta->msize_max_value = reg->umax_value;
+ meta->msize_max_value = reg->umax_value;
/* The register is SCALAR_VALUE; the access check
* happens using its boundaries.
@@ -4963,24 +5295,33 @@ static int check_mem_size_reg(struct bpf_verifier_env *env,
int check_mem_reg(struct bpf_verifier_env *env, struct bpf_reg_state *reg,
u32 regno, u32 mem_size)
{
+ bool may_be_null = type_may_be_null(reg->type);
+ struct bpf_reg_state saved_reg;
+ struct bpf_call_arg_meta meta;
+ int err;
+
if (register_is_null(reg))
return 0;
- if (type_may_be_null(reg->type)) {
- /* Assuming that the register contains a value check if the memory
- * access is safe. Temporarily save and restore the register's state as
- * the conversion shouldn't be visible to a caller.
- */
- const struct bpf_reg_state saved_reg = *reg;
- int rv;
-
+ memset(&meta, 0, sizeof(meta));
+ /* Assuming that the register contains a value check if the memory
+ * access is safe. Temporarily save and restore the register's state as
+ * the conversion shouldn't be visible to a caller.
+ */
+ if (may_be_null) {
+ saved_reg = *reg;
mark_ptr_not_null_reg(reg);
- rv = check_helper_mem_access(env, regno, mem_size, true, NULL);
- *reg = saved_reg;
- return rv;
}
- return check_helper_mem_access(env, regno, mem_size, true, NULL);
+ err = check_helper_mem_access(env, regno, mem_size, true, &meta);
+ /* Check access for BPF_WRITE */
+ meta.raw_mode = true;
+ err = err ?: check_helper_mem_access(env, regno, mem_size, true, &meta);
+
+ if (may_be_null)
+ *reg = saved_reg;
+
+ return err;
}
int check_kfunc_mem_size_reg(struct bpf_verifier_env *env, struct bpf_reg_state *reg,
@@ -4989,16 +5330,22 @@ int check_kfunc_mem_size_reg(struct bpf_verifier_env *env, struct bpf_reg_state
struct bpf_reg_state *mem_reg = &cur_regs(env)[regno - 1];
bool may_be_null = type_may_be_null(mem_reg->type);
struct bpf_reg_state saved_reg;
+ struct bpf_call_arg_meta meta;
int err;
WARN_ON_ONCE(regno < BPF_REG_2 || regno > BPF_REG_5);
+ memset(&meta, 0, sizeof(meta));
+
if (may_be_null) {
saved_reg = *mem_reg;
mark_ptr_not_null_reg(mem_reg);
}
- err = check_mem_size_reg(env, reg, regno, true, NULL);
+ err = check_mem_size_reg(env, reg, regno, true, &meta);
+ /* Check access for BPF_WRITE */
+ meta.raw_mode = true;
+ err = err ?: check_mem_size_reg(env, reg, regno, true, &meta);
if (may_be_null)
*mem_reg = saved_reg;
@@ -5134,10 +5481,51 @@ static int process_timer_func(struct bpf_verifier_env *env, int regno,
return 0;
}
-static bool arg_type_is_mem_ptr(enum bpf_arg_type type)
+static int process_kptr_func(struct bpf_verifier_env *env, int regno,
+ struct bpf_call_arg_meta *meta)
{
- return base_type(type) == ARG_PTR_TO_MEM ||
- base_type(type) == ARG_PTR_TO_UNINIT_MEM;
+ struct bpf_reg_state *regs = cur_regs(env), *reg = &regs[regno];
+ struct bpf_map_value_off_desc *off_desc;
+ struct bpf_map *map_ptr = reg->map_ptr;
+ u32 kptr_off;
+ int ret;
+
+ if (!tnum_is_const(reg->var_off)) {
+ verbose(env,
+ "R%d doesn't have constant offset. kptr has to be at the constant offset\n",
+ regno);
+ return -EINVAL;
+ }
+ if (!map_ptr->btf) {
+ verbose(env, "map '%s' has to have BTF in order to use bpf_kptr_xchg\n",
+ map_ptr->name);
+ return -EINVAL;
+ }
+ if (!map_value_has_kptrs(map_ptr)) {
+ ret = PTR_ERR_OR_ZERO(map_ptr->kptr_off_tab);
+ if (ret == -E2BIG)
+ verbose(env, "map '%s' has more than %d kptr\n", map_ptr->name,
+ BPF_MAP_VALUE_OFF_MAX);
+ else if (ret == -EEXIST)
+ verbose(env, "map '%s' has repeating kptr BTF tags\n", map_ptr->name);
+ else
+ verbose(env, "map '%s' has no valid kptr\n", map_ptr->name);
+ return -EINVAL;
+ }
+
+ meta->map_ptr = map_ptr;
+ kptr_off = reg->off + reg->var_off.value;
+ off_desc = bpf_map_kptr_off_contains(map_ptr, kptr_off);
+ if (!off_desc) {
+ verbose(env, "off=%d doesn't point to kptr\n", kptr_off);
+ return -EACCES;
+ }
+ if (off_desc->type != BPF_KPTR_REF) {
+ verbose(env, "off=%d kptr isn't referenced kptr\n", kptr_off);
+ return -EACCES;
+ }
+ meta->kptr_off_desc = off_desc;
+ return 0;
}
static bool arg_type_is_mem_size(enum bpf_arg_type type)
@@ -5157,6 +5545,16 @@ static bool arg_type_is_int_ptr(enum bpf_arg_type type)
type == ARG_PTR_TO_LONG;
}
+static bool arg_type_is_release(enum bpf_arg_type type)
+{
+ return type & OBJ_RELEASE;
+}
+
+static bool arg_type_is_dynptr(enum bpf_arg_type type)
+{
+ return base_type(type) == ARG_PTR_TO_DYNPTR;
+}
+
static int int_ptr_type_to_size(enum bpf_arg_type type)
{
if (type == ARG_PTR_TO_INT)
@@ -5269,11 +5667,11 @@ static const struct bpf_reg_types func_ptr_types = { .types = { PTR_TO_FUNC } };
static const struct bpf_reg_types stack_ptr_types = { .types = { PTR_TO_STACK } };
static const struct bpf_reg_types const_str_ptr_types = { .types = { PTR_TO_MAP_VALUE } };
static const struct bpf_reg_types timer_types = { .types = { PTR_TO_MAP_VALUE } };
+static const struct bpf_reg_types kptr_types = { .types = { PTR_TO_MAP_VALUE } };
static const struct bpf_reg_types *compatible_reg_types[__BPF_ARG_TYPE_MAX] = {
[ARG_PTR_TO_MAP_KEY] = &map_key_value_types,
[ARG_PTR_TO_MAP_VALUE] = &map_key_value_types,
- [ARG_PTR_TO_UNINIT_MAP_VALUE] = &map_key_value_types,
[ARG_CONST_SIZE] = &scalar_types,
[ARG_CONST_SIZE_OR_ZERO] = &scalar_types,
[ARG_CONST_ALLOC_SIZE_OR_ZERO] = &scalar_types,
@@ -5287,7 +5685,6 @@ static const struct bpf_reg_types *compatible_reg_types[__BPF_ARG_TYPE_MAX] = {
[ARG_PTR_TO_BTF_ID] = &btf_ptr_types,
[ARG_PTR_TO_SPIN_LOCK] = &spin_lock_types,
[ARG_PTR_TO_MEM] = &mem_types,
- [ARG_PTR_TO_UNINIT_MEM] = &mem_types,
[ARG_PTR_TO_ALLOC_MEM] = &alloc_mem_types,
[ARG_PTR_TO_INT] = &int_ptr_types,
[ARG_PTR_TO_LONG] = &int_ptr_types,
@@ -5296,11 +5693,14 @@ static const struct bpf_reg_types *compatible_reg_types[__BPF_ARG_TYPE_MAX] = {
[ARG_PTR_TO_STACK] = &stack_ptr_types,
[ARG_PTR_TO_CONST_STR] = &const_str_ptr_types,
[ARG_PTR_TO_TIMER] = &timer_types,
+ [ARG_PTR_TO_KPTR] = &kptr_types,
+ [ARG_PTR_TO_DYNPTR] = &stack_ptr_types,
};
static int check_reg_type(struct bpf_verifier_env *env, u32 regno,
enum bpf_arg_type arg_type,
- const u32 *arg_btf_id)
+ const u32 *arg_btf_id,
+ struct bpf_call_arg_meta *meta)
{
struct bpf_reg_state *regs = cur_regs(env), *reg = &regs[regno];
enum bpf_reg_type expected, type = reg->type;
@@ -5345,6 +5745,13 @@ static int check_reg_type(struct bpf_verifier_env *env, u32 regno,
found:
if (reg->type == PTR_TO_BTF_ID) {
+ /* For bpf_sk_release, it needs to match against first member
+ * 'struct sock_common', hence make an exception for it. This
+ * allows bpf_sk_release to work for multiple socket types.
+ */
+ bool strict_type_match = arg_type_is_release(arg_type) &&
+ meta->func_id != BPF_FUNC_sk_release;
+
if (!arg_btf_id) {
if (!compatible->btf_id) {
verbose(env, "verifier internal error: missing arg compatible BTF ID\n");
@@ -5353,8 +5760,12 @@ found:
arg_btf_id = compatible->btf_id;
}
- if (!btf_struct_ids_match(&env->log, reg->btf, reg->btf_id, reg->off,
- btf_vmlinux, *arg_btf_id)) {
+ if (meta->func_id == BPF_FUNC_kptr_xchg) {
+ if (map_kptr_match_type(env, meta->kptr_off_desc, reg, regno))
+ return -EACCES;
+ } else if (!btf_struct_ids_match(&env->log, reg->btf, reg->btf_id, reg->off,
+ btf_vmlinux, *arg_btf_id,
+ strict_type_match)) {
verbose(env, "R%d is of type %s but %s is expected\n",
regno, kernel_type_name(reg->btf, reg->btf_id),
kernel_type_name(btf_vmlinux, *arg_btf_id));
@@ -5367,15 +5778,19 @@ found:
int check_func_arg_reg_off(struct bpf_verifier_env *env,
const struct bpf_reg_state *reg, int regno,
- enum bpf_arg_type arg_type,
- bool is_release_func)
+ enum bpf_arg_type arg_type)
{
- bool fixed_off_ok = false, release_reg;
enum bpf_reg_type type = reg->type;
+ bool fixed_off_ok = false;
switch ((u32)type) {
- case SCALAR_VALUE:
/* Pointer types where reg offset is explicitly allowed: */
+ case PTR_TO_STACK:
+ if (arg_type_is_dynptr(arg_type) && reg->off % BPF_REG_SIZE) {
+ verbose(env, "cannot pass in dynptr at an offset\n");
+ return -EINVAL;
+ }
+ fallthrough;
case PTR_TO_PACKET:
case PTR_TO_PACKET_META:
case PTR_TO_MAP_KEY:
@@ -5385,11 +5800,11 @@ int check_func_arg_reg_off(struct bpf_verifier_env *env,
case PTR_TO_MEM | MEM_ALLOC:
case PTR_TO_BUF:
case PTR_TO_BUF | MEM_RDONLY:
- case PTR_TO_STACK:
+ case SCALAR_VALUE:
/* Some of the argument types nevertheless require a
* zero register offset.
*/
- if (arg_type != ARG_PTR_TO_ALLOC_MEM)
+ if (base_type(arg_type) != ARG_PTR_TO_ALLOC_MEM)
return 0;
break;
/* All the rest must be rejected, except PTR_TO_BTF_ID which allows
@@ -5397,19 +5812,17 @@ int check_func_arg_reg_off(struct bpf_verifier_env *env,
*/
case PTR_TO_BTF_ID:
/* When referenced PTR_TO_BTF_ID is passed to release function,
- * it's fixed offset must be 0. We rely on the property that
- * only one referenced register can be passed to BPF helpers and
- * kfuncs. In the other cases, fixed offset can be non-zero.
+ * it's fixed offset must be 0. In the other cases, fixed offset
+ * can be non-zero.
*/
- release_reg = is_release_func && reg->ref_obj_id;
- if (release_reg && reg->off) {
+ if (arg_type_is_release(arg_type) && reg->off) {
verbose(env, "R%d must have zero offset when passed to release func\n",
regno);
return -EINVAL;
}
- /* For release_reg == true, fixed_off_ok must be false, but we
- * already checked and rejected reg->off != 0 above, so set to
- * true to allow fixed offset for all other cases.
+ /* For arg is release pointer, fixed_off_ok must be false, but
+ * we already checked and rejected reg->off != 0 above, so set
+ * to true to allow fixed offset for all other cases.
*/
fixed_off_ok = true;
break;
@@ -5419,6 +5832,14 @@ int check_func_arg_reg_off(struct bpf_verifier_env *env,
return __check_ptr_off_reg(env, reg, regno, fixed_off_ok);
}
+static u32 stack_slot_get_id(struct bpf_verifier_env *env, struct bpf_reg_state *reg)
+{
+ struct bpf_func_state *state = func(env, reg);
+ int spi = get_spi(reg->off);
+
+ return state->stack[spi].spilled_ptr.id;
+}
+
static int check_func_arg(struct bpf_verifier_env *env, u32 arg,
struct bpf_call_arg_meta *meta,
const struct bpf_func_proto *fn)
@@ -5451,8 +5872,7 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 arg,
return -EACCES;
}
- if (base_type(arg_type) == ARG_PTR_TO_MAP_VALUE ||
- base_type(arg_type) == ARG_PTR_TO_UNINIT_MAP_VALUE) {
+ if (base_type(arg_type) == ARG_PTR_TO_MAP_VALUE) {
err = resolve_map_arg_type(env, meta, &arg_type);
if (err)
return err;
@@ -5464,18 +5884,37 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 arg,
*/
goto skip_type_check;
- err = check_reg_type(env, regno, arg_type, fn->arg_btf_id[arg]);
+ err = check_reg_type(env, regno, arg_type, fn->arg_btf_id[arg], meta);
if (err)
return err;
- err = check_func_arg_reg_off(env, reg, regno, arg_type, is_release_function(meta->func_id));
+ err = check_func_arg_reg_off(env, reg, regno, arg_type);
if (err)
return err;
skip_type_check:
- /* check_func_arg_reg_off relies on only one referenced register being
- * allowed for BPF helpers.
- */
+ if (arg_type_is_release(arg_type)) {
+ if (arg_type_is_dynptr(arg_type)) {
+ struct bpf_func_state *state = func(env, reg);
+ int spi = get_spi(reg->off);
+
+ if (!is_spi_bounds_valid(state, spi, BPF_DYNPTR_NR_SLOTS) ||
+ !state->stack[spi].spilled_ptr.id) {
+ verbose(env, "arg %d is an unacquired reference\n", regno);
+ return -EINVAL;
+ }
+ } else if (!reg->ref_obj_id && !register_is_null(reg)) {
+ verbose(env, "R%d must be referenced when passed to release function\n",
+ regno);
+ return -EINVAL;
+ }
+ if (meta->release_regno) {
+ verbose(env, "verifier internal error: more than one release argument\n");
+ return -EFAULT;
+ }
+ meta->release_regno = regno;
+ }
+
if (reg->ref_obj_id) {
if (meta->ref_obj_id) {
verbose(env, "verifier internal error: more than one arg with ref_obj_id R%d %u %u\n",
@@ -5528,8 +5967,7 @@ skip_type_check:
err = check_helper_mem_access(env, regno,
meta->map_ptr->key_size, false,
NULL);
- } else if (base_type(arg_type) == ARG_PTR_TO_MAP_VALUE ||
- base_type(arg_type) == ARG_PTR_TO_UNINIT_MAP_VALUE) {
+ } else if (base_type(arg_type) == ARG_PTR_TO_MAP_VALUE) {
if (type_may_be_null(arg_type) && register_is_null(reg))
return 0;
@@ -5541,7 +5979,7 @@ skip_type_check:
verbose(env, "invalid map_ptr to access map->value\n");
return -EACCES;
}
- meta->raw_mode = (arg_type == ARG_PTR_TO_UNINIT_MAP_VALUE);
+ meta->raw_mode = arg_type & MEM_UNINIT;
err = check_helper_mem_access(env, regno,
meta->map_ptr->value_size, false,
meta);
@@ -5568,15 +6006,49 @@ skip_type_check:
return -EACCES;
} else if (arg_type == ARG_PTR_TO_FUNC) {
meta->subprogno = reg->subprogno;
- } else if (arg_type_is_mem_ptr(arg_type)) {
+ } else if (base_type(arg_type) == ARG_PTR_TO_MEM) {
/* The access to this pointer is only checked when we hit the
* next is_mem_size argument below.
*/
- meta->raw_mode = (arg_type == ARG_PTR_TO_UNINIT_MEM);
+ meta->raw_mode = arg_type & MEM_UNINIT;
} else if (arg_type_is_mem_size(arg_type)) {
bool zero_size_allowed = (arg_type == ARG_CONST_SIZE_OR_ZERO);
err = check_mem_size_reg(env, reg, regno, zero_size_allowed, meta);
+ } else if (arg_type_is_dynptr(arg_type)) {
+ if (arg_type & MEM_UNINIT) {
+ if (!is_dynptr_reg_valid_uninit(env, reg)) {
+ verbose(env, "Dynptr has to be an uninitialized dynptr\n");
+ return -EINVAL;
+ }
+
+ /* We only support one dynptr being uninitialized at the moment,
+ * which is sufficient for the helper functions we have right now.
+ */
+ if (meta->uninit_dynptr_regno) {
+ verbose(env, "verifier internal error: multiple uninitialized dynptr args\n");
+ return -EFAULT;
+ }
+
+ meta->uninit_dynptr_regno = regno;
+ } else if (!is_dynptr_reg_valid_init(env, reg, arg_type)) {
+ const char *err_extra = "";
+
+ switch (arg_type & DYNPTR_TYPE_FLAG_MASK) {
+ case DYNPTR_TYPE_LOCAL:
+ err_extra = "local ";
+ break;
+ case DYNPTR_TYPE_RINGBUF:
+ err_extra = "ringbuf ";
+ break;
+ default:
+ break;
+ }
+
+ verbose(env, "Expected an initialized %sdynptr as arg #%d\n",
+ err_extra, arg + 1);
+ return -EINVAL;
+ }
} else if (arg_type_is_alloc_size(arg_type)) {
if (!tnum_is_const(reg->var_off)) {
verbose(env, "R%d is not a known constant'\n",
@@ -5613,7 +6085,8 @@ skip_type_check:
}
err = check_map_access(env, regno, reg->off,
- map->value_size - reg->off, false);
+ map->value_size - reg->off, false,
+ ACCESS_HELPER);
if (err)
return err;
@@ -5629,6 +6102,9 @@ skip_type_check:
verbose(env, "string is not zero-terminated\n");
return -EINVAL;
}
+ } else if (arg_type == ARG_PTR_TO_KPTR) {
+ if (process_kptr_func(env, regno, meta))
+ return -EACCES;
}
return err;
@@ -5694,7 +6170,10 @@ static int check_map_func_compatibility(struct bpf_verifier_env *env,
case BPF_MAP_TYPE_RINGBUF:
if (func_id != BPF_FUNC_ringbuf_output &&
func_id != BPF_FUNC_ringbuf_reserve &&
- func_id != BPF_FUNC_ringbuf_query)
+ func_id != BPF_FUNC_ringbuf_query &&
+ func_id != BPF_FUNC_ringbuf_reserve_dynptr &&
+ func_id != BPF_FUNC_ringbuf_submit_dynptr &&
+ func_id != BPF_FUNC_ringbuf_discard_dynptr)
goto error;
break;
case BPF_MAP_TYPE_STACK_TRACE:
@@ -5810,6 +6289,9 @@ static int check_map_func_compatibility(struct bpf_verifier_env *env,
case BPF_FUNC_ringbuf_output:
case BPF_FUNC_ringbuf_reserve:
case BPF_FUNC_ringbuf_query:
+ case BPF_FUNC_ringbuf_reserve_dynptr:
+ case BPF_FUNC_ringbuf_submit_dynptr:
+ case BPF_FUNC_ringbuf_discard_dynptr:
if (map->map_type != BPF_MAP_TYPE_RINGBUF)
goto error;
break;
@@ -5864,6 +6346,12 @@ static int check_map_func_compatibility(struct bpf_verifier_env *env,
map->map_type != BPF_MAP_TYPE_BLOOM_FILTER)
goto error;
break;
+ case BPF_FUNC_map_lookup_percpu_elem:
+ if (map->map_type != BPF_MAP_TYPE_PERCPU_ARRAY &&
+ map->map_type != BPF_MAP_TYPE_PERCPU_HASH &&
+ map->map_type != BPF_MAP_TYPE_LRU_PERCPU_HASH)
+ goto error;
+ break;
case BPF_FUNC_sk_storage_get:
case BPF_FUNC_sk_storage_delete:
if (map->map_type != BPF_MAP_TYPE_SK_STORAGE)
@@ -5915,10 +6403,8 @@ static bool check_raw_mode_ok(const struct bpf_func_proto *fn)
static bool check_args_pair_invalid(enum bpf_arg_type arg_curr,
enum bpf_arg_type arg_next)
{
- return (arg_type_is_mem_ptr(arg_curr) &&
- !arg_type_is_mem_size(arg_next)) ||
- (!arg_type_is_mem_ptr(arg_curr) &&
- arg_type_is_mem_size(arg_next));
+ return (base_type(arg_curr) == ARG_PTR_TO_MEM) !=
+ arg_type_is_mem_size(arg_next);
}
static bool check_arg_pair_ok(const struct bpf_func_proto *fn)
@@ -5929,7 +6415,7 @@ static bool check_arg_pair_ok(const struct bpf_func_proto *fn)
* helper function specification.
*/
if (arg_type_is_mem_size(fn->arg1_type) ||
- arg_type_is_mem_ptr(fn->arg5_type) ||
+ base_type(fn->arg5_type) == ARG_PTR_TO_MEM ||
check_args_pair_invalid(fn->arg1_type, fn->arg2_type) ||
check_args_pair_invalid(fn->arg2_type, fn->arg3_type) ||
check_args_pair_invalid(fn->arg3_type, fn->arg4_type) ||
@@ -5971,17 +6457,18 @@ static bool check_btf_id_ok(const struct bpf_func_proto *fn)
int i;
for (i = 0; i < ARRAY_SIZE(fn->arg_type); i++) {
- if (fn->arg_type[i] == ARG_PTR_TO_BTF_ID && !fn->arg_btf_id[i])
+ if (base_type(fn->arg_type[i]) == ARG_PTR_TO_BTF_ID && !fn->arg_btf_id[i])
return false;
- if (fn->arg_type[i] != ARG_PTR_TO_BTF_ID && fn->arg_btf_id[i])
+ if (base_type(fn->arg_type[i]) != ARG_PTR_TO_BTF_ID && fn->arg_btf_id[i])
return false;
}
return true;
}
-static int check_func_proto(const struct bpf_func_proto *fn, int func_id)
+static int check_func_proto(const struct bpf_func_proto *fn, int func_id,
+ struct bpf_call_arg_meta *meta)
{
return check_raw_mode_ok(fn) &&
check_arg_pair_ok(fn) &&
@@ -6476,7 +6963,8 @@ record_func_map(struct bpf_verifier_env *env, struct bpf_call_arg_meta *meta,
func_id != BPF_FUNC_map_pop_elem &&
func_id != BPF_FUNC_map_peek_elem &&
func_id != BPF_FUNC_for_each_map_elem &&
- func_id != BPF_FUNC_redirect_map)
+ func_id != BPF_FUNC_redirect_map &&
+ func_id != BPF_FUNC_map_lookup_percpu_elem)
return 0;
if (map == NULL) {
@@ -6665,7 +7153,7 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn
memset(&meta, 0, sizeof(meta));
meta.pkt_access = fn->pkt_access;
- err = check_func_proto(fn, func_id);
+ err = check_func_proto(fn, func_id, &meta);
if (err) {
verbose(env, "kernel subsystem misconfigured func %s#%d\n",
func_id_name(func_id), func_id);
@@ -6698,8 +7186,35 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn
return err;
}
- if (is_release_function(func_id)) {
- err = release_reference(env, meta.ref_obj_id);
+ regs = cur_regs(env);
+
+ if (meta.uninit_dynptr_regno) {
+ /* we write BPF_DW bits (8 bytes) at a time */
+ for (i = 0; i < BPF_DYNPTR_SIZE; i += 8) {
+ err = check_mem_access(env, insn_idx, meta.uninit_dynptr_regno,
+ i, BPF_DW, BPF_WRITE, -1, false);
+ if (err)
+ return err;
+ }
+
+ err = mark_stack_slots_dynptr(env, &regs[meta.uninit_dynptr_regno],
+ fn->arg_type[meta.uninit_dynptr_regno - BPF_REG_1],
+ insn_idx);
+ if (err)
+ return err;
+ }
+
+ if (meta.release_regno) {
+ err = -EINVAL;
+ if (arg_type_is_dynptr(fn->arg_type[meta.release_regno - BPF_REG_1]))
+ err = unmark_stack_slots_dynptr(env, &regs[meta.release_regno]);
+ else if (meta.ref_obj_id)
+ err = release_reference(env, meta.ref_obj_id);
+ /* meta.ref_obj_id can only be 0 if register that is meant to be
+ * released is NULL, which must be > R0.
+ */
+ else if (register_is_null(&regs[meta.release_regno]))
+ err = 0;
if (err) {
verbose(env, "func %s#%d reference has not been acquired before\n",
func_id_name(func_id), func_id);
@@ -6707,8 +7222,6 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn
}
}
- regs = cur_regs(env);
-
switch (func_id) {
case BPF_FUNC_tail_call:
err = check_reference_leak(env);
@@ -6745,6 +7258,12 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn
err = __check_func_call(env, insn, insn_idx_p, meta.subprogno,
set_loop_callback_state);
break;
+ case BPF_FUNC_dynptr_from_mem:
+ if (regs[BPF_REG_1].type != PTR_TO_MAP_VALUE) {
+ verbose(env, "Unsupported reg type %s for bpf_dynptr_from_mem data\n",
+ reg_type_str(env, regs[BPF_REG_1].type));
+ return -EACCES;
+ }
}
if (err)
@@ -6832,21 +7351,25 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn
regs[BPF_REG_0].btf_id = meta.ret_btf_id;
}
} else if (base_type(ret_type) == RET_PTR_TO_BTF_ID) {
+ struct btf *ret_btf;
int ret_btf_id;
mark_reg_known_zero(env, regs, BPF_REG_0);
regs[BPF_REG_0].type = PTR_TO_BTF_ID | ret_flag;
- ret_btf_id = *fn->ret_btf_id;
+ if (func_id == BPF_FUNC_kptr_xchg) {
+ ret_btf = meta.kptr_off_desc->kptr.btf;
+ ret_btf_id = meta.kptr_off_desc->kptr.btf_id;
+ } else {
+ ret_btf = btf_vmlinux;
+ ret_btf_id = *fn->ret_btf_id;
+ }
if (ret_btf_id == 0) {
verbose(env, "invalid return type %u of func %s#%d\n",
base_type(ret_type), func_id_name(func_id),
func_id);
return -EINVAL;
}
- /* current BPF helper definitions are only coming from
- * built-in code with type IDs from vmlinux BTF
- */
- regs[BPF_REG_0].btf = btf_vmlinux;
+ regs[BPF_REG_0].btf = ret_btf;
regs[BPF_REG_0].btf_id = ret_btf_id;
} else {
verbose(env, "unknown return type %u of func %s#%d\n",
@@ -6869,6 +7392,21 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn
regs[BPF_REG_0].id = id;
/* For release_reference() */
regs[BPF_REG_0].ref_obj_id = id;
+ } else if (func_id == BPF_FUNC_dynptr_data) {
+ int dynptr_id = 0, i;
+
+ /* Find the id of the dynptr we're acquiring a reference to */
+ for (i = 0; i < MAX_BPF_FUNC_REG_ARGS; i++) {
+ if (arg_type_is_dynptr(fn->arg_type[i])) {
+ if (dynptr_id) {
+ verbose(env, "verifier internal error: multiple dynptr args in func\n");
+ return -EFAULT;
+ }
+ dynptr_id = stack_slot_get_id(env, &regs[BPF_REG_1 + i]);
+ }
+ }
+ /* For release_reference() */
+ regs[BPF_REG_0].ref_obj_id = dynptr_id;
}
do_refine_retval_range(regs, fn->ret_type, func_id, &meta);
@@ -6951,7 +7489,7 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
if (!insn->imm)
return 0;
- desc_btf = find_kfunc_desc_btf(env, insn->imm, insn->off);
+ desc_btf = find_kfunc_desc_btf(env, insn->off);
if (IS_ERR(desc_btf))
return PTR_ERR(desc_btf);
@@ -7433,7 +7971,7 @@ static int sanitize_check_bounds(struct bpf_verifier_env *env,
return -EACCES;
break;
case PTR_TO_MAP_VALUE:
- if (check_map_access(env, dst, dst_reg->off, 1, false)) {
+ if (check_map_access(env, dst, dst_reg->off, 1, false, ACCESS_HELPER)) {
verbose(env, "R%d pointer arithmetic of map value goes out of range, "
"prohibited for !root\n", dst);
return -EACCES;
@@ -12822,7 +13360,7 @@ static int convert_ctx_accesses(struct bpf_verifier_env *env)
if (!ctx_access)
continue;
- switch (env->insn_aux_data[i + delta].ptr_type) {
+ switch ((int)env->insn_aux_data[i + delta].ptr_type) {
case PTR_TO_CTX:
if (!ops->convert_ctx_access)
continue;
@@ -12839,6 +13377,7 @@ static int convert_ctx_accesses(struct bpf_verifier_env *env)
convert_ctx_access = bpf_xdp_sock_convert_ctx_access;
break;
case PTR_TO_BTF_ID:
+ case PTR_TO_BTF_ID | PTR_UNTRUSTED:
if (type == BPF_READ) {
insn->code = BPF_LDX | BPF_PROBE_MEM |
BPF_SIZE((insn)->code);
@@ -13524,7 +14063,8 @@ static int do_misc_fixups(struct bpf_verifier_env *env)
insn->imm == BPF_FUNC_map_pop_elem ||
insn->imm == BPF_FUNC_map_peek_elem ||
insn->imm == BPF_FUNC_redirect_map ||
- insn->imm == BPF_FUNC_for_each_map_elem)) {
+ insn->imm == BPF_FUNC_for_each_map_elem ||
+ insn->imm == BPF_FUNC_map_lookup_percpu_elem)) {
aux = &env->insn_aux_data[i + delta];
if (bpf_map_ptr_poisoned(aux))
goto patch_call_imm;
@@ -13573,6 +14113,8 @@ static int do_misc_fixups(struct bpf_verifier_env *env)
bpf_callback_t callback_fn,
void *callback_ctx,
u64 flags))NULL));
+ BUILD_BUG_ON(!__same_type(ops->map_lookup_percpu_elem,
+ (void *(*)(struct bpf_map *map, void *key, u32 cpu))NULL));
patch_map_ops_generic:
switch (insn->imm) {
@@ -13600,6 +14142,9 @@ patch_map_ops_generic:
case BPF_FUNC_for_each_map_elem:
insn->imm = BPF_CALL_IMM(ops->map_for_each_callback);
continue;
+ case BPF_FUNC_map_lookup_percpu_elem:
+ insn->imm = BPF_CALL_IMM(ops->map_lookup_percpu_elem);
+ continue;
}
goto patch_call_imm;
diff --git a/kernel/cgroup/cgroup-internal.h b/kernel/cgroup/cgroup-internal.h
index 6e36e854b512..5da09c74228d 100644
--- a/kernel/cgroup/cgroup-internal.h
+++ b/kernel/cgroup/cgroup-internal.h
@@ -12,7 +12,6 @@
#define TRACE_CGROUP_PATH_LEN 1024
extern spinlock_t trace_cgroup_path_lock;
extern char trace_cgroup_path[TRACE_CGROUP_PATH_LEN];
-extern bool cgroup_debug;
extern void __init enable_debug_cgroup(void);
/*
diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c
index adb820e98f24..1779ccddb734 100644
--- a/kernel/cgroup/cgroup.c
+++ b/kernel/cgroup/cgroup.c
@@ -96,7 +96,7 @@ EXPORT_SYMBOL_GPL(css_set_lock);
DEFINE_SPINLOCK(trace_cgroup_path_lock);
char trace_cgroup_path[TRACE_CGROUP_PATH_LEN];
-bool cgroup_debug __read_mostly;
+static bool cgroup_debug __read_mostly;
/*
* Protects cgroup_idr and css_idr so that IDs can be released without
@@ -5685,7 +5685,7 @@ static int cgroup_destroy_locked(struct cgroup *cgrp)
css_clear_dir(&cgrp->self);
kernfs_remove(cgrp->kn);
- if (parent && cgroup_is_threaded(cgrp))
+ if (cgroup_is_threaded(cgrp))
parent->nr_threaded_children--;
spin_lock_irq(&css_set_lock);
diff --git a/kernel/crash_core.c b/kernel/crash_core.c
index 4d57c03714f4..71122e01623c 100644
--- a/kernel/crash_core.c
+++ b/kernel/crash_core.c
@@ -222,9 +222,6 @@ next:
p = strstr(p+1, name);
}
- if (!ck_cmdline)
- return NULL;
-
return ck_cmdline;
}
diff --git a/kernel/debug/debug_core.c b/kernel/debug/debug_core.c
index da06a5553835..7beceb447211 100644
--- a/kernel/debug/debug_core.c
+++ b/kernel/debug/debug_core.c
@@ -53,6 +53,7 @@
#include <linux/vmacache.h>
#include <linux/rcupdate.h>
#include <linux/irq.h>
+#include <linux/security.h>
#include <asm/cacheflush.h>
#include <asm/byteorder.h>
@@ -752,6 +753,29 @@ cpu_master_loop:
continue;
kgdb_connected = 0;
} else {
+ /*
+ * This is a brutal way to interfere with the debugger
+ * and prevent gdb being used to poke at kernel memory.
+ * This could cause trouble if lockdown is applied when
+ * there is already an active gdb session. For now the
+ * answer is simply "don't do that". Typically lockdown
+ * *will* be applied before the debug core gets started
+ * so only developers using kgdb for fairly advanced
+ * early kernel debug can be biten by this. Hopefully
+ * they are sophisticated enough to take care of
+ * themselves, especially with help from the lockdown
+ * message printed on the console!
+ */
+ if (security_locked_down(LOCKDOWN_DBG_WRITE_KERNEL)) {
+ if (IS_ENABLED(CONFIG_KGDB_KDB)) {
+ /* Switch back to kdb if possible... */
+ dbg_kdb_mode = 1;
+ continue;
+ } else {
+ /* ... otherwise just bail */
+ break;
+ }
+ }
error = gdb_serial_stub(ks);
}
diff --git a/kernel/debug/kdb/kdb_io.c b/kernel/debug/kdb/kdb_io.c
index 6735ac36b718..67d3c48a1522 100644
--- a/kernel/debug/kdb/kdb_io.c
+++ b/kernel/debug/kdb/kdb_io.c
@@ -9,7 +9,6 @@
* Copyright (c) 2009 Wind River Systems, Inc. All Rights Reserved.
*/
-#include <linux/module.h>
#include <linux/types.h>
#include <linux/ctype.h>
#include <linux/kernel.h>
diff --git a/kernel/debug/kdb/kdb_keyboard.c b/kernel/debug/kdb/kdb_keyboard.c
index f877a0a0d7cf..f87c750d3eb3 100644
--- a/kernel/debug/kdb/kdb_keyboard.c
+++ b/kernel/debug/kdb/kdb_keyboard.c
@@ -11,7 +11,6 @@
#include <linux/kdb.h>
#include <linux/keyboard.h>
#include <linux/ctype.h>
-#include <linux/module.h>
#include <linux/io.h>
/* Keyboard Controller Registers on normal PCs. */
diff --git a/kernel/debug/kdb/kdb_main.c b/kernel/debug/kdb/kdb_main.c
index 0852a537dad4..438b868cbfa9 100644
--- a/kernel/debug/kdb/kdb_main.c
+++ b/kernel/debug/kdb/kdb_main.c
@@ -26,7 +26,6 @@
#include <linux/utsname.h>
#include <linux/vmalloc.h>
#include <linux/atomic.h>
-#include <linux/module.h>
#include <linux/moduleparam.h>
#include <linux/mm.h>
#include <linux/init.h>
@@ -45,6 +44,7 @@
#include <linux/proc_fs.h>
#include <linux/uaccess.h>
#include <linux/slab.h>
+#include <linux/security.h>
#include "kdb_private.h"
#undef MODULE_PARAM_PREFIX
@@ -166,10 +166,62 @@ struct task_struct *kdb_curr_task(int cpu)
}
/*
- * Check whether the flags of the current command and the permissions
- * of the kdb console has allow a command to be run.
+ * Update the permissions flags (kdb_cmd_enabled) to match the
+ * current lockdown state.
+ *
+ * Within this function the calls to security_locked_down() are "lazy". We
+ * avoid calling them if the current value of kdb_cmd_enabled already excludes
+ * flags that might be subject to lockdown. Additionally we deliberately check
+ * the lockdown flags independently (even though read lockdown implies write
+ * lockdown) since that results in both simpler code and clearer messages to
+ * the user on first-time debugger entry.
+ *
+ * The permission masks during a read+write lockdown permits the following
+ * flags: INSPECT, SIGNAL, REBOOT (and ALWAYS_SAFE).
+ *
+ * The INSPECT commands are not blocked during lockdown because they are
+ * not arbitrary memory reads. INSPECT covers the backtrace family (sometimes
+ * forcing them to have no arguments) and lsmod. These commands do expose
+ * some kernel state but do not allow the developer seated at the console to
+ * choose what state is reported. SIGNAL and REBOOT should not be controversial,
+ * given these are allowed for root during lockdown already.
+ */
+static void kdb_check_for_lockdown(void)
+{
+ const int write_flags = KDB_ENABLE_MEM_WRITE |
+ KDB_ENABLE_REG_WRITE |
+ KDB_ENABLE_FLOW_CTRL;
+ const int read_flags = KDB_ENABLE_MEM_READ |
+ KDB_ENABLE_REG_READ;
+
+ bool need_to_lockdown_write = false;
+ bool need_to_lockdown_read = false;
+
+ if (kdb_cmd_enabled & (KDB_ENABLE_ALL | write_flags))
+ need_to_lockdown_write =
+ security_locked_down(LOCKDOWN_DBG_WRITE_KERNEL);
+
+ if (kdb_cmd_enabled & (KDB_ENABLE_ALL | read_flags))
+ need_to_lockdown_read =
+ security_locked_down(LOCKDOWN_DBG_READ_KERNEL);
+
+ /* De-compose KDB_ENABLE_ALL if required */
+ if (need_to_lockdown_write || need_to_lockdown_read)
+ if (kdb_cmd_enabled & KDB_ENABLE_ALL)
+ kdb_cmd_enabled = KDB_ENABLE_MASK & ~KDB_ENABLE_ALL;
+
+ if (need_to_lockdown_write)
+ kdb_cmd_enabled &= ~write_flags;
+
+ if (need_to_lockdown_read)
+ kdb_cmd_enabled &= ~read_flags;
+}
+
+/*
+ * Check whether the flags of the current command, the permissions of the kdb
+ * console and the lockdown state allow a command to be run.
*/
-static inline bool kdb_check_flags(kdb_cmdflags_t flags, int permissions,
+static bool kdb_check_flags(kdb_cmdflags_t flags, int permissions,
bool no_args)
{
/* permissions comes from userspace so needs massaging slightly */
@@ -1180,6 +1232,9 @@ static int kdb_local(kdb_reason_t reason, int error, struct pt_regs *regs,
kdb_curr_task(raw_smp_processor_id());
KDB_DEBUG_STATE("kdb_local 1", reason);
+
+ kdb_check_for_lockdown();
+
kdb_go_count = 0;
if (reason == KDB_REASON_DEBUG) {
/* special case below */
@@ -2004,54 +2059,6 @@ static int kdb_ef(int argc, const char **argv)
return 0;
}
-#if defined(CONFIG_MODULES)
-/*
- * kdb_lsmod - This function implements the 'lsmod' command. Lists
- * currently loaded kernel modules.
- * Mostly taken from userland lsmod.
- */
-static int kdb_lsmod(int argc, const char **argv)
-{
- struct module *mod;
-
- if (argc != 0)
- return KDB_ARGCOUNT;
-
- kdb_printf("Module Size modstruct Used by\n");
- list_for_each_entry(mod, kdb_modules, list) {
- if (mod->state == MODULE_STATE_UNFORMED)
- continue;
-
- kdb_printf("%-20s%8u 0x%px ", mod->name,
- mod->core_layout.size, (void *)mod);
-#ifdef CONFIG_MODULE_UNLOAD
- kdb_printf("%4d ", module_refcount(mod));
-#endif
- if (mod->state == MODULE_STATE_GOING)
- kdb_printf(" (Unloading)");
- else if (mod->state == MODULE_STATE_COMING)
- kdb_printf(" (Loading)");
- else
- kdb_printf(" (Live)");
- kdb_printf(" 0x%px", mod->core_layout.base);
-
-#ifdef CONFIG_MODULE_UNLOAD
- {
- struct module_use *use;
- kdb_printf(" [ ");
- list_for_each_entry(use, &mod->source_list,
- source_list)
- kdb_printf("%s ", use->target->name);
- kdb_printf("]\n");
- }
-#endif
- }
-
- return 0;
-}
-
-#endif /* CONFIG_MODULES */
-
/*
* kdb_env - This function implements the 'env' command. Display the
* current environment variables.
diff --git a/kernel/debug/kdb/kdb_private.h b/kernel/debug/kdb/kdb_private.h
index 0d2f9feea0a4..1f8c519a5f81 100644
--- a/kernel/debug/kdb/kdb_private.h
+++ b/kernel/debug/kdb/kdb_private.h
@@ -226,10 +226,6 @@ extern void kdb_kbd_cleanup_state(void);
#define kdb_kbd_cleanup_state()
#endif /* ! CONFIG_KDB_KEYBOARD */
-#ifdef CONFIG_MODULES
-extern struct list_head *kdb_modules;
-#endif /* CONFIG_MODULES */
-
extern char kdb_prompt_str[];
#define KDB_WORD_SIZE ((int)sizeof(unsigned long))
diff --git a/kernel/debug/kdb/kdb_support.c b/kernel/debug/kdb/kdb_support.c
index 85cb51c4a17e..0a39497140bf 100644
--- a/kernel/debug/kdb/kdb_support.c
+++ b/kernel/debug/kdb/kdb_support.c
@@ -17,7 +17,6 @@
#include <linux/stddef.h>
#include <linux/vmalloc.h>
#include <linux/ptrace.h>
-#include <linux/module.h>
#include <linux/highmem.h>
#include <linux/hardirq.h>
#include <linux/delay.h>
diff --git a/kernel/delayacct.c b/kernel/delayacct.c
index c5e8cea9e05f..2c1e18f7c5cf 100644
--- a/kernel/delayacct.c
+++ b/kernel/delayacct.c
@@ -44,7 +44,7 @@ void delayacct_init(void)
}
#ifdef CONFIG_PROC_SYSCTL
-int sysctl_delayacct(struct ctl_table *table, int write, void *buffer,
+static int sysctl_delayacct(struct ctl_table *table, int write, void *buffer,
size_t *lenp, loff_t *ppos)
{
int state = delayacct_on;
@@ -63,6 +63,26 @@ int sysctl_delayacct(struct ctl_table *table, int write, void *buffer,
set_delayacct(state);
return err;
}
+
+static struct ctl_table kern_delayacct_table[] = {
+ {
+ .procname = "task_delayacct",
+ .data = NULL,
+ .maxlen = sizeof(unsigned int),
+ .mode = 0644,
+ .proc_handler = sysctl_delayacct,
+ .extra1 = SYSCTL_ZERO,
+ .extra2 = SYSCTL_ONE,
+ },
+ { }
+};
+
+static __init int kernel_delayacct_sysctls_init(void)
+{
+ register_sysctl_init("kernel", kern_delayacct_table);
+ return 0;
+}
+late_initcall(kernel_delayacct_sysctls_init);
#endif
void __delayacct_tsk_init(struct task_struct *tsk)
diff --git a/kernel/dma/debug.c b/kernel/dma/debug.c
index f8ff598596b8..ac740630c79c 100644
--- a/kernel/dma/debug.c
+++ b/kernel/dma/debug.c
@@ -448,7 +448,7 @@ void debug_dma_dump_mappings(struct device *dev)
* other hand, consumes a single dma_debug_entry, but inserts 'nents'
* entries into the tree.
*/
-static RADIX_TREE(dma_active_cacheline, GFP_NOWAIT);
+static RADIX_TREE(dma_active_cacheline, GFP_ATOMIC);
static DEFINE_SPINLOCK(radix_lock);
#define ACTIVE_CACHELINE_MAX_OVERLAP ((1 << RADIX_TREE_MAX_TAGS) - 1)
#define CACHELINE_PER_PAGE_SHIFT (PAGE_SHIFT - L1_CACHE_SHIFT)
diff --git a/kernel/dma/direct.c b/kernel/dma/direct.c
index 9743c6ccce1a..e978f36e6be8 100644
--- a/kernel/dma/direct.c
+++ b/kernel/dma/direct.c
@@ -79,7 +79,7 @@ static int dma_set_decrypted(struct device *dev, void *vaddr, size_t size)
{
if (!force_dma_unencrypted(dev))
return 0;
- return set_memory_decrypted((unsigned long)vaddr, 1 << get_order(size));
+ return set_memory_decrypted((unsigned long)vaddr, PFN_UP(size));
}
static int dma_set_encrypted(struct device *dev, void *vaddr, size_t size)
@@ -88,7 +88,7 @@ static int dma_set_encrypted(struct device *dev, void *vaddr, size_t size)
if (!force_dma_unencrypted(dev))
return 0;
- ret = set_memory_encrypted((unsigned long)vaddr, 1 << get_order(size));
+ ret = set_memory_encrypted((unsigned long)vaddr, PFN_UP(size));
if (ret)
pr_warn_ratelimited("leaking DMA memory that can't be re-encrypted\n");
return ret;
@@ -115,7 +115,7 @@ static struct page *dma_direct_alloc_swiotlb(struct device *dev, size_t size)
}
static struct page *__dma_direct_alloc_pages(struct device *dev, size_t size,
- gfp_t gfp)
+ gfp_t gfp, bool allow_highmem)
{
int node = dev_to_node(dev);
struct page *page = NULL;
@@ -129,9 +129,12 @@ static struct page *__dma_direct_alloc_pages(struct device *dev, size_t size,
gfp |= dma_direct_optimal_gfp_mask(dev, dev->coherent_dma_mask,
&phys_limit);
page = dma_alloc_contiguous(dev, size, gfp);
- if (page && !dma_coherent_ok(dev, page_to_phys(page), size)) {
- dma_free_contiguous(dev, page, size);
- page = NULL;
+ if (page) {
+ if (!dma_coherent_ok(dev, page_to_phys(page), size) ||
+ (!allow_highmem && PageHighMem(page))) {
+ dma_free_contiguous(dev, page, size);
+ page = NULL;
+ }
}
again:
if (!page)
@@ -189,7 +192,7 @@ static void *dma_direct_alloc_no_mapping(struct device *dev, size_t size,
{
struct page *page;
- page = __dma_direct_alloc_pages(dev, size, gfp & ~__GFP_ZERO);
+ page = __dma_direct_alloc_pages(dev, size, gfp & ~__GFP_ZERO, true);
if (!page)
return NULL;
@@ -262,7 +265,7 @@ void *dma_direct_alloc(struct device *dev, size_t size,
return dma_direct_alloc_from_pool(dev, size, dma_handle, gfp);
/* we always manually zero the memory once we are done */
- page = __dma_direct_alloc_pages(dev, size, gfp & ~__GFP_ZERO);
+ page = __dma_direct_alloc_pages(dev, size, gfp & ~__GFP_ZERO, true);
if (!page)
return NULL;
@@ -370,19 +373,9 @@ struct page *dma_direct_alloc_pages(struct device *dev, size_t size,
if (force_dma_unencrypted(dev) && dma_direct_use_pool(dev, gfp))
return dma_direct_alloc_from_pool(dev, size, dma_handle, gfp);
- page = __dma_direct_alloc_pages(dev, size, gfp);
+ page = __dma_direct_alloc_pages(dev, size, gfp, false);
if (!page)
return NULL;
- if (PageHighMem(page)) {
- /*
- * Depending on the cma= arguments and per-arch setup
- * dma_alloc_contiguous could return highmem pages.
- * Without remapping there is no way to return them here,
- * so log an error and fail.
- */
- dev_info(dev, "Rejecting highmem page from CMA.\n");
- goto out_free_pages;
- }
ret = page_address(page);
if (dma_set_decrypted(dev, ret, size))
diff --git a/kernel/dma/direct.h b/kernel/dma/direct.h
index 8a6cd53dbe8c..a78c0ba70645 100644
--- a/kernel/dma/direct.h
+++ b/kernel/dma/direct.h
@@ -91,7 +91,7 @@ static inline dma_addr_t dma_direct_map_page(struct device *dev,
return swiotlb_map(dev, phys, size, dir, attrs);
if (unlikely(!dma_capable(dev, dma_addr, size, true))) {
- if (swiotlb_force != SWIOTLB_NO_FORCE)
+ if (is_swiotlb_active(dev))
return swiotlb_map(dev, phys, size, dir, attrs);
dev_WARN_ONCE(dev, 1,
diff --git a/kernel/dma/swiotlb.c b/kernel/dma/swiotlb.c
index 73a41cec9e38..dfa1de89dc94 100644
--- a/kernel/dma/swiotlb.c
+++ b/kernel/dma/swiotlb.c
@@ -62,18 +62,13 @@
#define INVALID_PHYS_ADDR (~(phys_addr_t)0)
-enum swiotlb_force swiotlb_force;
+static bool swiotlb_force_bounce;
+static bool swiotlb_force_disable;
struct io_tlb_mem io_tlb_default_mem;
phys_addr_t swiotlb_unencrypted_base;
-/*
- * Max segment that we can provide which (if pages are contingous) will
- * not be bounced (unless SWIOTLB_FORCE is set).
- */
-static unsigned int max_segment;
-
static unsigned long default_nslabs = IO_TLB_DEFAULT_SIZE >> IO_TLB_SHIFT;
static int __init
@@ -87,9 +82,9 @@ setup_io_tlb_npages(char *str)
if (*str == ',')
++str;
if (!strcmp(str, "force"))
- swiotlb_force = SWIOTLB_FORCE;
+ swiotlb_force_bounce = true;
else if (!strcmp(str, "noforce"))
- swiotlb_force = SWIOTLB_NO_FORCE;
+ swiotlb_force_disable = true;
return 0;
}
@@ -97,18 +92,12 @@ early_param("swiotlb", setup_io_tlb_npages);
unsigned int swiotlb_max_segment(void)
{
- return io_tlb_default_mem.nslabs ? max_segment : 0;
+ if (!io_tlb_default_mem.nslabs)
+ return 0;
+ return rounddown(io_tlb_default_mem.nslabs << IO_TLB_SHIFT, PAGE_SIZE);
}
EXPORT_SYMBOL_GPL(swiotlb_max_segment);
-void swiotlb_set_max_segment(unsigned int val)
-{
- if (swiotlb_force == SWIOTLB_FORCE)
- max_segment = 1;
- else
- max_segment = rounddown(val, PAGE_SIZE);
-}
-
unsigned long swiotlb_size_or_default(void)
{
return default_nslabs << IO_TLB_SHIFT;
@@ -214,7 +203,7 @@ static void swiotlb_init_io_tlb_mem(struct io_tlb_mem *mem, phys_addr_t start,
mem->index = 0;
mem->late_alloc = late_alloc;
- if (swiotlb_force == SWIOTLB_FORCE)
+ if (swiotlb_force_bounce)
mem->force_bounce = true;
spin_lock_init(&mem->lock);
@@ -236,17 +225,49 @@ static void swiotlb_init_io_tlb_mem(struct io_tlb_mem *mem, phys_addr_t start,
return;
}
-int __init swiotlb_init_with_tbl(char *tlb, unsigned long nslabs, int verbose)
+/*
+ * Statically reserve bounce buffer space and initialize bounce buffer data
+ * structures for the software IO TLB used to implement the DMA API.
+ */
+void __init swiotlb_init_remap(bool addressing_limit, unsigned int flags,
+ int (*remap)(void *tlb, unsigned long nslabs))
{
struct io_tlb_mem *mem = &io_tlb_default_mem;
+ unsigned long nslabs = default_nslabs;
size_t alloc_size;
+ size_t bytes;
+ void *tlb;
- if (swiotlb_force == SWIOTLB_NO_FORCE)
- return 0;
+ if (!addressing_limit && !swiotlb_force_bounce)
+ return;
+ if (swiotlb_force_disable)
+ return;
- /* protect against double initialization */
- if (WARN_ON_ONCE(mem->nslabs))
- return -ENOMEM;
+ /*
+ * By default allocate the bounce buffer memory from low memory, but
+ * allow to pick a location everywhere for hypervisors with guest
+ * memory encryption.
+ */
+retry:
+ bytes = PAGE_ALIGN(nslabs << IO_TLB_SHIFT);
+ if (flags & SWIOTLB_ANY)
+ tlb = memblock_alloc(bytes, PAGE_SIZE);
+ else
+ tlb = memblock_alloc_low(bytes, PAGE_SIZE);
+ if (!tlb) {
+ pr_warn("%s: failed to allocate tlb structure\n", __func__);
+ return;
+ }
+
+ if (remap && remap(tlb, nslabs) < 0) {
+ memblock_free(tlb, PAGE_ALIGN(bytes));
+
+ nslabs = ALIGN(nslabs >> 1, IO_TLB_SEGSIZE);
+ if (nslabs < IO_TLB_MIN_SLABS)
+ panic("%s: Failed to remap %zu bytes\n",
+ __func__, bytes);
+ goto retry;
+ }
alloc_size = PAGE_ALIGN(array_size(sizeof(*mem->slots), nslabs));
mem->slots = memblock_alloc(alloc_size, PAGE_SIZE);
@@ -255,38 +276,15 @@ int __init swiotlb_init_with_tbl(char *tlb, unsigned long nslabs, int verbose)
__func__, alloc_size, PAGE_SIZE);
swiotlb_init_io_tlb_mem(mem, __pa(tlb), nslabs, false);
+ mem->force_bounce = flags & SWIOTLB_FORCE;
- if (verbose)
+ if (flags & SWIOTLB_VERBOSE)
swiotlb_print_info();
- swiotlb_set_max_segment(mem->nslabs << IO_TLB_SHIFT);
- return 0;
}
-/*
- * Statically reserve bounce buffer space and initialize bounce buffer data
- * structures for the software IO TLB used to implement the DMA API.
- */
-void __init
-swiotlb_init(int verbose)
+void __init swiotlb_init(bool addressing_limit, unsigned int flags)
{
- size_t bytes = PAGE_ALIGN(default_nslabs << IO_TLB_SHIFT);
- void *tlb;
-
- if (swiotlb_force == SWIOTLB_NO_FORCE)
- return;
-
- /* Get IO TLB memory from the low pages */
- tlb = memblock_alloc_low(bytes, PAGE_SIZE);
- if (!tlb)
- goto fail;
- if (swiotlb_init_with_tbl(tlb, default_nslabs, verbose))
- goto fail_free_mem;
- return;
-
-fail_free_mem:
- memblock_free(tlb, bytes);
-fail:
- pr_warn("Cannot allocate buffer");
+ return swiotlb_init_remap(addressing_limit, flags, NULL);
}
/*
@@ -294,72 +292,65 @@ fail:
* initialize the swiotlb later using the slab allocator if needed.
* This should be just like above, but with some error catching.
*/
-int
-swiotlb_late_init_with_default_size(size_t default_size)
+int swiotlb_init_late(size_t size, gfp_t gfp_mask,
+ int (*remap)(void *tlb, unsigned long nslabs))
{
- unsigned long nslabs =
- ALIGN(default_size >> IO_TLB_SHIFT, IO_TLB_SEGSIZE);
- unsigned long bytes;
+ struct io_tlb_mem *mem = &io_tlb_default_mem;
+ unsigned long nslabs = ALIGN(size >> IO_TLB_SHIFT, IO_TLB_SEGSIZE);
unsigned char *vstart = NULL;
unsigned int order;
+ bool retried = false;
int rc = 0;
- if (swiotlb_force == SWIOTLB_NO_FORCE)
+ if (swiotlb_force_disable)
return 0;
- /*
- * Get IO TLB memory from the low pages
- */
+retry:
order = get_order(nslabs << IO_TLB_SHIFT);
nslabs = SLABS_PER_PAGE << order;
- bytes = nslabs << IO_TLB_SHIFT;
while ((SLABS_PER_PAGE << order) > IO_TLB_MIN_SLABS) {
- vstart = (void *)__get_free_pages(GFP_DMA | __GFP_NOWARN,
+ vstart = (void *)__get_free_pages(gfp_mask | __GFP_NOWARN,
order);
if (vstart)
break;
order--;
+ nslabs = SLABS_PER_PAGE << order;
+ retried = true;
}
if (!vstart)
return -ENOMEM;
- if (order != get_order(bytes)) {
- pr_warn("only able to allocate %ld MB\n",
- (PAGE_SIZE << order) >> 20);
- nslabs = SLABS_PER_PAGE << order;
- }
- rc = swiotlb_late_init_with_tbl(vstart, nslabs);
- if (rc)
+ if (remap)
+ rc = remap(vstart, nslabs);
+ if (rc) {
free_pages((unsigned long)vstart, order);
- return rc;
-}
-
-int
-swiotlb_late_init_with_tbl(char *tlb, unsigned long nslabs)
-{
- struct io_tlb_mem *mem = &io_tlb_default_mem;
- unsigned long bytes = nslabs << IO_TLB_SHIFT;
-
- if (swiotlb_force == SWIOTLB_NO_FORCE)
- return 0;
+ nslabs = ALIGN(nslabs >> 1, IO_TLB_SEGSIZE);
+ if (nslabs < IO_TLB_MIN_SLABS)
+ return rc;
+ retried = true;
+ goto retry;
+ }
- /* protect against double initialization */
- if (WARN_ON_ONCE(mem->nslabs))
- return -ENOMEM;
+ if (retried) {
+ pr_warn("only able to allocate %ld MB\n",
+ (PAGE_SIZE << order) >> 20);
+ }
mem->slots = (void *)__get_free_pages(GFP_KERNEL | __GFP_ZERO,
get_order(array_size(sizeof(*mem->slots), nslabs)));
- if (!mem->slots)
+ if (!mem->slots) {
+ free_pages((unsigned long)vstart, order);
return -ENOMEM;
+ }
- set_memory_decrypted((unsigned long)tlb, bytes >> PAGE_SHIFT);
- swiotlb_init_io_tlb_mem(mem, virt_to_phys(tlb), nslabs, true);
+ set_memory_decrypted((unsigned long)vstart,
+ (nslabs << IO_TLB_SHIFT) >> PAGE_SHIFT);
+ swiotlb_init_io_tlb_mem(mem, virt_to_phys(vstart), nslabs, true);
swiotlb_print_info();
- swiotlb_set_max_segment(mem->nslabs << IO_TLB_SHIFT);
return 0;
}
@@ -369,6 +360,9 @@ void __init swiotlb_exit(void)
unsigned long tbl_vaddr;
size_t tbl_size, slots_size;
+ if (swiotlb_force_bounce)
+ return;
+
if (!mem->nslabs)
return;
@@ -717,8 +711,7 @@ dma_addr_t swiotlb_map(struct device *dev, phys_addr_t paddr, size_t size,
phys_addr_t swiotlb_addr;
dma_addr_t dma_addr;
- trace_swiotlb_bounced(dev, phys_to_dma(dev, paddr), size,
- swiotlb_force);
+ trace_swiotlb_bounced(dev, phys_to_dma(dev, paddr), size);
swiotlb_addr = swiotlb_tbl_map_single(dev, paddr, size, size, 0, dir,
attrs);
@@ -743,7 +736,18 @@ dma_addr_t swiotlb_map(struct device *dev, phys_addr_t paddr, size_t size,
size_t swiotlb_max_mapping_size(struct device *dev)
{
- return ((size_t)IO_TLB_SIZE) * IO_TLB_SEGSIZE;
+ int min_align_mask = dma_get_min_align_mask(dev);
+ int min_align = 0;
+
+ /*
+ * swiotlb_find_slots() skips slots according to
+ * min align mask. This affects max mapping size.
+ * Take it into acount here.
+ */
+ if (min_align_mask)
+ min_align = roundup(min_align_mask, IO_TLB_SIZE);
+
+ return ((size_t)IO_TLB_SIZE) * IO_TLB_SEGSIZE - min_align;
}
bool is_swiotlb_active(struct device *dev)
diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c
index 6418083901d4..2eaa327f8158 100644
--- a/kernel/events/uprobes.c
+++ b/kernel/events/uprobes.c
@@ -180,7 +180,7 @@ static int __replace_page(struct vm_area_struct *vma, unsigned long addr,
if (new_page) {
get_page(new_page);
- page_add_new_anon_rmap(new_page, vma, addr, false);
+ page_add_new_anon_rmap(new_page, vma, addr);
lru_cache_add_inactive_or_unevictable(new_page, vma);
} else
/* no new page, just dec_mm_counter for old_page */
@@ -787,10 +787,10 @@ static int __copy_insn(struct address_space *mapping, struct file *filp,
struct page *page;
/*
* Ensure that the page that has the original instruction is populated
- * and in page-cache. If ->readpage == NULL it must be shmem_mapping(),
+ * and in page-cache. If ->read_folio == NULL it must be shmem_mapping(),
* see uprobe_register().
*/
- if (mapping->a_ops->readpage)
+ if (mapping->a_ops->read_folio)
page = read_mapping_page(mapping, offset >> PAGE_SHIFT, filp);
else
page = shmem_read_mapping_page(mapping, offset >> PAGE_SHIFT);
@@ -1143,7 +1143,8 @@ static int __uprobe_register(struct inode *inode, loff_t offset,
return -EINVAL;
/* copy_insn() uses read_mapping_page() or shmem_read_mapping_page() */
- if (!inode->i_mapping->a_ops->readpage && !shmem_mapping(inode->i_mapping))
+ if (!inode->i_mapping->a_ops->read_folio &&
+ !shmem_mapping(inode->i_mapping))
return -EIO;
/* Racy, just to catch the obvious mistakes */
if (offset > i_size_read(inode))
diff --git a/kernel/fork.c b/kernel/fork.c
index 254ab63c1106..9d44f2d46c69 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -612,9 +612,7 @@ static __latent_entropy int dup_mmap(struct mm_struct *mm,
retval = ksm_fork(mm, oldmm);
if (retval)
goto out;
- retval = khugepaged_fork(mm, oldmm);
- if (retval)
- goto out;
+ khugepaged_fork(mm, oldmm);
prev = NULL;
for (mpnt = oldmm->mmap; mpnt; mpnt = mpnt->vm_next) {
@@ -1984,7 +1982,7 @@ static __latent_entropy struct task_struct *copy_process(
struct task_struct *p;
struct multiprocess_signals delayed;
struct file *pidfile = NULL;
- u64 clone_flags = args->flags;
+ const u64 clone_flags = args->flags;
struct nsproxy *nsp = current->nsproxy;
/*
@@ -2073,6 +2071,9 @@ static __latent_entropy struct task_struct *copy_process(
p = dup_task_struct(current, node);
if (!p)
goto fork_out;
+ p->flags &= ~PF_KTHREAD;
+ if (args->kthread)
+ p->flags |= PF_KTHREAD;
if (args->io_thread) {
/*
* Mark us an IO worker, and block any signal that isn't
@@ -2162,7 +2163,7 @@ static __latent_entropy struct task_struct *copy_process(
p->io_context = NULL;
audit_set_context(p, NULL);
cgroup_fork(p);
- if (p->flags & PF_KTHREAD) {
+ if (args->kthread) {
if (!set_kthread_struct(p))
goto bad_fork_cleanup_delayacct;
}
@@ -2245,7 +2246,7 @@ static __latent_entropy struct task_struct *copy_process(
retval = copy_io(clone_flags, p);
if (retval)
goto bad_fork_cleanup_namespaces;
- retval = copy_thread(clone_flags, args->stack, args->stack_size, p, args->tls);
+ retval = copy_thread(p, args);
if (retval)
goto bad_fork_cleanup_io;
@@ -2549,11 +2550,21 @@ static inline void init_idle_pids(struct task_struct *idle)
}
}
+static int idle_dummy(void *dummy)
+{
+ /* This function is never called */
+ return 0;
+}
+
struct task_struct * __init fork_idle(int cpu)
{
struct task_struct *task;
struct kernel_clone_args args = {
- .flags = CLONE_VM,
+ .flags = CLONE_VM,
+ .fn = &idle_dummy,
+ .fn_arg = NULL,
+ .kthread = 1,
+ .idle = 1,
};
task = copy_process(&init_struct_pid, 0, cpu_to_node(cpu), &args);
@@ -2584,8 +2595,8 @@ struct task_struct *create_io_thread(int (*fn)(void *), void *arg, int node)
.flags = ((lower_32_bits(flags) | CLONE_VM |
CLONE_UNTRACED) & ~CSIGNAL),
.exit_signal = (lower_32_bits(flags) & CSIGNAL),
- .stack = (unsigned long)fn,
- .stack_size = (unsigned long)arg,
+ .fn = fn,
+ .fn_arg = arg,
.io_thread = 1,
};
@@ -2689,8 +2700,25 @@ pid_t kernel_thread(int (*fn)(void *), void *arg, unsigned long flags)
.flags = ((lower_32_bits(flags) | CLONE_VM |
CLONE_UNTRACED) & ~CSIGNAL),
.exit_signal = (lower_32_bits(flags) & CSIGNAL),
- .stack = (unsigned long)fn,
- .stack_size = (unsigned long)arg,
+ .fn = fn,
+ .fn_arg = arg,
+ .kthread = 1,
+ };
+
+ return kernel_clone(&args);
+}
+
+/*
+ * Create a user mode thread.
+ */
+pid_t user_mode_thread(int (*fn)(void *), void *arg, unsigned long flags)
+{
+ struct kernel_clone_args args = {
+ .flags = ((lower_32_bits(flags) | CLONE_VM |
+ CLONE_UNTRACED) & ~CSIGNAL),
+ .exit_signal = (lower_32_bits(flags) & CSIGNAL),
+ .fn = fn,
+ .fn_arg = arg,
};
return kernel_clone(&args);
diff --git a/kernel/futex/futex.h b/kernel/futex/futex.h
index c264cbeab71c..b5379c0e6d6d 100644
--- a/kernel/futex/futex.h
+++ b/kernel/futex/futex.h
@@ -3,6 +3,7 @@
#define _FUTEX_H
#include <linux/futex.h>
+#include <linux/rtmutex.h>
#include <linux/sched/wake_q.h>
#ifdef CONFIG_PREEMPT_RT
diff --git a/kernel/gen_kheaders.sh b/kernel/gen_kheaders.sh
index 1966a749e0d9..0c78e64f747d 100755
--- a/kernel/gen_kheaders.sh
+++ b/kernel/gen_kheaders.sh
@@ -74,7 +74,7 @@ fi
# of tree builds having stale headers in srctree. Just silence CPIO for now.
for f in $dir_list;
do find "$f" -name "*.h";
-done | cpio --quiet -pd $cpio_dir >/dev/null 2>&1
+done | cpio --quiet -pdu $cpio_dir >/dev/null 2>&1
# Remove comments except SDPX lines
find $cpio_dir -type f -print0 |
diff --git a/kernel/hung_task.c b/kernel/hung_task.c
index 52501e5f7655..80bfea5dd5c4 100644
--- a/kernel/hung_task.c
+++ b/kernel/hung_task.c
@@ -73,7 +73,7 @@ static unsigned int __read_mostly sysctl_hung_task_all_cpu_backtrace;
* hung task is detected:
*/
unsigned int __read_mostly sysctl_hung_task_panic =
- CONFIG_BOOTPARAM_HUNG_TASK_PANIC_VALUE;
+ IS_ENABLED(CONFIG_BOOTPARAM_HUNG_TASK_PANIC);
static int
hung_task_panic(struct notifier_block *this, unsigned long event, void *ptr)
@@ -127,6 +127,8 @@ static void check_hung_task(struct task_struct *t, unsigned long timeout)
* complain:
*/
if (sysctl_hung_task_warnings) {
+ printk_prefer_direct_enter();
+
if (sysctl_hung_task_warnings > 0)
sysctl_hung_task_warnings--;
pr_err("INFO: task %s:%d blocked for more than %ld seconds.\n",
@@ -142,6 +144,8 @@ static void check_hung_task(struct task_struct *t, unsigned long timeout)
if (sysctl_hung_task_all_cpu_backtrace)
hung_task_show_all_bt = true;
+
+ printk_prefer_direct_exit();
}
touch_nmi_watchdog();
@@ -204,12 +208,17 @@ static void check_hung_uninterruptible_tasks(unsigned long timeout)
}
unlock:
rcu_read_unlock();
- if (hung_task_show_lock)
+ if (hung_task_show_lock) {
+ printk_prefer_direct_enter();
debug_show_all_locks();
+ printk_prefer_direct_exit();
+ }
if (hung_task_show_all_bt) {
hung_task_show_all_bt = false;
+ printk_prefer_direct_enter();
trigger_all_cpu_backtrace();
+ printk_prefer_direct_exit();
}
if (hung_task_call_panic)
diff --git a/kernel/kallsyms.c b/kernel/kallsyms.c
index 79f2eb617a62..fbdf8d3279ac 100644
--- a/kernel/kallsyms.c
+++ b/kernel/kallsyms.c
@@ -29,6 +29,7 @@
#include <linux/compiler.h>
#include <linux/module.h>
#include <linux/kernel.h>
+#include <linux/bsearch.h>
/*
* These will be re-linked against their real values
@@ -228,7 +229,6 @@ unsigned long kallsyms_lookup_name(const char *name)
return module_kallsyms_lookup_name(name);
}
-#ifdef CONFIG_LIVEPATCH
/*
* Iterate over all symbols in vmlinux. For symbols from modules use
* module_kallsyms_on_each_symbol instead.
@@ -251,7 +251,6 @@ int kallsyms_on_each_symbol(int (*fn)(void *, const char *, struct module *,
}
return 0;
}
-#endif /* CONFIG_LIVEPATCH */
static unsigned long get_symbol_pos(unsigned long addr,
unsigned long *symbolsize,
diff --git a/kernel/kcov.c b/kernel/kcov.c
index b3732b210593..e19c84b02452 100644
--- a/kernel/kcov.c
+++ b/kernel/kcov.c
@@ -204,8 +204,16 @@ void notrace __sanitizer_cov_trace_pc(void)
/* The first 64-bit word is the number of subsequent PCs. */
pos = READ_ONCE(area[0]) + 1;
if (likely(pos < t->kcov_size)) {
- area[pos] = ip;
+ /* Previously we write pc before updating pos. However, some
+ * early interrupt code could bypass check_kcov_mode() check
+ * and invoke __sanitizer_cov_trace_pc(). If such interrupt is
+ * raised between writing pc and updating pos, the pc could be
+ * overitten by the recursive __sanitizer_cov_trace_pc().
+ * Update pos before writing pc to avoid such interleaving.
+ */
WRITE_ONCE(area[0], pos);
+ barrier();
+ area[pos] = ip;
}
}
EXPORT_SYMBOL(__sanitizer_cov_trace_pc);
@@ -236,11 +244,13 @@ static void notrace write_comp_data(u64 type, u64 arg1, u64 arg2, u64 ip)
start_index = 1 + count * KCOV_WORDS_PER_CMP;
end_pos = (start_index + KCOV_WORDS_PER_CMP) * sizeof(u64);
if (likely(end_pos <= max_pos)) {
+ /* See comment in __sanitizer_cov_trace_pc(). */
+ WRITE_ONCE(area[0], count + 1);
+ barrier();
area[start_index] = type;
area[start_index + 1] = arg1;
area[start_index + 2] = arg2;
area[start_index + 3] = ip;
- WRITE_ONCE(area[0], count + 1);
}
}
diff --git a/kernel/kcsan/kcsan_test.c b/kernel/kcsan/kcsan_test.c
index 767dfacd6ed3..dcec1b743c69 100644
--- a/kernel/kcsan/kcsan_test.c
+++ b/kernel/kcsan/kcsan_test.c
@@ -1566,14 +1566,6 @@ static void test_exit(struct kunit *test)
torture_cleanup_end();
}
-static struct kunit_suite kcsan_test_suite = {
- .name = "kcsan",
- .test_cases = kcsan_test_cases,
- .init = test_init,
- .exit = test_exit,
-};
-static struct kunit_suite *kcsan_test_suites[] = { &kcsan_test_suite, NULL };
-
__no_kcsan
static void register_tracepoints(struct tracepoint *tp, void *ignore)
{
@@ -1589,11 +1581,7 @@ static void unregister_tracepoints(struct tracepoint *tp, void *ignore)
tracepoint_probe_unregister(tp, probe_console, NULL);
}
-/*
- * We only want to do tracepoints setup and teardown once, therefore we have to
- * customize the init and exit functions and cannot rely on kunit_test_suite().
- */
-static int __init kcsan_test_init(void)
+static int kcsan_suite_init(struct kunit_suite *suite)
{
/*
* Because we want to be able to build the test as a module, we need to
@@ -1601,18 +1589,25 @@ static int __init kcsan_test_init(void)
* won't work here.
*/
for_each_kernel_tracepoint(register_tracepoints, NULL);
- return __kunit_test_suites_init(kcsan_test_suites);
+ return 0;
}
-static void kcsan_test_exit(void)
+static void kcsan_suite_exit(struct kunit_suite *suite)
{
- __kunit_test_suites_exit(kcsan_test_suites);
for_each_kernel_tracepoint(unregister_tracepoints, NULL);
tracepoint_synchronize_unregister();
}
-late_initcall_sync(kcsan_test_init);
-module_exit(kcsan_test_exit);
+static struct kunit_suite kcsan_test_suite = {
+ .name = "kcsan",
+ .test_cases = kcsan_test_cases,
+ .init = test_init,
+ .exit = test_exit,
+ .suite_init = kcsan_suite_init,
+ .suite_exit = kcsan_suite_exit,
+};
+
+kunit_test_suites(&kcsan_test_suite);
MODULE_LICENSE("GPL v2");
MODULE_AUTHOR("Marco Elver <elver@google.com>");
diff --git a/kernel/kexec_core.c b/kernel/kexec_core.c
index be4b54c2c615..4d34c78334ce 100644
--- a/kernel/kexec_core.c
+++ b/kernel/kexec_core.c
@@ -768,7 +768,6 @@ static struct page *kimage_alloc_page(struct kimage *image,
kimage_free_pages(old_page);
continue;
}
- addr = old_addr;
page = old_page;
break;
}
@@ -788,7 +787,6 @@ static int kimage_load_normal_segment(struct kimage *image,
unsigned char __user *buf = NULL;
unsigned char *kbuf = NULL;
- result = 0;
if (image->file_mode)
kbuf = segment->kbuf;
else
@@ -936,6 +934,28 @@ int kimage_load_segment(struct kimage *image,
struct kimage *kexec_image;
struct kimage *kexec_crash_image;
int kexec_load_disabled;
+#ifdef CONFIG_SYSCTL
+static struct ctl_table kexec_core_sysctls[] = {
+ {
+ .procname = "kexec_load_disabled",
+ .data = &kexec_load_disabled,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ /* only handle a transition from default "0" to "1" */
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = SYSCTL_ONE,
+ .extra2 = SYSCTL_ONE,
+ },
+ { }
+};
+
+static int __init kexec_core_sysctl_init(void)
+{
+ register_sysctl_init("kernel", kexec_core_sysctls);
+ return 0;
+}
+late_initcall(kexec_core_sysctl_init);
+#endif
/*
* No panic_cpu check version of crash_kexec(). This function is called
diff --git a/kernel/kexec_file.c b/kernel/kexec_file.c
index 8347fc158d2b..145321a5e798 100644
--- a/kernel/kexec_file.c
+++ b/kernel/kexec_file.c
@@ -109,40 +109,6 @@ int __weak arch_kexec_kernel_verify_sig(struct kimage *image, void *buf,
#endif
/*
- * arch_kexec_apply_relocations_add - apply relocations of type RELA
- * @pi: Purgatory to be relocated.
- * @section: Section relocations applying to.
- * @relsec: Section containing RELAs.
- * @symtab: Corresponding symtab.
- *
- * Return: 0 on success, negative errno on error.
- */
-int __weak
-arch_kexec_apply_relocations_add(struct purgatory_info *pi, Elf_Shdr *section,
- const Elf_Shdr *relsec, const Elf_Shdr *symtab)
-{
- pr_err("RELA relocation unsupported.\n");
- return -ENOEXEC;
-}
-
-/*
- * arch_kexec_apply_relocations - apply relocations of type REL
- * @pi: Purgatory to be relocated.
- * @section: Section relocations applying to.
- * @relsec: Section containing RELs.
- * @symtab: Corresponding symtab.
- *
- * Return: 0 on success, negative errno on error.
- */
-int __weak
-arch_kexec_apply_relocations(struct purgatory_info *pi, Elf_Shdr *section,
- const Elf_Shdr *relsec, const Elf_Shdr *symtab)
-{
- pr_err("REL relocation unsupported.\n");
- return -ENOEXEC;
-}
-
-/*
* Free up memory used by kernel, initrd, and command line. This is temporary
* memory allocation which is not needed any more after these buffers have
* been loaded into separate segments and have been copied elsewhere.
@@ -1260,7 +1226,7 @@ int crash_exclude_mem_range(struct crash_mem *mem,
return 0;
}
-int crash_prepare_elf64_headers(struct crash_mem *mem, int kernel_map,
+int crash_prepare_elf64_headers(struct crash_mem *mem, int need_kernel_map,
void **addr, unsigned long *sz)
{
Elf64_Ehdr *ehdr;
@@ -1324,7 +1290,7 @@ int crash_prepare_elf64_headers(struct crash_mem *mem, int kernel_map,
phdr++;
/* Prepare PT_LOAD type program header for kernel text region */
- if (kernel_map) {
+ if (need_kernel_map) {
phdr->p_type = PT_LOAD;
phdr->p_flags = PF_R|PF_W|PF_X;
phdr->p_vaddr = (unsigned long) _text;
diff --git a/kernel/kprobes.c b/kernel/kprobes.c
index dd58c0be9ce2..f214f8c088ed 100644
--- a/kernel/kprobes.c
+++ b/kernel/kprobes.c
@@ -1257,79 +1257,6 @@ void kprobe_busy_end(void)
preempt_enable();
}
-#if !defined(CONFIG_KRETPROBE_ON_RETHOOK)
-static void free_rp_inst_rcu(struct rcu_head *head)
-{
- struct kretprobe_instance *ri = container_of(head, struct kretprobe_instance, rcu);
-
- if (refcount_dec_and_test(&ri->rph->ref))
- kfree(ri->rph);
- kfree(ri);
-}
-NOKPROBE_SYMBOL(free_rp_inst_rcu);
-
-static void recycle_rp_inst(struct kretprobe_instance *ri)
-{
- struct kretprobe *rp = get_kretprobe(ri);
-
- if (likely(rp))
- freelist_add(&ri->freelist, &rp->freelist);
- else
- call_rcu(&ri->rcu, free_rp_inst_rcu);
-}
-NOKPROBE_SYMBOL(recycle_rp_inst);
-
-/*
- * This function is called from delayed_put_task_struct() when a task is
- * dead and cleaned up to recycle any kretprobe instances associated with
- * this task. These left over instances represent probed functions that
- * have been called but will never return.
- */
-void kprobe_flush_task(struct task_struct *tk)
-{
- struct kretprobe_instance *ri;
- struct llist_node *node;
-
- /* Early boot, not yet initialized. */
- if (unlikely(!kprobes_initialized))
- return;
-
- kprobe_busy_begin();
-
- node = __llist_del_all(&tk->kretprobe_instances);
- while (node) {
- ri = container_of(node, struct kretprobe_instance, llist);
- node = node->next;
-
- recycle_rp_inst(ri);
- }
-
- kprobe_busy_end();
-}
-NOKPROBE_SYMBOL(kprobe_flush_task);
-
-static inline void free_rp_inst(struct kretprobe *rp)
-{
- struct kretprobe_instance *ri;
- struct freelist_node *node;
- int count = 0;
-
- node = rp->freelist.head;
- while (node) {
- ri = container_of(node, struct kretprobe_instance, freelist);
- node = node->next;
-
- kfree(ri);
- count++;
- }
-
- if (refcount_sub_and_test(count, &rp->rph->ref)) {
- kfree(rp->rph);
- rp->rph = NULL;
- }
-}
-#endif /* !CONFIG_KRETPROBE_ON_RETHOOK */
-
/* Add the new probe to 'ap->list'. */
static int add_new_kprobe(struct kprobe *ap, struct kprobe *p)
{
@@ -1928,6 +1855,77 @@ static struct notifier_block kprobe_exceptions_nb = {
#ifdef CONFIG_KRETPROBES
#if !defined(CONFIG_KRETPROBE_ON_RETHOOK)
+static void free_rp_inst_rcu(struct rcu_head *head)
+{
+ struct kretprobe_instance *ri = container_of(head, struct kretprobe_instance, rcu);
+
+ if (refcount_dec_and_test(&ri->rph->ref))
+ kfree(ri->rph);
+ kfree(ri);
+}
+NOKPROBE_SYMBOL(free_rp_inst_rcu);
+
+static void recycle_rp_inst(struct kretprobe_instance *ri)
+{
+ struct kretprobe *rp = get_kretprobe(ri);
+
+ if (likely(rp))
+ freelist_add(&ri->freelist, &rp->freelist);
+ else
+ call_rcu(&ri->rcu, free_rp_inst_rcu);
+}
+NOKPROBE_SYMBOL(recycle_rp_inst);
+
+/*
+ * This function is called from delayed_put_task_struct() when a task is
+ * dead and cleaned up to recycle any kretprobe instances associated with
+ * this task. These left over instances represent probed functions that
+ * have been called but will never return.
+ */
+void kprobe_flush_task(struct task_struct *tk)
+{
+ struct kretprobe_instance *ri;
+ struct llist_node *node;
+
+ /* Early boot, not yet initialized. */
+ if (unlikely(!kprobes_initialized))
+ return;
+
+ kprobe_busy_begin();
+
+ node = __llist_del_all(&tk->kretprobe_instances);
+ while (node) {
+ ri = container_of(node, struct kretprobe_instance, llist);
+ node = node->next;
+
+ recycle_rp_inst(ri);
+ }
+
+ kprobe_busy_end();
+}
+NOKPROBE_SYMBOL(kprobe_flush_task);
+
+static inline void free_rp_inst(struct kretprobe *rp)
+{
+ struct kretprobe_instance *ri;
+ struct freelist_node *node;
+ int count = 0;
+
+ node = rp->freelist.head;
+ while (node) {
+ ri = container_of(node, struct kretprobe_instance, freelist);
+ node = node->next;
+
+ kfree(ri);
+ count++;
+ }
+
+ if (refcount_sub_and_test(count, &rp->rph->ref)) {
+ kfree(rp->rph);
+ rp->rph = NULL;
+ }
+}
+
/* This assumes the 'tsk' is the current task or the is not running. */
static kprobe_opcode_t *__kretprobe_find_ret_addr(struct task_struct *tsk,
struct llist_node **cur)
diff --git a/kernel/latencytop.c b/kernel/latencytop.c
index 166d7bf49666..76166df011a4 100644
--- a/kernel/latencytop.c
+++ b/kernel/latencytop.c
@@ -55,6 +55,7 @@
#include <linux/sched/stat.h>
#include <linux/list.h>
#include <linux/stacktrace.h>
+#include <linux/sysctl.h>
static DEFINE_RAW_SPINLOCK(latency_lock);
@@ -63,6 +64,31 @@ static struct latency_record latency_record[MAXLR];
int latencytop_enabled;
+#ifdef CONFIG_SYSCTL
+static int sysctl_latencytop(struct ctl_table *table, int write, void *buffer,
+ size_t *lenp, loff_t *ppos)
+{
+ int err;
+
+ err = proc_dointvec(table, write, buffer, lenp, ppos);
+ if (latencytop_enabled)
+ force_schedstat_enabled();
+
+ return err;
+}
+
+static struct ctl_table latencytop_sysctl[] = {
+ {
+ .procname = "latencytop",
+ .data = &latencytop_enabled,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = sysctl_latencytop,
+ },
+ {}
+};
+#endif
+
void clear_tsk_latency_tracing(struct task_struct *p)
{
unsigned long flags;
@@ -266,18 +292,9 @@ static const struct proc_ops lstats_proc_ops = {
static int __init init_lstats_procfs(void)
{
proc_create("latency_stats", 0644, NULL, &lstats_proc_ops);
+#ifdef CONFIG_SYSCTL
+ register_sysctl_init("kernel", latencytop_sysctl);
+#endif
return 0;
}
-
-int sysctl_latencytop(struct ctl_table *table, int write, void *buffer,
- size_t *lenp, loff_t *ppos)
-{
- int err;
-
- err = proc_dointvec(table, write, buffer, lenp, ppos);
- if (latencytop_enabled)
- force_schedstat_enabled();
-
- return err;
-}
device_initcall(init_lstats_procfs);
diff --git a/kernel/livepatch/patch.c b/kernel/livepatch/patch.c
index c172bf92b576..4c4f5a776d80 100644
--- a/kernel/livepatch/patch.c
+++ b/kernel/livepatch/patch.c
@@ -118,7 +118,7 @@ static void notrace klp_ftrace_handler(unsigned long ip,
if (func->nop)
goto unlock;
- klp_arch_set_pc(fregs, (unsigned long)func->new_func);
+ ftrace_instruction_pointer_set(fregs, (unsigned long)func->new_func);
unlock:
ftrace_test_recursion_unlock(bit);
diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c
index a6e671b8608d..81e87280513e 100644
--- a/kernel/locking/lockdep.c
+++ b/kernel/locking/lockdep.c
@@ -63,19 +63,50 @@
#include <trace/events/lock.h>
#ifdef CONFIG_PROVE_LOCKING
-int prove_locking = 1;
+static int prove_locking = 1;
module_param(prove_locking, int, 0644);
#else
#define prove_locking 0
#endif
#ifdef CONFIG_LOCK_STAT
-int lock_stat = 1;
+static int lock_stat = 1;
module_param(lock_stat, int, 0644);
#else
#define lock_stat 0
#endif
+#ifdef CONFIG_SYSCTL
+static struct ctl_table kern_lockdep_table[] = {
+#ifdef CONFIG_PROVE_LOCKING
+ {
+ .procname = "prove_locking",
+ .data = &prove_locking,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+ },
+#endif /* CONFIG_PROVE_LOCKING */
+#ifdef CONFIG_LOCK_STAT
+ {
+ .procname = "lock_stat",
+ .data = &lock_stat,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+ },
+#endif /* CONFIG_LOCK_STAT */
+ { }
+};
+
+static __init int kernel_lockdep_sysctls_init(void)
+{
+ register_sysctl_init("kernel", kern_lockdep_table);
+ return 0;
+}
+late_initcall(kernel_lockdep_sysctls_init);
+#endif /* CONFIG_SYSCTL */
+
DEFINE_PER_CPU(unsigned int, lockdep_recursion);
EXPORT_PER_CPU_SYMBOL_GPL(lockdep_recursion);
diff --git a/kernel/module-internal.h b/kernel/module-internal.h
deleted file mode 100644
index 8c381c99062f..000000000000
--- a/kernel/module-internal.h
+++ /dev/null
@@ -1,50 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-or-later */
-/* Module internals
- *
- * Copyright (C) 2012 Red Hat, Inc. All Rights Reserved.
- * Written by David Howells (dhowells@redhat.com)
- */
-
-#include <linux/elf.h>
-#include <asm/module.h>
-
-struct load_info {
- const char *name;
- /* pointer to module in temporary copy, freed at end of load_module() */
- struct module *mod;
- Elf_Ehdr *hdr;
- unsigned long len;
- Elf_Shdr *sechdrs;
- char *secstrings, *strtab;
- unsigned long symoffs, stroffs, init_typeoffs, core_typeoffs;
- struct _ddebug *debug;
- unsigned int num_debug;
- bool sig_ok;
-#ifdef CONFIG_KALLSYMS
- unsigned long mod_kallsyms_init_off;
-#endif
-#ifdef CONFIG_MODULE_DECOMPRESS
- struct page **pages;
- unsigned int max_pages;
- unsigned int used_pages;
-#endif
- struct {
- unsigned int sym, str, mod, vers, info, pcpu;
- } index;
-};
-
-extern int mod_verify_sig(const void *mod, struct load_info *info);
-
-#ifdef CONFIG_MODULE_DECOMPRESS
-int module_decompress(struct load_info *info, const void *buf, size_t size);
-void module_decompress_cleanup(struct load_info *info);
-#else
-static inline int module_decompress(struct load_info *info,
- const void *buf, size_t size)
-{
- return -EOPNOTSUPP;
-}
-static inline void module_decompress_cleanup(struct load_info *info)
-{
-}
-#endif
diff --git a/kernel/module/Makefile b/kernel/module/Makefile
new file mode 100644
index 000000000000..948efea81e85
--- /dev/null
+++ b/kernel/module/Makefile
@@ -0,0 +1,21 @@
+# SPDX-License-Identifier: GPL-2.0-only
+#
+# Makefile for linux kernel module support
+#
+
+# These are called from save_stack_trace() on slub debug path,
+# and produce insane amounts of uninteresting coverage.
+KCOV_INSTRUMENT_module.o := n
+
+obj-y += main.o strict_rwx.o
+obj-$(CONFIG_MODULE_DECOMPRESS) += decompress.o
+obj-$(CONFIG_MODULE_SIG) += signing.o
+obj-$(CONFIG_LIVEPATCH) += livepatch.o
+obj-$(CONFIG_MODULES_TREE_LOOKUP) += tree_lookup.o
+obj-$(CONFIG_DEBUG_KMEMLEAK) += debug_kmemleak.o
+obj-$(CONFIG_KALLSYMS) += kallsyms.o
+obj-$(CONFIG_PROC_FS) += procfs.o
+obj-$(CONFIG_SYSFS) += sysfs.o
+obj-$(CONFIG_KGDB_KDB) += kdb.o
+obj-$(CONFIG_MODVERSIONS) += version.o
+obj-$(CONFIG_MODULE_UNLOAD_TAINT_TRACKING) += tracking.o
diff --git a/kernel/module/debug_kmemleak.c b/kernel/module/debug_kmemleak.c
new file mode 100644
index 000000000000..12a569d361e8
--- /dev/null
+++ b/kernel/module/debug_kmemleak.c
@@ -0,0 +1,30 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Module kmemleak support
+ *
+ * Copyright (C) 2009 Catalin Marinas
+ */
+
+#include <linux/module.h>
+#include <linux/kmemleak.h>
+#include "internal.h"
+
+void kmemleak_load_module(const struct module *mod,
+ const struct load_info *info)
+{
+ unsigned int i;
+
+ /* only scan the sections containing data */
+ kmemleak_scan_area(mod, sizeof(struct module), GFP_KERNEL);
+
+ for (i = 1; i < info->hdr->e_shnum; i++) {
+ /* Scan all writable sections that's not executable */
+ if (!(info->sechdrs[i].sh_flags & SHF_ALLOC) ||
+ !(info->sechdrs[i].sh_flags & SHF_WRITE) ||
+ (info->sechdrs[i].sh_flags & SHF_EXECINSTR))
+ continue;
+
+ kmemleak_scan_area((void *)info->sechdrs[i].sh_addr,
+ info->sechdrs[i].sh_size, GFP_KERNEL);
+ }
+}
diff --git a/kernel/module_decompress.c b/kernel/module/decompress.c
index ffef98a20320..2fc7081dd7c1 100644
--- a/kernel/module_decompress.c
+++ b/kernel/module/decompress.c
@@ -12,7 +12,7 @@
#include <linux/sysfs.h>
#include <linux/vmalloc.h>
-#include "module-internal.h"
+#include "internal.h"
static int module_extend_max_pages(struct load_info *info, unsigned int extent)
{
@@ -113,6 +113,7 @@ static ssize_t module_gzip_decompress(struct load_info *info,
do {
struct page *page = module_get_next_page(info);
+
if (!page) {
retval = -ENOMEM;
goto out_inflate_end;
@@ -171,6 +172,7 @@ static ssize_t module_xz_decompress(struct load_info *info,
do {
struct page *page = module_get_next_page(info);
+
if (!page) {
retval = -ENOMEM;
goto out;
@@ -256,6 +258,7 @@ static ssize_t compression_show(struct kobject *kobj,
{
return sysfs_emit(buf, "%s\n", __stringify(MODULE_COMPRESSION));
}
+
static struct kobj_attribute module_compression_attr = __ATTR_RO(compression);
static int __init module_decompress_sysfs_init(void)
diff --git a/kernel/module/internal.h b/kernel/module/internal.h
new file mode 100644
index 000000000000..bc5507ab8450
--- /dev/null
+++ b/kernel/module/internal.h
@@ -0,0 +1,302 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/* Module internals
+ *
+ * Copyright (C) 2012 Red Hat, Inc. All Rights Reserved.
+ * Written by David Howells (dhowells@redhat.com)
+ */
+
+#include <linux/elf.h>
+#include <linux/compiler.h>
+#include <linux/module.h>
+#include <linux/mutex.h>
+#include <linux/rculist.h>
+#include <linux/rcupdate.h>
+
+#ifndef ARCH_SHF_SMALL
+#define ARCH_SHF_SMALL 0
+#endif
+
+/* If this is set, the section belongs in the init part of the module */
+#define INIT_OFFSET_MASK (1UL << (BITS_PER_LONG - 1))
+/* Maximum number of characters written by module_flags() */
+#define MODULE_FLAGS_BUF_SIZE (TAINT_FLAGS_COUNT + 4)
+
+#ifndef CONFIG_ARCH_WANTS_MODULES_DATA_IN_VMALLOC
+#define data_layout core_layout
+#endif
+
+/*
+ * Modules' sections will be aligned on page boundaries
+ * to ensure complete separation of code and data, but
+ * only when CONFIG_STRICT_MODULE_RWX=y
+ */
+#ifdef CONFIG_STRICT_MODULE_RWX
+# define strict_align(X) PAGE_ALIGN(X)
+#else
+# define strict_align(X) (X)
+#endif
+
+extern struct mutex module_mutex;
+extern struct list_head modules;
+
+extern struct module_attribute *modinfo_attrs[];
+extern size_t modinfo_attrs_count;
+
+/* Provided by the linker */
+extern const struct kernel_symbol __start___ksymtab[];
+extern const struct kernel_symbol __stop___ksymtab[];
+extern const struct kernel_symbol __start___ksymtab_gpl[];
+extern const struct kernel_symbol __stop___ksymtab_gpl[];
+extern const s32 __start___kcrctab[];
+extern const s32 __start___kcrctab_gpl[];
+
+struct load_info {
+ const char *name;
+ /* pointer to module in temporary copy, freed at end of load_module() */
+ struct module *mod;
+ Elf_Ehdr *hdr;
+ unsigned long len;
+ Elf_Shdr *sechdrs;
+ char *secstrings, *strtab;
+ unsigned long symoffs, stroffs, init_typeoffs, core_typeoffs;
+ struct _ddebug *debug;
+ unsigned int num_debug;
+ bool sig_ok;
+#ifdef CONFIG_KALLSYMS
+ unsigned long mod_kallsyms_init_off;
+#endif
+#ifdef CONFIG_MODULE_DECOMPRESS
+ struct page **pages;
+ unsigned int max_pages;
+ unsigned int used_pages;
+#endif
+ struct {
+ unsigned int sym, str, mod, vers, info, pcpu;
+ } index;
+};
+
+enum mod_license {
+ NOT_GPL_ONLY,
+ GPL_ONLY,
+};
+
+struct find_symbol_arg {
+ /* Input */
+ const char *name;
+ bool gplok;
+ bool warn;
+
+ /* Output */
+ struct module *owner;
+ const s32 *crc;
+ const struct kernel_symbol *sym;
+ enum mod_license license;
+};
+
+int mod_verify_sig(const void *mod, struct load_info *info);
+int try_to_force_load(struct module *mod, const char *reason);
+bool find_symbol(struct find_symbol_arg *fsa);
+struct module *find_module_all(const char *name, size_t len, bool even_unformed);
+int cmp_name(const void *name, const void *sym);
+long module_get_offset(struct module *mod, unsigned int *size, Elf_Shdr *sechdr,
+ unsigned int section);
+char *module_flags(struct module *mod, char *buf);
+size_t module_flags_taint(unsigned long taints, char *buf);
+
+static inline void module_assert_mutex_or_preempt(void)
+{
+#ifdef CONFIG_LOCKDEP
+ if (unlikely(!debug_locks))
+ return;
+
+ WARN_ON_ONCE(!rcu_read_lock_sched_held() &&
+ !lockdep_is_held(&module_mutex));
+#endif
+}
+
+static inline unsigned long kernel_symbol_value(const struct kernel_symbol *sym)
+{
+#ifdef CONFIG_HAVE_ARCH_PREL32_RELOCATIONS
+ return (unsigned long)offset_to_ptr(&sym->value_offset);
+#else
+ return sym->value;
+#endif
+}
+
+#ifdef CONFIG_LIVEPATCH
+int copy_module_elf(struct module *mod, struct load_info *info);
+void free_module_elf(struct module *mod);
+#else /* !CONFIG_LIVEPATCH */
+static inline int copy_module_elf(struct module *mod, struct load_info *info)
+{
+ return 0;
+}
+
+static inline void free_module_elf(struct module *mod) { }
+#endif /* CONFIG_LIVEPATCH */
+
+static inline bool set_livepatch_module(struct module *mod)
+{
+#ifdef CONFIG_LIVEPATCH
+ mod->klp = true;
+ return true;
+#else
+ return false;
+#endif
+}
+
+#ifdef CONFIG_MODULE_UNLOAD_TAINT_TRACKING
+struct mod_unload_taint {
+ struct list_head list;
+ char name[MODULE_NAME_LEN];
+ unsigned long taints;
+ u64 count;
+};
+
+int try_add_tainted_module(struct module *mod);
+void print_unloaded_tainted_modules(void);
+#else /* !CONFIG_MODULE_UNLOAD_TAINT_TRACKING */
+static inline int try_add_tainted_module(struct module *mod)
+{
+ return 0;
+}
+
+static inline void print_unloaded_tainted_modules(void)
+{
+}
+#endif /* CONFIG_MODULE_UNLOAD_TAINT_TRACKING */
+
+#ifdef CONFIG_MODULE_DECOMPRESS
+int module_decompress(struct load_info *info, const void *buf, size_t size);
+void module_decompress_cleanup(struct load_info *info);
+#else
+static inline int module_decompress(struct load_info *info,
+ const void *buf, size_t size)
+{
+ return -EOPNOTSUPP;
+}
+
+static inline void module_decompress_cleanup(struct load_info *info)
+{
+}
+#endif
+
+struct mod_tree_root {
+#ifdef CONFIG_MODULES_TREE_LOOKUP
+ struct latch_tree_root root;
+#endif
+ unsigned long addr_min;
+ unsigned long addr_max;
+};
+
+extern struct mod_tree_root mod_tree;
+extern struct mod_tree_root mod_data_tree;
+
+#ifdef CONFIG_MODULES_TREE_LOOKUP
+void mod_tree_insert(struct module *mod);
+void mod_tree_remove_init(struct module *mod);
+void mod_tree_remove(struct module *mod);
+struct module *mod_find(unsigned long addr, struct mod_tree_root *tree);
+#else /* !CONFIG_MODULES_TREE_LOOKUP */
+
+static inline void mod_tree_insert(struct module *mod) { }
+static inline void mod_tree_remove_init(struct module *mod) { }
+static inline void mod_tree_remove(struct module *mod) { }
+static inline struct module *mod_find(unsigned long addr, struct mod_tree_root *tree)
+{
+ struct module *mod;
+
+ list_for_each_entry_rcu(mod, &modules, list,
+ lockdep_is_held(&module_mutex)) {
+ if (within_module(addr, mod))
+ return mod;
+ }
+
+ return NULL;
+}
+#endif /* CONFIG_MODULES_TREE_LOOKUP */
+
+void module_enable_ro(const struct module *mod, bool after_init);
+void module_enable_nx(const struct module *mod);
+void module_enable_x(const struct module *mod);
+int module_enforce_rwx_sections(Elf_Ehdr *hdr, Elf_Shdr *sechdrs,
+ char *secstrings, struct module *mod);
+bool module_check_misalignment(const struct module *mod);
+
+#ifdef CONFIG_MODULE_SIG
+int module_sig_check(struct load_info *info, int flags);
+#else /* !CONFIG_MODULE_SIG */
+static inline int module_sig_check(struct load_info *info, int flags)
+{
+ return 0;
+}
+#endif /* !CONFIG_MODULE_SIG */
+
+#ifdef CONFIG_DEBUG_KMEMLEAK
+void kmemleak_load_module(const struct module *mod, const struct load_info *info);
+#else /* !CONFIG_DEBUG_KMEMLEAK */
+static inline void kmemleak_load_module(const struct module *mod,
+ const struct load_info *info) { }
+#endif /* CONFIG_DEBUG_KMEMLEAK */
+
+#ifdef CONFIG_KALLSYMS
+void init_build_id(struct module *mod, const struct load_info *info);
+void layout_symtab(struct module *mod, struct load_info *info);
+void add_kallsyms(struct module *mod, const struct load_info *info);
+unsigned long find_kallsyms_symbol_value(struct module *mod, const char *name);
+
+static inline bool sect_empty(const Elf_Shdr *sect)
+{
+ return !(sect->sh_flags & SHF_ALLOC) || sect->sh_size == 0;
+}
+#else /* !CONFIG_KALLSYMS */
+static inline void init_build_id(struct module *mod, const struct load_info *info) { }
+static inline void layout_symtab(struct module *mod, struct load_info *info) { }
+static inline void add_kallsyms(struct module *mod, const struct load_info *info) { }
+#endif /* CONFIG_KALLSYMS */
+
+#ifdef CONFIG_SYSFS
+int mod_sysfs_setup(struct module *mod, const struct load_info *info,
+ struct kernel_param *kparam, unsigned int num_params);
+void mod_sysfs_teardown(struct module *mod);
+void init_param_lock(struct module *mod);
+#else /* !CONFIG_SYSFS */
+static inline int mod_sysfs_setup(struct module *mod,
+ const struct load_info *info,
+ struct kernel_param *kparam,
+ unsigned int num_params)
+{
+ return 0;
+}
+
+static inline void mod_sysfs_teardown(struct module *mod) { }
+static inline void init_param_lock(struct module *mod) { }
+#endif /* CONFIG_SYSFS */
+
+#ifdef CONFIG_MODVERSIONS
+int check_version(const struct load_info *info,
+ const char *symname, struct module *mod, const s32 *crc);
+void module_layout(struct module *mod, struct modversion_info *ver, struct kernel_param *kp,
+ struct kernel_symbol *ks, struct tracepoint * const *tp);
+int check_modstruct_version(const struct load_info *info, struct module *mod);
+int same_magic(const char *amagic, const char *bmagic, bool has_crcs);
+#else /* !CONFIG_MODVERSIONS */
+static inline int check_version(const struct load_info *info,
+ const char *symname,
+ struct module *mod,
+ const s32 *crc)
+{
+ return 1;
+}
+
+static inline int check_modstruct_version(const struct load_info *info,
+ struct module *mod)
+{
+ return 1;
+}
+
+static inline int same_magic(const char *amagic, const char *bmagic, bool has_crcs)
+{
+ return strcmp(amagic, bmagic) == 0;
+}
+#endif /* CONFIG_MODVERSIONS */
diff --git a/kernel/module/kallsyms.c b/kernel/module/kallsyms.c
new file mode 100644
index 000000000000..3e11523bc6f6
--- /dev/null
+++ b/kernel/module/kallsyms.c
@@ -0,0 +1,512 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Module kallsyms support
+ *
+ * Copyright (C) 2010 Rusty Russell
+ */
+
+#include <linux/module.h>
+#include <linux/kallsyms.h>
+#include <linux/buildid.h>
+#include <linux/bsearch.h>
+#include "internal.h"
+
+/* Lookup exported symbol in given range of kernel_symbols */
+static const struct kernel_symbol *lookup_exported_symbol(const char *name,
+ const struct kernel_symbol *start,
+ const struct kernel_symbol *stop)
+{
+ return bsearch(name, start, stop - start,
+ sizeof(struct kernel_symbol), cmp_name);
+}
+
+static int is_exported(const char *name, unsigned long value,
+ const struct module *mod)
+{
+ const struct kernel_symbol *ks;
+
+ if (!mod)
+ ks = lookup_exported_symbol(name, __start___ksymtab, __stop___ksymtab);
+ else
+ ks = lookup_exported_symbol(name, mod->syms, mod->syms + mod->num_syms);
+
+ return ks && kernel_symbol_value(ks) == value;
+}
+
+/* As per nm */
+static char elf_type(const Elf_Sym *sym, const struct load_info *info)
+{
+ const Elf_Shdr *sechdrs = info->sechdrs;
+
+ if (ELF_ST_BIND(sym->st_info) == STB_WEAK) {
+ if (ELF_ST_TYPE(sym->st_info) == STT_OBJECT)
+ return 'v';
+ else
+ return 'w';
+ }
+ if (sym->st_shndx == SHN_UNDEF)
+ return 'U';
+ if (sym->st_shndx == SHN_ABS || sym->st_shndx == info->index.pcpu)
+ return 'a';
+ if (sym->st_shndx >= SHN_LORESERVE)
+ return '?';
+ if (sechdrs[sym->st_shndx].sh_flags & SHF_EXECINSTR)
+ return 't';
+ if (sechdrs[sym->st_shndx].sh_flags & SHF_ALLOC &&
+ sechdrs[sym->st_shndx].sh_type != SHT_NOBITS) {
+ if (!(sechdrs[sym->st_shndx].sh_flags & SHF_WRITE))
+ return 'r';
+ else if (sechdrs[sym->st_shndx].sh_flags & ARCH_SHF_SMALL)
+ return 'g';
+ else
+ return 'd';
+ }
+ if (sechdrs[sym->st_shndx].sh_type == SHT_NOBITS) {
+ if (sechdrs[sym->st_shndx].sh_flags & ARCH_SHF_SMALL)
+ return 's';
+ else
+ return 'b';
+ }
+ if (strstarts(info->secstrings + sechdrs[sym->st_shndx].sh_name,
+ ".debug")) {
+ return 'n';
+ }
+ return '?';
+}
+
+static bool is_core_symbol(const Elf_Sym *src, const Elf_Shdr *sechdrs,
+ unsigned int shnum, unsigned int pcpundx)
+{
+ const Elf_Shdr *sec;
+
+ if (src->st_shndx == SHN_UNDEF ||
+ src->st_shndx >= shnum ||
+ !src->st_name)
+ return false;
+
+#ifdef CONFIG_KALLSYMS_ALL
+ if (src->st_shndx == pcpundx)
+ return true;
+#endif
+
+ sec = sechdrs + src->st_shndx;
+ if (!(sec->sh_flags & SHF_ALLOC)
+#ifndef CONFIG_KALLSYMS_ALL
+ || !(sec->sh_flags & SHF_EXECINSTR)
+#endif
+ || (sec->sh_entsize & INIT_OFFSET_MASK))
+ return false;
+
+ return true;
+}
+
+/*
+ * We only allocate and copy the strings needed by the parts of symtab
+ * we keep. This is simple, but has the effect of making multiple
+ * copies of duplicates. We could be more sophisticated, see
+ * linux-kernel thread starting with
+ * <73defb5e4bca04a6431392cc341112b1@localhost>.
+ */
+void layout_symtab(struct module *mod, struct load_info *info)
+{
+ Elf_Shdr *symsect = info->sechdrs + info->index.sym;
+ Elf_Shdr *strsect = info->sechdrs + info->index.str;
+ const Elf_Sym *src;
+ unsigned int i, nsrc, ndst, strtab_size = 0;
+
+ /* Put symbol section at end of init part of module. */
+ symsect->sh_flags |= SHF_ALLOC;
+ symsect->sh_entsize = module_get_offset(mod, &mod->init_layout.size, symsect,
+ info->index.sym) | INIT_OFFSET_MASK;
+ pr_debug("\t%s\n", info->secstrings + symsect->sh_name);
+
+ src = (void *)info->hdr + symsect->sh_offset;
+ nsrc = symsect->sh_size / sizeof(*src);
+
+ /* Compute total space required for the core symbols' strtab. */
+ for (ndst = i = 0; i < nsrc; i++) {
+ if (i == 0 || is_livepatch_module(mod) ||
+ is_core_symbol(src + i, info->sechdrs, info->hdr->e_shnum,
+ info->index.pcpu)) {
+ strtab_size += strlen(&info->strtab[src[i].st_name]) + 1;
+ ndst++;
+ }
+ }
+
+ /* Append room for core symbols at end of core part. */
+ info->symoffs = ALIGN(mod->data_layout.size, symsect->sh_addralign ?: 1);
+ info->stroffs = mod->data_layout.size = info->symoffs + ndst * sizeof(Elf_Sym);
+ mod->data_layout.size += strtab_size;
+ info->core_typeoffs = mod->data_layout.size;
+ mod->data_layout.size += ndst * sizeof(char);
+ mod->data_layout.size = strict_align(mod->data_layout.size);
+
+ /* Put string table section at end of init part of module. */
+ strsect->sh_flags |= SHF_ALLOC;
+ strsect->sh_entsize = module_get_offset(mod, &mod->init_layout.size, strsect,
+ info->index.str) | INIT_OFFSET_MASK;
+ pr_debug("\t%s\n", info->secstrings + strsect->sh_name);
+
+ /* We'll tack temporary mod_kallsyms on the end. */
+ mod->init_layout.size = ALIGN(mod->init_layout.size,
+ __alignof__(struct mod_kallsyms));
+ info->mod_kallsyms_init_off = mod->init_layout.size;
+ mod->init_layout.size += sizeof(struct mod_kallsyms);
+ info->init_typeoffs = mod->init_layout.size;
+ mod->init_layout.size += nsrc * sizeof(char);
+ mod->init_layout.size = strict_align(mod->init_layout.size);
+}
+
+/*
+ * We use the full symtab and strtab which layout_symtab arranged to
+ * be appended to the init section. Later we switch to the cut-down
+ * core-only ones.
+ */
+void add_kallsyms(struct module *mod, const struct load_info *info)
+{
+ unsigned int i, ndst;
+ const Elf_Sym *src;
+ Elf_Sym *dst;
+ char *s;
+ Elf_Shdr *symsec = &info->sechdrs[info->index.sym];
+
+ /* Set up to point into init section. */
+ mod->kallsyms = (void __rcu *)mod->init_layout.base +
+ info->mod_kallsyms_init_off;
+
+ preempt_disable();
+ /* The following is safe since this pointer cannot change */
+ rcu_dereference_sched(mod->kallsyms)->symtab = (void *)symsec->sh_addr;
+ rcu_dereference_sched(mod->kallsyms)->num_symtab = symsec->sh_size / sizeof(Elf_Sym);
+ /* Make sure we get permanent strtab: don't use info->strtab. */
+ rcu_dereference_sched(mod->kallsyms)->strtab =
+ (void *)info->sechdrs[info->index.str].sh_addr;
+ rcu_dereference_sched(mod->kallsyms)->typetab = mod->init_layout.base + info->init_typeoffs;
+
+ /*
+ * Now populate the cut down core kallsyms for after init
+ * and set types up while we still have access to sections.
+ */
+ mod->core_kallsyms.symtab = dst = mod->data_layout.base + info->symoffs;
+ mod->core_kallsyms.strtab = s = mod->data_layout.base + info->stroffs;
+ mod->core_kallsyms.typetab = mod->data_layout.base + info->core_typeoffs;
+ src = rcu_dereference_sched(mod->kallsyms)->symtab;
+ for (ndst = i = 0; i < rcu_dereference_sched(mod->kallsyms)->num_symtab; i++) {
+ rcu_dereference_sched(mod->kallsyms)->typetab[i] = elf_type(src + i, info);
+ if (i == 0 || is_livepatch_module(mod) ||
+ is_core_symbol(src + i, info->sechdrs, info->hdr->e_shnum,
+ info->index.pcpu)) {
+ mod->core_kallsyms.typetab[ndst] =
+ rcu_dereference_sched(mod->kallsyms)->typetab[i];
+ dst[ndst] = src[i];
+ dst[ndst++].st_name = s - mod->core_kallsyms.strtab;
+ s += strscpy(s,
+ &rcu_dereference_sched(mod->kallsyms)->strtab[src[i].st_name],
+ KSYM_NAME_LEN) + 1;
+ }
+ }
+ preempt_enable();
+ mod->core_kallsyms.num_symtab = ndst;
+}
+
+#if IS_ENABLED(CONFIG_STACKTRACE_BUILD_ID)
+void init_build_id(struct module *mod, const struct load_info *info)
+{
+ const Elf_Shdr *sechdr;
+ unsigned int i;
+
+ for (i = 0; i < info->hdr->e_shnum; i++) {
+ sechdr = &info->sechdrs[i];
+ if (!sect_empty(sechdr) && sechdr->sh_type == SHT_NOTE &&
+ !build_id_parse_buf((void *)sechdr->sh_addr, mod->build_id,
+ sechdr->sh_size))
+ break;
+ }
+}
+#else
+void init_build_id(struct module *mod, const struct load_info *info)
+{
+}
+#endif
+
+/*
+ * This ignores the intensely annoying "mapping symbols" found
+ * in ARM ELF files: $a, $t and $d.
+ */
+static inline int is_arm_mapping_symbol(const char *str)
+{
+ if (str[0] == '.' && str[1] == 'L')
+ return true;
+ return str[0] == '$' && strchr("axtd", str[1]) &&
+ (str[2] == '\0' || str[2] == '.');
+}
+
+static const char *kallsyms_symbol_name(struct mod_kallsyms *kallsyms, unsigned int symnum)
+{
+ return kallsyms->strtab + kallsyms->symtab[symnum].st_name;
+}
+
+/*
+ * Given a module and address, find the corresponding symbol and return its name
+ * while providing its size and offset if needed.
+ */
+static const char *find_kallsyms_symbol(struct module *mod,
+ unsigned long addr,
+ unsigned long *size,
+ unsigned long *offset)
+{
+ unsigned int i, best = 0;
+ unsigned long nextval, bestval;
+ struct mod_kallsyms *kallsyms = rcu_dereference_sched(mod->kallsyms);
+
+ /* At worse, next value is at end of module */
+ if (within_module_init(addr, mod))
+ nextval = (unsigned long)mod->init_layout.base + mod->init_layout.text_size;
+ else
+ nextval = (unsigned long)mod->core_layout.base + mod->core_layout.text_size;
+
+ bestval = kallsyms_symbol_value(&kallsyms->symtab[best]);
+
+ /*
+ * Scan for closest preceding symbol, and next symbol. (ELF
+ * starts real symbols at 1).
+ */
+ for (i = 1; i < kallsyms->num_symtab; i++) {
+ const Elf_Sym *sym = &kallsyms->symtab[i];
+ unsigned long thisval = kallsyms_symbol_value(sym);
+
+ if (sym->st_shndx == SHN_UNDEF)
+ continue;
+
+ /*
+ * We ignore unnamed symbols: they're uninformative
+ * and inserted at a whim.
+ */
+ if (*kallsyms_symbol_name(kallsyms, i) == '\0' ||
+ is_arm_mapping_symbol(kallsyms_symbol_name(kallsyms, i)))
+ continue;
+
+ if (thisval <= addr && thisval > bestval) {
+ best = i;
+ bestval = thisval;
+ }
+ if (thisval > addr && thisval < nextval)
+ nextval = thisval;
+ }
+
+ if (!best)
+ return NULL;
+
+ if (size)
+ *size = nextval - bestval;
+ if (offset)
+ *offset = addr - bestval;
+
+ return kallsyms_symbol_name(kallsyms, best);
+}
+
+void * __weak dereference_module_function_descriptor(struct module *mod,
+ void *ptr)
+{
+ return ptr;
+}
+
+/*
+ * For kallsyms to ask for address resolution. NULL means not found. Careful
+ * not to lock to avoid deadlock on oopses, simply disable preemption.
+ */
+const char *module_address_lookup(unsigned long addr,
+ unsigned long *size,
+ unsigned long *offset,
+ char **modname,
+ const unsigned char **modbuildid,
+ char *namebuf)
+{
+ const char *ret = NULL;
+ struct module *mod;
+
+ preempt_disable();
+ mod = __module_address(addr);
+ if (mod) {
+ if (modname)
+ *modname = mod->name;
+ if (modbuildid) {
+#if IS_ENABLED(CONFIG_STACKTRACE_BUILD_ID)
+ *modbuildid = mod->build_id;
+#else
+ *modbuildid = NULL;
+#endif
+ }
+
+ ret = find_kallsyms_symbol(mod, addr, size, offset);
+ }
+ /* Make a copy in here where it's safe */
+ if (ret) {
+ strncpy(namebuf, ret, KSYM_NAME_LEN - 1);
+ ret = namebuf;
+ }
+ preempt_enable();
+
+ return ret;
+}
+
+int lookup_module_symbol_name(unsigned long addr, char *symname)
+{
+ struct module *mod;
+
+ preempt_disable();
+ list_for_each_entry_rcu(mod, &modules, list) {
+ if (mod->state == MODULE_STATE_UNFORMED)
+ continue;
+ if (within_module(addr, mod)) {
+ const char *sym;
+
+ sym = find_kallsyms_symbol(mod, addr, NULL, NULL);
+ if (!sym)
+ goto out;
+
+ strscpy(symname, sym, KSYM_NAME_LEN);
+ preempt_enable();
+ return 0;
+ }
+ }
+out:
+ preempt_enable();
+ return -ERANGE;
+}
+
+int lookup_module_symbol_attrs(unsigned long addr, unsigned long *size,
+ unsigned long *offset, char *modname, char *name)
+{
+ struct module *mod;
+
+ preempt_disable();
+ list_for_each_entry_rcu(mod, &modules, list) {
+ if (mod->state == MODULE_STATE_UNFORMED)
+ continue;
+ if (within_module(addr, mod)) {
+ const char *sym;
+
+ sym = find_kallsyms_symbol(mod, addr, size, offset);
+ if (!sym)
+ goto out;
+ if (modname)
+ strscpy(modname, mod->name, MODULE_NAME_LEN);
+ if (name)
+ strscpy(name, sym, KSYM_NAME_LEN);
+ preempt_enable();
+ return 0;
+ }
+ }
+out:
+ preempt_enable();
+ return -ERANGE;
+}
+
+int module_get_kallsym(unsigned int symnum, unsigned long *value, char *type,
+ char *name, char *module_name, int *exported)
+{
+ struct module *mod;
+
+ preempt_disable();
+ list_for_each_entry_rcu(mod, &modules, list) {
+ struct mod_kallsyms *kallsyms;
+
+ if (mod->state == MODULE_STATE_UNFORMED)
+ continue;
+ kallsyms = rcu_dereference_sched(mod->kallsyms);
+ if (symnum < kallsyms->num_symtab) {
+ const Elf_Sym *sym = &kallsyms->symtab[symnum];
+
+ *value = kallsyms_symbol_value(sym);
+ *type = kallsyms->typetab[symnum];
+ strscpy(name, kallsyms_symbol_name(kallsyms, symnum), KSYM_NAME_LEN);
+ strscpy(module_name, mod->name, MODULE_NAME_LEN);
+ *exported = is_exported(name, *value, mod);
+ preempt_enable();
+ return 0;
+ }
+ symnum -= kallsyms->num_symtab;
+ }
+ preempt_enable();
+ return -ERANGE;
+}
+
+/* Given a module and name of symbol, find and return the symbol's value */
+unsigned long find_kallsyms_symbol_value(struct module *mod, const char *name)
+{
+ unsigned int i;
+ struct mod_kallsyms *kallsyms = rcu_dereference_sched(mod->kallsyms);
+
+ for (i = 0; i < kallsyms->num_symtab; i++) {
+ const Elf_Sym *sym = &kallsyms->symtab[i];
+
+ if (strcmp(name, kallsyms_symbol_name(kallsyms, i)) == 0 &&
+ sym->st_shndx != SHN_UNDEF)
+ return kallsyms_symbol_value(sym);
+ }
+ return 0;
+}
+
+/* Look for this name: can be of form module:name. */
+unsigned long module_kallsyms_lookup_name(const char *name)
+{
+ struct module *mod;
+ char *colon;
+ unsigned long ret = 0;
+
+ /* Don't lock: we're in enough trouble already. */
+ preempt_disable();
+ if ((colon = strnchr(name, MODULE_NAME_LEN, ':')) != NULL) {
+ if ((mod = find_module_all(name, colon - name, false)) != NULL)
+ ret = find_kallsyms_symbol_value(mod, colon + 1);
+ } else {
+ list_for_each_entry_rcu(mod, &modules, list) {
+ if (mod->state == MODULE_STATE_UNFORMED)
+ continue;
+ if ((ret = find_kallsyms_symbol_value(mod, name)) != 0)
+ break;
+ }
+ }
+ preempt_enable();
+ return ret;
+}
+
+#ifdef CONFIG_LIVEPATCH
+int module_kallsyms_on_each_symbol(int (*fn)(void *, const char *,
+ struct module *, unsigned long),
+ void *data)
+{
+ struct module *mod;
+ unsigned int i;
+ int ret = 0;
+
+ mutex_lock(&module_mutex);
+ list_for_each_entry(mod, &modules, list) {
+ struct mod_kallsyms *kallsyms;
+
+ if (mod->state == MODULE_STATE_UNFORMED)
+ continue;
+
+ /* Use rcu_dereference_sched() to remain compliant with the sparse tool */
+ preempt_disable();
+ kallsyms = rcu_dereference_sched(mod->kallsyms);
+ preempt_enable();
+
+ for (i = 0; i < kallsyms->num_symtab; i++) {
+ const Elf_Sym *sym = &kallsyms->symtab[i];
+
+ if (sym->st_shndx == SHN_UNDEF)
+ continue;
+
+ ret = fn(data, kallsyms_symbol_name(kallsyms, i),
+ mod, kallsyms_symbol_value(sym));
+ if (ret != 0)
+ goto out;
+ }
+ }
+out:
+ mutex_unlock(&module_mutex);
+ return ret;
+}
+#endif /* CONFIG_LIVEPATCH */
diff --git a/kernel/module/kdb.c b/kernel/module/kdb.c
new file mode 100644
index 000000000000..f4317f92e189
--- /dev/null
+++ b/kernel/module/kdb.c
@@ -0,0 +1,62 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Module kdb support
+ *
+ * Copyright (C) 2010 Jason Wessel
+ */
+
+#include <linux/module.h>
+#include <linux/kdb.h>
+#include "internal.h"
+
+/*
+ * kdb_lsmod - This function implements the 'lsmod' command. Lists
+ * currently loaded kernel modules.
+ * Mostly taken from userland lsmod.
+ */
+int kdb_lsmod(int argc, const char **argv)
+{
+ struct module *mod;
+
+ if (argc != 0)
+ return KDB_ARGCOUNT;
+
+ kdb_printf("Module Size modstruct Used by\n");
+ list_for_each_entry(mod, &modules, list) {
+ if (mod->state == MODULE_STATE_UNFORMED)
+ continue;
+
+ kdb_printf("%-20s%8u", mod->name, mod->core_layout.size);
+#ifdef CONFIG_ARCH_WANTS_MODULES_DATA_IN_VMALLOC
+ kdb_printf("/%8u", mod->data_layout.size);
+#endif
+ kdb_printf(" 0x%px ", (void *)mod);
+#ifdef CONFIG_MODULE_UNLOAD
+ kdb_printf("%4d ", module_refcount(mod));
+#endif
+ if (mod->state == MODULE_STATE_GOING)
+ kdb_printf(" (Unloading)");
+ else if (mod->state == MODULE_STATE_COMING)
+ kdb_printf(" (Loading)");
+ else
+ kdb_printf(" (Live)");
+ kdb_printf(" 0x%px", mod->core_layout.base);
+#ifdef CONFIG_ARCH_WANTS_MODULES_DATA_IN_VMALLOC
+ kdb_printf("/0x%px", mod->data_layout.base);
+#endif
+
+#ifdef CONFIG_MODULE_UNLOAD
+ {
+ struct module_use *use;
+
+ kdb_printf(" [ ");
+ list_for_each_entry(use, &mod->source_list,
+ source_list)
+ kdb_printf("%s ", use->target->name);
+ kdb_printf("]\n");
+ }
+#endif
+ }
+
+ return 0;
+}
diff --git a/kernel/module/livepatch.c b/kernel/module/livepatch.c
new file mode 100644
index 000000000000..486d4ff92719
--- /dev/null
+++ b/kernel/module/livepatch.c
@@ -0,0 +1,74 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Module livepatch support
+ *
+ * Copyright (C) 2016 Jessica Yu <jeyu@redhat.com>
+ */
+
+#include <linux/module.h>
+#include <linux/string.h>
+#include <linux/slab.h>
+#include "internal.h"
+
+/*
+ * Persist Elf information about a module. Copy the Elf header,
+ * section header table, section string table, and symtab section
+ * index from info to mod->klp_info.
+ */
+int copy_module_elf(struct module *mod, struct load_info *info)
+{
+ unsigned int size, symndx;
+ int ret;
+
+ size = sizeof(*mod->klp_info);
+ mod->klp_info = kmalloc(size, GFP_KERNEL);
+ if (!mod->klp_info)
+ return -ENOMEM;
+
+ /* Elf header */
+ size = sizeof(mod->klp_info->hdr);
+ memcpy(&mod->klp_info->hdr, info->hdr, size);
+
+ /* Elf section header table */
+ size = sizeof(*info->sechdrs) * info->hdr->e_shnum;
+ mod->klp_info->sechdrs = kmemdup(info->sechdrs, size, GFP_KERNEL);
+ if (!mod->klp_info->sechdrs) {
+ ret = -ENOMEM;
+ goto free_info;
+ }
+
+ /* Elf section name string table */
+ size = info->sechdrs[info->hdr->e_shstrndx].sh_size;
+ mod->klp_info->secstrings = kmemdup(info->secstrings, size, GFP_KERNEL);
+ if (!mod->klp_info->secstrings) {
+ ret = -ENOMEM;
+ goto free_sechdrs;
+ }
+
+ /* Elf symbol section index */
+ symndx = info->index.sym;
+ mod->klp_info->symndx = symndx;
+
+ /*
+ * For livepatch modules, core_kallsyms.symtab is a complete
+ * copy of the original symbol table. Adjust sh_addr to point
+ * to core_kallsyms.symtab since the copy of the symtab in module
+ * init memory is freed at the end of do_init_module().
+ */
+ mod->klp_info->sechdrs[symndx].sh_addr = (unsigned long)mod->core_kallsyms.symtab;
+
+ return 0;
+
+free_sechdrs:
+ kfree(mod->klp_info->sechdrs);
+free_info:
+ kfree(mod->klp_info);
+ return ret;
+}
+
+void free_module_elf(struct module *mod)
+{
+ kfree(mod->klp_info->sechdrs);
+ kfree(mod->klp_info->secstrings);
+ kfree(mod->klp_info);
+}
diff --git a/kernel/module.c b/kernel/module/main.c
index 6cea788fd965..fed58d30725d 100644
--- a/kernel/module.c
+++ b/kernel/module/main.c
@@ -14,16 +14,12 @@
#include <linux/init.h>
#include <linux/kallsyms.h>
#include <linux/buildid.h>
-#include <linux/file.h>
#include <linux/fs.h>
-#include <linux/sysfs.h>
#include <linux/kernel.h>
#include <linux/kernel_read_file.h>
#include <linux/slab.h>
#include <linux/vmalloc.h>
#include <linux/elf.h>
-#include <linux/proc_fs.h>
-#include <linux/security.h>
#include <linux/seq_file.h>
#include <linux/syscalls.h>
#include <linux/fcntl.h>
@@ -58,236 +54,69 @@
#include <linux/dynamic_debug.h>
#include <linux/audit.h>
#include <uapi/linux/module.h>
-#include "module-internal.h"
+#include "internal.h"
#define CREATE_TRACE_POINTS
#include <trace/events/module.h>
-#ifndef ARCH_SHF_SMALL
-#define ARCH_SHF_SMALL 0
-#endif
-
-/*
- * Modules' sections will be aligned on page boundaries
- * to ensure complete separation of code and data, but
- * only when CONFIG_ARCH_HAS_STRICT_MODULE_RWX=y
- */
-#ifdef CONFIG_ARCH_HAS_STRICT_MODULE_RWX
-# define debug_align(X) ALIGN(X, PAGE_SIZE)
-#else
-# define debug_align(X) (X)
-#endif
-
-/* If this is set, the section belongs in the init part of the module */
-#define INIT_OFFSET_MASK (1UL << (BITS_PER_LONG-1))
-
/*
* Mutex protects:
* 1) List of modules (also safely readable with preempt_disable),
* 2) module_use links,
- * 3) module_addr_min/module_addr_max.
+ * 3) mod_tree.addr_min/mod_tree.addr_max.
* (delete and add uses RCU list operations).
*/
-static DEFINE_MUTEX(module_mutex);
-static LIST_HEAD(modules);
+DEFINE_MUTEX(module_mutex);
+LIST_HEAD(modules);
/* Work queue for freeing init sections in success case */
static void do_free_init(struct work_struct *w);
static DECLARE_WORK(init_free_wq, do_free_init);
static LLIST_HEAD(init_free_list);
-#ifdef CONFIG_MODULES_TREE_LOOKUP
-
-/*
- * Use a latched RB-tree for __module_address(); this allows us to use
- * RCU-sched lookups of the address from any context.
- *
- * This is conditional on PERF_EVENTS || TRACING because those can really hit
- * __module_address() hard by doing a lot of stack unwinding; potentially from
- * NMI context.
- */
-
-static __always_inline unsigned long __mod_tree_val(struct latch_tree_node *n)
-{
- struct module_layout *layout = container_of(n, struct module_layout, mtn.node);
-
- return (unsigned long)layout->base;
-}
-
-static __always_inline unsigned long __mod_tree_size(struct latch_tree_node *n)
-{
- struct module_layout *layout = container_of(n, struct module_layout, mtn.node);
-
- return (unsigned long)layout->size;
-}
-
-static __always_inline bool
-mod_tree_less(struct latch_tree_node *a, struct latch_tree_node *b)
-{
- return __mod_tree_val(a) < __mod_tree_val(b);
-}
-
-static __always_inline int
-mod_tree_comp(void *key, struct latch_tree_node *n)
-{
- unsigned long val = (unsigned long)key;
- unsigned long start, end;
-
- start = __mod_tree_val(n);
- if (val < start)
- return -1;
-
- end = start + __mod_tree_size(n);
- if (val >= end)
- return 1;
-
- return 0;
-}
-
-static const struct latch_tree_ops mod_tree_ops = {
- .less = mod_tree_less,
- .comp = mod_tree_comp,
+struct mod_tree_root mod_tree __cacheline_aligned = {
+ .addr_min = -1UL,
};
-static struct mod_tree_root {
- struct latch_tree_root root;
- unsigned long addr_min;
- unsigned long addr_max;
-} mod_tree __cacheline_aligned = {
+#ifdef CONFIG_ARCH_WANTS_MODULES_DATA_IN_VMALLOC
+struct mod_tree_root mod_data_tree __cacheline_aligned = {
.addr_min = -1UL,
};
+#endif
#define module_addr_min mod_tree.addr_min
#define module_addr_max mod_tree.addr_max
-static noinline void __mod_tree_insert(struct mod_tree_node *node)
-{
- latch_tree_insert(&node->node, &mod_tree.root, &mod_tree_ops);
-}
-
-static void __mod_tree_remove(struct mod_tree_node *node)
-{
- latch_tree_erase(&node->node, &mod_tree.root, &mod_tree_ops);
-}
-
-/*
- * These modifications: insert, remove_init and remove; are serialized by the
- * module_mutex.
- */
-static void mod_tree_insert(struct module *mod)
-{
- mod->core_layout.mtn.mod = mod;
- mod->init_layout.mtn.mod = mod;
-
- __mod_tree_insert(&mod->core_layout.mtn);
- if (mod->init_layout.size)
- __mod_tree_insert(&mod->init_layout.mtn);
-}
-
-static void mod_tree_remove_init(struct module *mod)
-{
- if (mod->init_layout.size)
- __mod_tree_remove(&mod->init_layout.mtn);
-}
-
-static void mod_tree_remove(struct module *mod)
-{
- __mod_tree_remove(&mod->core_layout.mtn);
- mod_tree_remove_init(mod);
-}
-
-static struct module *mod_find(unsigned long addr)
-{
- struct latch_tree_node *ltn;
-
- ltn = latch_tree_find((void *)addr, &mod_tree.root, &mod_tree_ops);
- if (!ltn)
- return NULL;
-
- return container_of(ltn, struct mod_tree_node, node)->mod;
-}
-
-#else /* MODULES_TREE_LOOKUP */
-
-static unsigned long module_addr_min = -1UL, module_addr_max = 0;
-
-static void mod_tree_insert(struct module *mod) { }
-static void mod_tree_remove_init(struct module *mod) { }
-static void mod_tree_remove(struct module *mod) { }
-
-static struct module *mod_find(unsigned long addr)
-{
- struct module *mod;
-
- list_for_each_entry_rcu(mod, &modules, list,
- lockdep_is_held(&module_mutex)) {
- if (within_module(addr, mod))
- return mod;
- }
-
- return NULL;
-}
-
-#endif /* MODULES_TREE_LOOKUP */
+struct symsearch {
+ const struct kernel_symbol *start, *stop;
+ const s32 *crcs;
+ enum mod_license license;
+};
/*
* Bounds of module text, for speeding up __module_address.
* Protected by module_mutex.
*/
-static void __mod_update_bounds(void *base, unsigned int size)
+static void __mod_update_bounds(void *base, unsigned int size, struct mod_tree_root *tree)
{
unsigned long min = (unsigned long)base;
unsigned long max = min + size;
- if (min < module_addr_min)
- module_addr_min = min;
- if (max > module_addr_max)
- module_addr_max = max;
+ if (min < tree->addr_min)
+ tree->addr_min = min;
+ if (max > tree->addr_max)
+ tree->addr_max = max;
}
static void mod_update_bounds(struct module *mod)
{
- __mod_update_bounds(mod->core_layout.base, mod->core_layout.size);
+ __mod_update_bounds(mod->core_layout.base, mod->core_layout.size, &mod_tree);
if (mod->init_layout.size)
- __mod_update_bounds(mod->init_layout.base, mod->init_layout.size);
-}
-
-#ifdef CONFIG_KGDB_KDB
-struct list_head *kdb_modules = &modules; /* kdb needs the list of modules */
-#endif /* CONFIG_KGDB_KDB */
-
-static void module_assert_mutex_or_preempt(void)
-{
-#ifdef CONFIG_LOCKDEP
- if (unlikely(!debug_locks))
- return;
-
- WARN_ON_ONCE(!rcu_read_lock_sched_held() &&
- !lockdep_is_held(&module_mutex));
-#endif
-}
-
-#ifdef CONFIG_MODULE_SIG
-static bool sig_enforce = IS_ENABLED(CONFIG_MODULE_SIG_FORCE);
-module_param(sig_enforce, bool_enable_only, 0644);
-
-void set_module_sig_enforced(void)
-{
- sig_enforce = true;
-}
-#else
-#define sig_enforce false
+ __mod_update_bounds(mod->init_layout.base, mod->init_layout.size, &mod_tree);
+#ifdef CONFIG_ARCH_WANTS_MODULES_DATA_IN_VMALLOC
+ __mod_update_bounds(mod->data_layout.base, mod->data_layout.size, &mod_data_tree);
#endif
-
-/*
- * Export sig_enforce kernel cmdline parameter to allow other subsystems rely
- * on that instead of directly to CONFIG_MODULE_SIG_FORCE config.
- */
-bool is_module_sig_enforced(void)
-{
- return sig_enforce;
}
-EXPORT_SYMBOL(is_module_sig_enforced);
/* Block module loading/unloading? */
int modules_disabled = 0;
@@ -408,66 +237,12 @@ static __maybe_unused void *any_section_objs(const struct load_info *info,
return (void *)info->sechdrs[sec].sh_addr;
}
-/* Provided by the linker */
-extern const struct kernel_symbol __start___ksymtab[];
-extern const struct kernel_symbol __stop___ksymtab[];
-extern const struct kernel_symbol __start___ksymtab_gpl[];
-extern const struct kernel_symbol __stop___ksymtab_gpl[];
-extern const s32 __start___kcrctab[];
-extern const s32 __start___kcrctab_gpl[];
-
#ifndef CONFIG_MODVERSIONS
#define symversion(base, idx) NULL
#else
#define symversion(base, idx) ((base != NULL) ? ((base) + (idx)) : NULL)
#endif
-struct symsearch {
- const struct kernel_symbol *start, *stop;
- const s32 *crcs;
- enum mod_license {
- NOT_GPL_ONLY,
- GPL_ONLY,
- } license;
-};
-
-struct find_symbol_arg {
- /* Input */
- const char *name;
- bool gplok;
- bool warn;
-
- /* Output */
- struct module *owner;
- const s32 *crc;
- const struct kernel_symbol *sym;
- enum mod_license license;
-};
-
-static bool check_exported_symbol(const struct symsearch *syms,
- struct module *owner,
- unsigned int symnum, void *data)
-{
- struct find_symbol_arg *fsa = data;
-
- if (!fsa->gplok && syms->license == GPL_ONLY)
- return false;
- fsa->owner = owner;
- fsa->crc = symversion(syms->crcs, symnum);
- fsa->sym = &syms->start[symnum];
- fsa->license = syms->license;
- return true;
-}
-
-static unsigned long kernel_symbol_value(const struct kernel_symbol *sym)
-{
-#ifdef CONFIG_HAVE_ARCH_PREL32_RELOCATIONS
- return (unsigned long)offset_to_ptr(&sym->value_offset);
-#else
- return sym->value;
-#endif
-}
-
static const char *kernel_symbol_name(const struct kernel_symbol *sym)
{
#ifdef CONFIG_HAVE_ARCH_PREL32_RELOCATIONS
@@ -488,33 +263,38 @@ static const char *kernel_symbol_namespace(const struct kernel_symbol *sym)
#endif
}
-static int cmp_name(const void *name, const void *sym)
+int cmp_name(const void *name, const void *sym)
{
return strcmp(name, kernel_symbol_name(sym));
}
static bool find_exported_symbol_in_section(const struct symsearch *syms,
struct module *owner,
- void *data)
+ struct find_symbol_arg *fsa)
{
- struct find_symbol_arg *fsa = data;
struct kernel_symbol *sym;
+ if (!fsa->gplok && syms->license == GPL_ONLY)
+ return false;
+
sym = bsearch(fsa->name, syms->start, syms->stop - syms->start,
sizeof(struct kernel_symbol), cmp_name);
+ if (!sym)
+ return false;
- if (sym != NULL && check_exported_symbol(syms, owner,
- sym - syms->start, data))
- return true;
+ fsa->owner = owner;
+ fsa->crc = symversion(syms->crcs, sym - syms->start);
+ fsa->sym = sym;
+ fsa->license = syms->license;
- return false;
+ return true;
}
/*
* Find an exported symbol and return it, along with, (optional) crc and
* (optional) module which owns it. Needs preempt disabled or module_mutex.
*/
-static bool find_symbol(struct find_symbol_arg *fsa)
+bool find_symbol(struct find_symbol_arg *fsa)
{
static const struct symsearch arr[] = {
{ __start___ksymtab, __stop___ksymtab, __start___kcrctab,
@@ -558,8 +338,8 @@ static bool find_symbol(struct find_symbol_arg *fsa)
* Search for module by name: must hold module_mutex (or preempt disabled
* for read-only access).
*/
-static struct module *find_module_all(const char *name, size_t len,
- bool even_unformed)
+struct module *find_module_all(const char *name, size_t len,
+ bool even_unformed)
{
struct module *mod;
@@ -985,31 +765,6 @@ out:
return ret;
}
-static inline void print_unload_info(struct seq_file *m, struct module *mod)
-{
- struct module_use *use;
- int printed_something = 0;
-
- seq_printf(m, " %i ", module_refcount(mod));
-
- /*
- * Always include a trailing , so userspace can differentiate
- * between this and the old multi-field proc format.
- */
- list_for_each_entry(use, &mod->source_list, source_list) {
- printed_something = 1;
- seq_printf(m, "%s,", use->source->name);
- }
-
- if (mod->init != NULL && mod->exit == NULL) {
- printed_something = 1;
- seq_puts(m, "[permanent],");
- }
-
- if (!printed_something)
- seq_puts(m, "-");
-}
-
void __symbol_put(const char *symbol)
{
struct find_symbol_arg fsa = {
@@ -1099,12 +854,6 @@ void module_put(struct module *module)
EXPORT_SYMBOL(module_put);
#else /* !CONFIG_MODULE_UNLOAD */
-static inline void print_unload_info(struct seq_file *m, struct module *mod)
-{
- /* We don't know the usage count, or what modules are using. */
- seq_puts(m, " - -");
-}
-
static inline void module_unload_free(struct module *mod)
{
}
@@ -1120,13 +869,13 @@ static inline int module_unload_init(struct module *mod)
}
#endif /* CONFIG_MODULE_UNLOAD */
-static size_t module_flags_taint(struct module *mod, char *buf)
+size_t module_flags_taint(unsigned long taints, char *buf)
{
size_t l = 0;
int i;
for (i = 0; i < TAINT_FLAGS_COUNT; i++) {
- if (taint_flags[i].module && test_bit(i, &mod->taints))
+ if (taint_flags[i].module && test_bit(i, &taints))
buf[l++] = taint_flags[i].c_true;
}
@@ -1179,6 +928,17 @@ static ssize_t show_coresize(struct module_attribute *mattr,
static struct module_attribute modinfo_coresize =
__ATTR(coresize, 0444, show_coresize, NULL);
+#ifdef CONFIG_ARCH_WANTS_MODULES_DATA_IN_VMALLOC
+static ssize_t show_datasize(struct module_attribute *mattr,
+ struct module_kobject *mk, char *buffer)
+{
+ return sprintf(buffer, "%u\n", mk->mod->data_layout.size);
+}
+
+static struct module_attribute modinfo_datasize =
+ __ATTR(datasize, 0444, show_datasize, NULL);
+#endif
+
static ssize_t show_initsize(struct module_attribute *mattr,
struct module_kobject *mk, char *buffer)
{
@@ -1193,7 +953,7 @@ static ssize_t show_taint(struct module_attribute *mattr,
{
size_t l;
- l = module_flags_taint(mk->mod, buffer);
+ l = module_flags_taint(mk->mod->taints, buffer);
buffer[l++] = '\n';
return l;
}
@@ -1201,12 +961,15 @@ static ssize_t show_taint(struct module_attribute *mattr,
static struct module_attribute modinfo_taint =
__ATTR(taint, 0444, show_taint, NULL);
-static struct module_attribute *modinfo_attrs[] = {
+struct module_attribute *modinfo_attrs[] = {
&module_uevent,
&modinfo_version,
&modinfo_srcversion,
&modinfo_initstate,
&modinfo_coresize,
+#ifdef CONFIG_ARCH_WANTS_MODULES_DATA_IN_VMALLOC
+ &modinfo_datasize,
+#endif
&modinfo_initsize,
&modinfo_taint,
#ifdef CONFIG_MODULE_UNLOAD
@@ -1215,9 +978,11 @@ static struct module_attribute *modinfo_attrs[] = {
NULL,
};
+size_t modinfo_attrs_count = ARRAY_SIZE(modinfo_attrs);
+
static const char vermagic[] = VERMAGIC_STRING;
-static int try_to_force_load(struct module *mod, const char *reason)
+int try_to_force_load(struct module *mod, const char *reason)
{
#ifdef CONFIG_MODULE_FORCE_LOAD
if (!test_taint(TAINT_FORCED_MODULE))
@@ -1229,115 +994,6 @@ static int try_to_force_load(struct module *mod, const char *reason)
#endif
}
-#ifdef CONFIG_MODVERSIONS
-
-static u32 resolve_rel_crc(const s32 *crc)
-{
- return *(u32 *)((void *)crc + *crc);
-}
-
-static int check_version(const struct load_info *info,
- const char *symname,
- struct module *mod,
- const s32 *crc)
-{
- Elf_Shdr *sechdrs = info->sechdrs;
- unsigned int versindex = info->index.vers;
- unsigned int i, num_versions;
- struct modversion_info *versions;
-
- /* Exporting module didn't supply crcs? OK, we're already tainted. */
- if (!crc)
- return 1;
-
- /* No versions at all? modprobe --force does this. */
- if (versindex == 0)
- return try_to_force_load(mod, symname) == 0;
-
- versions = (void *) sechdrs[versindex].sh_addr;
- num_versions = sechdrs[versindex].sh_size
- / sizeof(struct modversion_info);
-
- for (i = 0; i < num_versions; i++) {
- u32 crcval;
-
- if (strcmp(versions[i].name, symname) != 0)
- continue;
-
- if (IS_ENABLED(CONFIG_MODULE_REL_CRCS))
- crcval = resolve_rel_crc(crc);
- else
- crcval = *crc;
- if (versions[i].crc == crcval)
- return 1;
- pr_debug("Found checksum %X vs module %lX\n",
- crcval, versions[i].crc);
- goto bad_version;
- }
-
- /* Broken toolchain. Warn once, then let it go.. */
- pr_warn_once("%s: no symbol version for %s\n", info->name, symname);
- return 1;
-
-bad_version:
- pr_warn("%s: disagrees about version of symbol %s\n",
- info->name, symname);
- return 0;
-}
-
-static inline int check_modstruct_version(const struct load_info *info,
- struct module *mod)
-{
- struct find_symbol_arg fsa = {
- .name = "module_layout",
- .gplok = true,
- };
-
- /*
- * Since this should be found in kernel (which can't be removed), no
- * locking is necessary -- use preempt_disable() to placate lockdep.
- */
- preempt_disable();
- if (!find_symbol(&fsa)) {
- preempt_enable();
- BUG();
- }
- preempt_enable();
- return check_version(info, "module_layout", mod, fsa.crc);
-}
-
-/* First part is kernel version, which we ignore if module has crcs. */
-static inline int same_magic(const char *amagic, const char *bmagic,
- bool has_crcs)
-{
- if (has_crcs) {
- amagic += strcspn(amagic, " ");
- bmagic += strcspn(bmagic, " ");
- }
- return strcmp(amagic, bmagic) == 0;
-}
-#else
-static inline int check_version(const struct load_info *info,
- const char *symname,
- struct module *mod,
- const s32 *crc)
-{
- return 1;
-}
-
-static inline int check_modstruct_version(const struct load_info *info,
- struct module *mod)
-{
- return 1;
-}
-
-static inline int same_magic(const char *amagic, const char *bmagic,
- bool has_crcs)
-{
- return strcmp(amagic, bmagic) == 0;
-}
-#endif /* CONFIG_MODVERSIONS */
-
static char *get_modinfo(const struct load_info *info, const char *tag);
static char *get_next_modinfo(const struct load_info *info, const char *tag,
char *prev);
@@ -1372,20 +1028,20 @@ static int verify_namespace_is_imported(const struct load_info *info,
return 0;
}
-static bool inherit_taint(struct module *mod, struct module *owner)
+static bool inherit_taint(struct module *mod, struct module *owner, const char *name)
{
if (!owner || !test_bit(TAINT_PROPRIETARY_MODULE, &owner->taints))
return true;
if (mod->using_gplonly_symbols) {
- pr_err("%s: module using GPL-only symbols uses symbols from proprietary module %s.\n",
- mod->name, owner->name);
+ pr_err("%s: module using GPL-only symbols uses symbols %s from proprietary module %s.\n",
+ mod->name, name, owner->name);
return false;
}
if (!test_bit(TAINT_PROPRIETARY_MODULE, &mod->taints)) {
- pr_warn("%s: module uses symbols from proprietary module %s, inheriting taint.\n",
- mod->name, owner->name);
+ pr_warn("%s: module uses symbols %s from proprietary module %s, inheriting taint.\n",
+ mod->name, name, owner->name);
set_bit(TAINT_PROPRIETARY_MODULE, &mod->taints);
}
return true;
@@ -1417,7 +1073,7 @@ static const struct kernel_symbol *resolve_symbol(struct module *mod,
if (fsa.license == GPL_ONLY)
mod->using_gplonly_symbols = true;
- if (!inherit_taint(mod, fsa.owner)) {
+ if (!inherit_taint(mod, fsa.owner, name)) {
fsa.sym = NULL;
goto getname;
}
@@ -1465,674 +1121,6 @@ resolve_symbol_wait(struct module *mod,
return ksym;
}
-#ifdef CONFIG_KALLSYMS
-static inline bool sect_empty(const Elf_Shdr *sect)
-{
- return !(sect->sh_flags & SHF_ALLOC) || sect->sh_size == 0;
-}
-#endif
-
-/*
- * /sys/module/foo/sections stuff
- * J. Corbet <corbet@lwn.net>
- */
-#ifdef CONFIG_SYSFS
-
-#ifdef CONFIG_KALLSYMS
-struct module_sect_attr {
- struct bin_attribute battr;
- unsigned long address;
-};
-
-struct module_sect_attrs {
- struct attribute_group grp;
- unsigned int nsections;
- struct module_sect_attr attrs[];
-};
-
-#define MODULE_SECT_READ_SIZE (3 /* "0x", "\n" */ + (BITS_PER_LONG / 4))
-static ssize_t module_sect_read(struct file *file, struct kobject *kobj,
- struct bin_attribute *battr,
- char *buf, loff_t pos, size_t count)
-{
- struct module_sect_attr *sattr =
- container_of(battr, struct module_sect_attr, battr);
- char bounce[MODULE_SECT_READ_SIZE + 1];
- size_t wrote;
-
- if (pos != 0)
- return -EINVAL;
-
- /*
- * Since we're a binary read handler, we must account for the
- * trailing NUL byte that sprintf will write: if "buf" is
- * too small to hold the NUL, or the NUL is exactly the last
- * byte, the read will look like it got truncated by one byte.
- * Since there is no way to ask sprintf nicely to not write
- * the NUL, we have to use a bounce buffer.
- */
- wrote = scnprintf(bounce, sizeof(bounce), "0x%px\n",
- kallsyms_show_value(file->f_cred)
- ? (void *)sattr->address : NULL);
- count = min(count, wrote);
- memcpy(buf, bounce, count);
-
- return count;
-}
-
-static void free_sect_attrs(struct module_sect_attrs *sect_attrs)
-{
- unsigned int section;
-
- for (section = 0; section < sect_attrs->nsections; section++)
- kfree(sect_attrs->attrs[section].battr.attr.name);
- kfree(sect_attrs);
-}
-
-static void add_sect_attrs(struct module *mod, const struct load_info *info)
-{
- unsigned int nloaded = 0, i, size[2];
- struct module_sect_attrs *sect_attrs;
- struct module_sect_attr *sattr;
- struct bin_attribute **gattr;
-
- /* Count loaded sections and allocate structures */
- for (i = 0; i < info->hdr->e_shnum; i++)
- if (!sect_empty(&info->sechdrs[i]))
- nloaded++;
- size[0] = ALIGN(struct_size(sect_attrs, attrs, nloaded),
- sizeof(sect_attrs->grp.bin_attrs[0]));
- size[1] = (nloaded + 1) * sizeof(sect_attrs->grp.bin_attrs[0]);
- sect_attrs = kzalloc(size[0] + size[1], GFP_KERNEL);
- if (sect_attrs == NULL)
- return;
-
- /* Setup section attributes. */
- sect_attrs->grp.name = "sections";
- sect_attrs->grp.bin_attrs = (void *)sect_attrs + size[0];
-
- sect_attrs->nsections = 0;
- sattr = &sect_attrs->attrs[0];
- gattr = &sect_attrs->grp.bin_attrs[0];
- for (i = 0; i < info->hdr->e_shnum; i++) {
- Elf_Shdr *sec = &info->sechdrs[i];
- if (sect_empty(sec))
- continue;
- sysfs_bin_attr_init(&sattr->battr);
- sattr->address = sec->sh_addr;
- sattr->battr.attr.name =
- kstrdup(info->secstrings + sec->sh_name, GFP_KERNEL);
- if (sattr->battr.attr.name == NULL)
- goto out;
- sect_attrs->nsections++;
- sattr->battr.read = module_sect_read;
- sattr->battr.size = MODULE_SECT_READ_SIZE;
- sattr->battr.attr.mode = 0400;
- *(gattr++) = &(sattr++)->battr;
- }
- *gattr = NULL;
-
- if (sysfs_create_group(&mod->mkobj.kobj, &sect_attrs->grp))
- goto out;
-
- mod->sect_attrs = sect_attrs;
- return;
- out:
- free_sect_attrs(sect_attrs);
-}
-
-static void remove_sect_attrs(struct module *mod)
-{
- if (mod->sect_attrs) {
- sysfs_remove_group(&mod->mkobj.kobj,
- &mod->sect_attrs->grp);
- /*
- * We are positive that no one is using any sect attrs
- * at this point. Deallocate immediately.
- */
- free_sect_attrs(mod->sect_attrs);
- mod->sect_attrs = NULL;
- }
-}
-
-/*
- * /sys/module/foo/notes/.section.name gives contents of SHT_NOTE sections.
- */
-
-struct module_notes_attrs {
- struct kobject *dir;
- unsigned int notes;
- struct bin_attribute attrs[];
-};
-
-static ssize_t module_notes_read(struct file *filp, struct kobject *kobj,
- struct bin_attribute *bin_attr,
- char *buf, loff_t pos, size_t count)
-{
- /*
- * The caller checked the pos and count against our size.
- */
- memcpy(buf, bin_attr->private + pos, count);
- return count;
-}
-
-static void free_notes_attrs(struct module_notes_attrs *notes_attrs,
- unsigned int i)
-{
- if (notes_attrs->dir) {
- while (i-- > 0)
- sysfs_remove_bin_file(notes_attrs->dir,
- &notes_attrs->attrs[i]);
- kobject_put(notes_attrs->dir);
- }
- kfree(notes_attrs);
-}
-
-static void add_notes_attrs(struct module *mod, const struct load_info *info)
-{
- unsigned int notes, loaded, i;
- struct module_notes_attrs *notes_attrs;
- struct bin_attribute *nattr;
-
- /* failed to create section attributes, so can't create notes */
- if (!mod->sect_attrs)
- return;
-
- /* Count notes sections and allocate structures. */
- notes = 0;
- for (i = 0; i < info->hdr->e_shnum; i++)
- if (!sect_empty(&info->sechdrs[i]) &&
- (info->sechdrs[i].sh_type == SHT_NOTE))
- ++notes;
-
- if (notes == 0)
- return;
-
- notes_attrs = kzalloc(struct_size(notes_attrs, attrs, notes),
- GFP_KERNEL);
- if (notes_attrs == NULL)
- return;
-
- notes_attrs->notes = notes;
- nattr = &notes_attrs->attrs[0];
- for (loaded = i = 0; i < info->hdr->e_shnum; ++i) {
- if (sect_empty(&info->sechdrs[i]))
- continue;
- if (info->sechdrs[i].sh_type == SHT_NOTE) {
- sysfs_bin_attr_init(nattr);
- nattr->attr.name = mod->sect_attrs->attrs[loaded].battr.attr.name;
- nattr->attr.mode = S_IRUGO;
- nattr->size = info->sechdrs[i].sh_size;
- nattr->private = (void *) info->sechdrs[i].sh_addr;
- nattr->read = module_notes_read;
- ++nattr;
- }
- ++loaded;
- }
-
- notes_attrs->dir = kobject_create_and_add("notes", &mod->mkobj.kobj);
- if (!notes_attrs->dir)
- goto out;
-
- for (i = 0; i < notes; ++i)
- if (sysfs_create_bin_file(notes_attrs->dir,
- &notes_attrs->attrs[i]))
- goto out;
-
- mod->notes_attrs = notes_attrs;
- return;
-
- out:
- free_notes_attrs(notes_attrs, i);
-}
-
-static void remove_notes_attrs(struct module *mod)
-{
- if (mod->notes_attrs)
- free_notes_attrs(mod->notes_attrs, mod->notes_attrs->notes);
-}
-
-#else
-
-static inline void add_sect_attrs(struct module *mod,
- const struct load_info *info)
-{
-}
-
-static inline void remove_sect_attrs(struct module *mod)
-{
-}
-
-static inline void add_notes_attrs(struct module *mod,
- const struct load_info *info)
-{
-}
-
-static inline void remove_notes_attrs(struct module *mod)
-{
-}
-#endif /* CONFIG_KALLSYMS */
-
-static void del_usage_links(struct module *mod)
-{
-#ifdef CONFIG_MODULE_UNLOAD
- struct module_use *use;
-
- mutex_lock(&module_mutex);
- list_for_each_entry(use, &mod->target_list, target_list)
- sysfs_remove_link(use->target->holders_dir, mod->name);
- mutex_unlock(&module_mutex);
-#endif
-}
-
-static int add_usage_links(struct module *mod)
-{
- int ret = 0;
-#ifdef CONFIG_MODULE_UNLOAD
- struct module_use *use;
-
- mutex_lock(&module_mutex);
- list_for_each_entry(use, &mod->target_list, target_list) {
- ret = sysfs_create_link(use->target->holders_dir,
- &mod->mkobj.kobj, mod->name);
- if (ret)
- break;
- }
- mutex_unlock(&module_mutex);
- if (ret)
- del_usage_links(mod);
-#endif
- return ret;
-}
-
-static void module_remove_modinfo_attrs(struct module *mod, int end);
-
-static int module_add_modinfo_attrs(struct module *mod)
-{
- struct module_attribute *attr;
- struct module_attribute *temp_attr;
- int error = 0;
- int i;
-
- mod->modinfo_attrs = kzalloc((sizeof(struct module_attribute) *
- (ARRAY_SIZE(modinfo_attrs) + 1)),
- GFP_KERNEL);
- if (!mod->modinfo_attrs)
- return -ENOMEM;
-
- temp_attr = mod->modinfo_attrs;
- for (i = 0; (attr = modinfo_attrs[i]); i++) {
- if (!attr->test || attr->test(mod)) {
- memcpy(temp_attr, attr, sizeof(*temp_attr));
- sysfs_attr_init(&temp_attr->attr);
- error = sysfs_create_file(&mod->mkobj.kobj,
- &temp_attr->attr);
- if (error)
- goto error_out;
- ++temp_attr;
- }
- }
-
- return 0;
-
-error_out:
- if (i > 0)
- module_remove_modinfo_attrs(mod, --i);
- else
- kfree(mod->modinfo_attrs);
- return error;
-}
-
-static void module_remove_modinfo_attrs(struct module *mod, int end)
-{
- struct module_attribute *attr;
- int i;
-
- for (i = 0; (attr = &mod->modinfo_attrs[i]); i++) {
- if (end >= 0 && i > end)
- break;
- /* pick a field to test for end of list */
- if (!attr->attr.name)
- break;
- sysfs_remove_file(&mod->mkobj.kobj, &attr->attr);
- if (attr->free)
- attr->free(mod);
- }
- kfree(mod->modinfo_attrs);
-}
-
-static void mod_kobject_put(struct module *mod)
-{
- DECLARE_COMPLETION_ONSTACK(c);
- mod->mkobj.kobj_completion = &c;
- kobject_put(&mod->mkobj.kobj);
- wait_for_completion(&c);
-}
-
-static int mod_sysfs_init(struct module *mod)
-{
- int err;
- struct kobject *kobj;
-
- if (!module_sysfs_initialized) {
- pr_err("%s: module sysfs not initialized\n", mod->name);
- err = -EINVAL;
- goto out;
- }
-
- kobj = kset_find_obj(module_kset, mod->name);
- if (kobj) {
- pr_err("%s: module is already loaded\n", mod->name);
- kobject_put(kobj);
- err = -EINVAL;
- goto out;
- }
-
- mod->mkobj.mod = mod;
-
- memset(&mod->mkobj.kobj, 0, sizeof(mod->mkobj.kobj));
- mod->mkobj.kobj.kset = module_kset;
- err = kobject_init_and_add(&mod->mkobj.kobj, &module_ktype, NULL,
- "%s", mod->name);
- if (err)
- mod_kobject_put(mod);
-
-out:
- return err;
-}
-
-static int mod_sysfs_setup(struct module *mod,
- const struct load_info *info,
- struct kernel_param *kparam,
- unsigned int num_params)
-{
- int err;
-
- err = mod_sysfs_init(mod);
- if (err)
- goto out;
-
- mod->holders_dir = kobject_create_and_add("holders", &mod->mkobj.kobj);
- if (!mod->holders_dir) {
- err = -ENOMEM;
- goto out_unreg;
- }
-
- err = module_param_sysfs_setup(mod, kparam, num_params);
- if (err)
- goto out_unreg_holders;
-
- err = module_add_modinfo_attrs(mod);
- if (err)
- goto out_unreg_param;
-
- err = add_usage_links(mod);
- if (err)
- goto out_unreg_modinfo_attrs;
-
- add_sect_attrs(mod, info);
- add_notes_attrs(mod, info);
-
- return 0;
-
-out_unreg_modinfo_attrs:
- module_remove_modinfo_attrs(mod, -1);
-out_unreg_param:
- module_param_sysfs_remove(mod);
-out_unreg_holders:
- kobject_put(mod->holders_dir);
-out_unreg:
- mod_kobject_put(mod);
-out:
- return err;
-}
-
-static void mod_sysfs_fini(struct module *mod)
-{
- remove_notes_attrs(mod);
- remove_sect_attrs(mod);
- mod_kobject_put(mod);
-}
-
-static void init_param_lock(struct module *mod)
-{
- mutex_init(&mod->param_lock);
-}
-#else /* !CONFIG_SYSFS */
-
-static int mod_sysfs_setup(struct module *mod,
- const struct load_info *info,
- struct kernel_param *kparam,
- unsigned int num_params)
-{
- return 0;
-}
-
-static void mod_sysfs_fini(struct module *mod)
-{
-}
-
-static void module_remove_modinfo_attrs(struct module *mod, int end)
-{
-}
-
-static void del_usage_links(struct module *mod)
-{
-}
-
-static void init_param_lock(struct module *mod)
-{
-}
-#endif /* CONFIG_SYSFS */
-
-static void mod_sysfs_teardown(struct module *mod)
-{
- del_usage_links(mod);
- module_remove_modinfo_attrs(mod, -1);
- module_param_sysfs_remove(mod);
- kobject_put(mod->mkobj.drivers_dir);
- kobject_put(mod->holders_dir);
- mod_sysfs_fini(mod);
-}
-
-/*
- * LKM RO/NX protection: protect module's text/ro-data
- * from modification and any data from execution.
- *
- * General layout of module is:
- * [text] [read-only-data] [ro-after-init] [writable data]
- * text_size -----^ ^ ^ ^
- * ro_size ------------------------| | |
- * ro_after_init_size -----------------------------| |
- * size -----------------------------------------------------------|
- *
- * These values are always page-aligned (as is base)
- */
-
-/*
- * Since some arches are moving towards PAGE_KERNEL module allocations instead
- * of PAGE_KERNEL_EXEC, keep frob_text() and module_enable_x() outside of the
- * CONFIG_STRICT_MODULE_RWX block below because they are needed regardless of
- * whether we are strict.
- */
-#ifdef CONFIG_ARCH_HAS_STRICT_MODULE_RWX
-static void frob_text(const struct module_layout *layout,
- int (*set_memory)(unsigned long start, int num_pages))
-{
- BUG_ON((unsigned long)layout->base & (PAGE_SIZE-1));
- BUG_ON((unsigned long)layout->text_size & (PAGE_SIZE-1));
- set_memory((unsigned long)layout->base,
- layout->text_size >> PAGE_SHIFT);
-}
-
-static void module_enable_x(const struct module *mod)
-{
- frob_text(&mod->core_layout, set_memory_x);
- frob_text(&mod->init_layout, set_memory_x);
-}
-#else /* !CONFIG_ARCH_HAS_STRICT_MODULE_RWX */
-static void module_enable_x(const struct module *mod) { }
-#endif /* CONFIG_ARCH_HAS_STRICT_MODULE_RWX */
-
-#ifdef CONFIG_STRICT_MODULE_RWX
-static void frob_rodata(const struct module_layout *layout,
- int (*set_memory)(unsigned long start, int num_pages))
-{
- BUG_ON((unsigned long)layout->base & (PAGE_SIZE-1));
- BUG_ON((unsigned long)layout->text_size & (PAGE_SIZE-1));
- BUG_ON((unsigned long)layout->ro_size & (PAGE_SIZE-1));
- set_memory((unsigned long)layout->base + layout->text_size,
- (layout->ro_size - layout->text_size) >> PAGE_SHIFT);
-}
-
-static void frob_ro_after_init(const struct module_layout *layout,
- int (*set_memory)(unsigned long start, int num_pages))
-{
- BUG_ON((unsigned long)layout->base & (PAGE_SIZE-1));
- BUG_ON((unsigned long)layout->ro_size & (PAGE_SIZE-1));
- BUG_ON((unsigned long)layout->ro_after_init_size & (PAGE_SIZE-1));
- set_memory((unsigned long)layout->base + layout->ro_size,
- (layout->ro_after_init_size - layout->ro_size) >> PAGE_SHIFT);
-}
-
-static void frob_writable_data(const struct module_layout *layout,
- int (*set_memory)(unsigned long start, int num_pages))
-{
- BUG_ON((unsigned long)layout->base & (PAGE_SIZE-1));
- BUG_ON((unsigned long)layout->ro_after_init_size & (PAGE_SIZE-1));
- BUG_ON((unsigned long)layout->size & (PAGE_SIZE-1));
- set_memory((unsigned long)layout->base + layout->ro_after_init_size,
- (layout->size - layout->ro_after_init_size) >> PAGE_SHIFT);
-}
-
-static void module_enable_ro(const struct module *mod, bool after_init)
-{
- if (!rodata_enabled)
- return;
-
- set_vm_flush_reset_perms(mod->core_layout.base);
- set_vm_flush_reset_perms(mod->init_layout.base);
- frob_text(&mod->core_layout, set_memory_ro);
-
- frob_rodata(&mod->core_layout, set_memory_ro);
- frob_text(&mod->init_layout, set_memory_ro);
- frob_rodata(&mod->init_layout, set_memory_ro);
-
- if (after_init)
- frob_ro_after_init(&mod->core_layout, set_memory_ro);
-}
-
-static void module_enable_nx(const struct module *mod)
-{
- frob_rodata(&mod->core_layout, set_memory_nx);
- frob_ro_after_init(&mod->core_layout, set_memory_nx);
- frob_writable_data(&mod->core_layout, set_memory_nx);
- frob_rodata(&mod->init_layout, set_memory_nx);
- frob_writable_data(&mod->init_layout, set_memory_nx);
-}
-
-static int module_enforce_rwx_sections(Elf_Ehdr *hdr, Elf_Shdr *sechdrs,
- char *secstrings, struct module *mod)
-{
- const unsigned long shf_wx = SHF_WRITE|SHF_EXECINSTR;
- int i;
-
- for (i = 0; i < hdr->e_shnum; i++) {
- if ((sechdrs[i].sh_flags & shf_wx) == shf_wx) {
- pr_err("%s: section %s (index %d) has invalid WRITE|EXEC flags\n",
- mod->name, secstrings + sechdrs[i].sh_name, i);
- return -ENOEXEC;
- }
- }
-
- return 0;
-}
-
-#else /* !CONFIG_STRICT_MODULE_RWX */
-static void module_enable_nx(const struct module *mod) { }
-static void module_enable_ro(const struct module *mod, bool after_init) {}
-static int module_enforce_rwx_sections(Elf_Ehdr *hdr, Elf_Shdr *sechdrs,
- char *secstrings, struct module *mod)
-{
- return 0;
-}
-#endif /* CONFIG_STRICT_MODULE_RWX */
-
-#ifdef CONFIG_LIVEPATCH
-/*
- * Persist Elf information about a module. Copy the Elf header,
- * section header table, section string table, and symtab section
- * index from info to mod->klp_info.
- */
-static int copy_module_elf(struct module *mod, struct load_info *info)
-{
- unsigned int size, symndx;
- int ret;
-
- size = sizeof(*mod->klp_info);
- mod->klp_info = kmalloc(size, GFP_KERNEL);
- if (mod->klp_info == NULL)
- return -ENOMEM;
-
- /* Elf header */
- size = sizeof(mod->klp_info->hdr);
- memcpy(&mod->klp_info->hdr, info->hdr, size);
-
- /* Elf section header table */
- size = sizeof(*info->sechdrs) * info->hdr->e_shnum;
- mod->klp_info->sechdrs = kmemdup(info->sechdrs, size, GFP_KERNEL);
- if (mod->klp_info->sechdrs == NULL) {
- ret = -ENOMEM;
- goto free_info;
- }
-
- /* Elf section name string table */
- size = info->sechdrs[info->hdr->e_shstrndx].sh_size;
- mod->klp_info->secstrings = kmemdup(info->secstrings, size, GFP_KERNEL);
- if (mod->klp_info->secstrings == NULL) {
- ret = -ENOMEM;
- goto free_sechdrs;
- }
-
- /* Elf symbol section index */
- symndx = info->index.sym;
- mod->klp_info->symndx = symndx;
-
- /*
- * For livepatch modules, core_kallsyms.symtab is a complete
- * copy of the original symbol table. Adjust sh_addr to point
- * to core_kallsyms.symtab since the copy of the symtab in module
- * init memory is freed at the end of do_init_module().
- */
- mod->klp_info->sechdrs[symndx].sh_addr = \
- (unsigned long) mod->core_kallsyms.symtab;
-
- return 0;
-
-free_sechdrs:
- kfree(mod->klp_info->sechdrs);
-free_info:
- kfree(mod->klp_info);
- return ret;
-}
-
-static void free_module_elf(struct module *mod)
-{
- kfree(mod->klp_info->sechdrs);
- kfree(mod->klp_info->secstrings);
- kfree(mod->klp_info);
-}
-#else /* !CONFIG_LIVEPATCH */
-static int copy_module_elf(struct module *mod, struct load_info *info)
-{
- return 0;
-}
-
-static void free_module_elf(struct module *mod)
-{
-}
-#endif /* CONFIG_LIVEPATCH */
-
void __weak module_memfree(void *module_region)
{
/*
@@ -2192,6 +1180,9 @@ static void free_module(struct module *mod)
module_bug_cleanup(mod);
/* Wait for RCU-sched synchronizing before releasing mod->list and buglist. */
synchronize_rcu();
+ if (try_add_tainted_module(mod))
+ pr_err("%s: adding tainted module to the unloaded tainted modules list failed.\n",
+ mod->name);
mutex_unlock(&module_mutex);
/* Clean up CFI for the module. */
@@ -2204,10 +1195,13 @@ static void free_module(struct module *mod)
percpu_modfree(mod);
/* Free lock-classes; relies on the preceding sync_rcu(). */
- lockdep_free_key_range(mod->core_layout.base, mod->core_layout.size);
+ lockdep_free_key_range(mod->data_layout.base, mod->data_layout.size);
/* Finally, free the core (containing the module structure) */
module_memfree(mod->core_layout.base);
+#ifdef CONFIG_ARCH_WANTS_MODULES_DATA_IN_VMALLOC
+ vfree(mod->data_layout.base);
+#endif
}
void *__symbol_get(const char *symbol)
@@ -2395,7 +1389,7 @@ unsigned int __weak arch_mod_section_prepend(struct module *mod,
}
/* Update size with this section: return offset. */
-static long get_offset(struct module *mod, unsigned int *size,
+long module_get_offset(struct module *mod, unsigned int *size,
Elf_Shdr *sechdr, unsigned int section)
{
long ret;
@@ -2445,30 +1439,32 @@ static void layout_sections(struct module *mod, struct load_info *info)
for (i = 0; i < info->hdr->e_shnum; ++i) {
Elf_Shdr *s = &info->sechdrs[i];
const char *sname = info->secstrings + s->sh_name;
+ unsigned int *sizep;
if ((s->sh_flags & masks[m][0]) != masks[m][0]
|| (s->sh_flags & masks[m][1])
|| s->sh_entsize != ~0UL
|| module_init_layout_section(sname))
continue;
- s->sh_entsize = get_offset(mod, &mod->core_layout.size, s, i);
+ sizep = m ? &mod->data_layout.size : &mod->core_layout.size;
+ s->sh_entsize = module_get_offset(mod, sizep, s, i);
pr_debug("\t%s\n", sname);
}
switch (m) {
case 0: /* executable */
- mod->core_layout.size = debug_align(mod->core_layout.size);
+ mod->core_layout.size = strict_align(mod->core_layout.size);
mod->core_layout.text_size = mod->core_layout.size;
break;
case 1: /* RO: text and ro-data */
- mod->core_layout.size = debug_align(mod->core_layout.size);
- mod->core_layout.ro_size = mod->core_layout.size;
+ mod->data_layout.size = strict_align(mod->data_layout.size);
+ mod->data_layout.ro_size = mod->data_layout.size;
break;
case 2: /* RO after init */
- mod->core_layout.size = debug_align(mod->core_layout.size);
- mod->core_layout.ro_after_init_size = mod->core_layout.size;
+ mod->data_layout.size = strict_align(mod->data_layout.size);
+ mod->data_layout.ro_after_init_size = mod->data_layout.size;
break;
case 4: /* whole core */
- mod->core_layout.size = debug_align(mod->core_layout.size);
+ mod->data_layout.size = strict_align(mod->data_layout.size);
break;
}
}
@@ -2484,17 +1480,17 @@ static void layout_sections(struct module *mod, struct load_info *info)
|| s->sh_entsize != ~0UL
|| !module_init_layout_section(sname))
continue;
- s->sh_entsize = (get_offset(mod, &mod->init_layout.size, s, i)
+ s->sh_entsize = (module_get_offset(mod, &mod->init_layout.size, s, i)
| INIT_OFFSET_MASK);
pr_debug("\t%s\n", sname);
}
switch (m) {
case 0: /* executable */
- mod->init_layout.size = debug_align(mod->init_layout.size);
+ mod->init_layout.size = strict_align(mod->init_layout.size);
mod->init_layout.text_size = mod->init_layout.size;
break;
case 1: /* RO: text and ro-data */
- mod->init_layout.size = debug_align(mod->init_layout.size);
+ mod->init_layout.size = strict_align(mod->init_layout.size);
mod->init_layout.ro_size = mod->init_layout.size;
break;
case 2:
@@ -2505,7 +1501,7 @@ static void layout_sections(struct module *mod, struct load_info *info)
mod->init_layout.ro_after_init_size = mod->init_layout.ro_size;
break;
case 4: /* whole init */
- mod->init_layout.size = debug_align(mod->init_layout.size);
+ mod->init_layout.size = strict_align(mod->init_layout.size);
break;
}
}
@@ -2597,228 +1593,6 @@ static void free_modinfo(struct module *mod)
}
}
-#ifdef CONFIG_KALLSYMS
-
-/* Lookup exported symbol in given range of kernel_symbols */
-static const struct kernel_symbol *lookup_exported_symbol(const char *name,
- const struct kernel_symbol *start,
- const struct kernel_symbol *stop)
-{
- return bsearch(name, start, stop - start,
- sizeof(struct kernel_symbol), cmp_name);
-}
-
-static int is_exported(const char *name, unsigned long value,
- const struct module *mod)
-{
- const struct kernel_symbol *ks;
- if (!mod)
- ks = lookup_exported_symbol(name, __start___ksymtab, __stop___ksymtab);
- else
- ks = lookup_exported_symbol(name, mod->syms, mod->syms + mod->num_syms);
-
- return ks != NULL && kernel_symbol_value(ks) == value;
-}
-
-/* As per nm */
-static char elf_type(const Elf_Sym *sym, const struct load_info *info)
-{
- const Elf_Shdr *sechdrs = info->sechdrs;
-
- if (ELF_ST_BIND(sym->st_info) == STB_WEAK) {
- if (ELF_ST_TYPE(sym->st_info) == STT_OBJECT)
- return 'v';
- else
- return 'w';
- }
- if (sym->st_shndx == SHN_UNDEF)
- return 'U';
- if (sym->st_shndx == SHN_ABS || sym->st_shndx == info->index.pcpu)
- return 'a';
- if (sym->st_shndx >= SHN_LORESERVE)
- return '?';
- if (sechdrs[sym->st_shndx].sh_flags & SHF_EXECINSTR)
- return 't';
- if (sechdrs[sym->st_shndx].sh_flags & SHF_ALLOC
- && sechdrs[sym->st_shndx].sh_type != SHT_NOBITS) {
- if (!(sechdrs[sym->st_shndx].sh_flags & SHF_WRITE))
- return 'r';
- else if (sechdrs[sym->st_shndx].sh_flags & ARCH_SHF_SMALL)
- return 'g';
- else
- return 'd';
- }
- if (sechdrs[sym->st_shndx].sh_type == SHT_NOBITS) {
- if (sechdrs[sym->st_shndx].sh_flags & ARCH_SHF_SMALL)
- return 's';
- else
- return 'b';
- }
- if (strstarts(info->secstrings + sechdrs[sym->st_shndx].sh_name,
- ".debug")) {
- return 'n';
- }
- return '?';
-}
-
-static bool is_core_symbol(const Elf_Sym *src, const Elf_Shdr *sechdrs,
- unsigned int shnum, unsigned int pcpundx)
-{
- const Elf_Shdr *sec;
-
- if (src->st_shndx == SHN_UNDEF
- || src->st_shndx >= shnum
- || !src->st_name)
- return false;
-
-#ifdef CONFIG_KALLSYMS_ALL
- if (src->st_shndx == pcpundx)
- return true;
-#endif
-
- sec = sechdrs + src->st_shndx;
- if (!(sec->sh_flags & SHF_ALLOC)
-#ifndef CONFIG_KALLSYMS_ALL
- || !(sec->sh_flags & SHF_EXECINSTR)
-#endif
- || (sec->sh_entsize & INIT_OFFSET_MASK))
- return false;
-
- return true;
-}
-
-/*
- * We only allocate and copy the strings needed by the parts of symtab
- * we keep. This is simple, but has the effect of making multiple
- * copies of duplicates. We could be more sophisticated, see
- * linux-kernel thread starting with
- * <73defb5e4bca04a6431392cc341112b1@localhost>.
- */
-static void layout_symtab(struct module *mod, struct load_info *info)
-{
- Elf_Shdr *symsect = info->sechdrs + info->index.sym;
- Elf_Shdr *strsect = info->sechdrs + info->index.str;
- const Elf_Sym *src;
- unsigned int i, nsrc, ndst, strtab_size = 0;
-
- /* Put symbol section at end of init part of module. */
- symsect->sh_flags |= SHF_ALLOC;
- symsect->sh_entsize = get_offset(mod, &mod->init_layout.size, symsect,
- info->index.sym) | INIT_OFFSET_MASK;
- pr_debug("\t%s\n", info->secstrings + symsect->sh_name);
-
- src = (void *)info->hdr + symsect->sh_offset;
- nsrc = symsect->sh_size / sizeof(*src);
-
- /* Compute total space required for the core symbols' strtab. */
- for (ndst = i = 0; i < nsrc; i++) {
- if (i == 0 || is_livepatch_module(mod) ||
- is_core_symbol(src+i, info->sechdrs, info->hdr->e_shnum,
- info->index.pcpu)) {
- strtab_size += strlen(&info->strtab[src[i].st_name])+1;
- ndst++;
- }
- }
-
- /* Append room for core symbols at end of core part. */
- info->symoffs = ALIGN(mod->core_layout.size, symsect->sh_addralign ?: 1);
- info->stroffs = mod->core_layout.size = info->symoffs + ndst * sizeof(Elf_Sym);
- mod->core_layout.size += strtab_size;
- info->core_typeoffs = mod->core_layout.size;
- mod->core_layout.size += ndst * sizeof(char);
- mod->core_layout.size = debug_align(mod->core_layout.size);
-
- /* Put string table section at end of init part of module. */
- strsect->sh_flags |= SHF_ALLOC;
- strsect->sh_entsize = get_offset(mod, &mod->init_layout.size, strsect,
- info->index.str) | INIT_OFFSET_MASK;
- pr_debug("\t%s\n", info->secstrings + strsect->sh_name);
-
- /* We'll tack temporary mod_kallsyms on the end. */
- mod->init_layout.size = ALIGN(mod->init_layout.size,
- __alignof__(struct mod_kallsyms));
- info->mod_kallsyms_init_off = mod->init_layout.size;
- mod->init_layout.size += sizeof(struct mod_kallsyms);
- info->init_typeoffs = mod->init_layout.size;
- mod->init_layout.size += nsrc * sizeof(char);
- mod->init_layout.size = debug_align(mod->init_layout.size);
-}
-
-/*
- * We use the full symtab and strtab which layout_symtab arranged to
- * be appended to the init section. Later we switch to the cut-down
- * core-only ones.
- */
-static void add_kallsyms(struct module *mod, const struct load_info *info)
-{
- unsigned int i, ndst;
- const Elf_Sym *src;
- Elf_Sym *dst;
- char *s;
- Elf_Shdr *symsec = &info->sechdrs[info->index.sym];
-
- /* Set up to point into init section. */
- mod->kallsyms = mod->init_layout.base + info->mod_kallsyms_init_off;
-
- mod->kallsyms->symtab = (void *)symsec->sh_addr;
- mod->kallsyms->num_symtab = symsec->sh_size / sizeof(Elf_Sym);
- /* Make sure we get permanent strtab: don't use info->strtab. */
- mod->kallsyms->strtab = (void *)info->sechdrs[info->index.str].sh_addr;
- mod->kallsyms->typetab = mod->init_layout.base + info->init_typeoffs;
-
- /*
- * Now populate the cut down core kallsyms for after init
- * and set types up while we still have access to sections.
- */
- mod->core_kallsyms.symtab = dst = mod->core_layout.base + info->symoffs;
- mod->core_kallsyms.strtab = s = mod->core_layout.base + info->stroffs;
- mod->core_kallsyms.typetab = mod->core_layout.base + info->core_typeoffs;
- src = mod->kallsyms->symtab;
- for (ndst = i = 0; i < mod->kallsyms->num_symtab; i++) {
- mod->kallsyms->typetab[i] = elf_type(src + i, info);
- if (i == 0 || is_livepatch_module(mod) ||
- is_core_symbol(src+i, info->sechdrs, info->hdr->e_shnum,
- info->index.pcpu)) {
- mod->core_kallsyms.typetab[ndst] =
- mod->kallsyms->typetab[i];
- dst[ndst] = src[i];
- dst[ndst++].st_name = s - mod->core_kallsyms.strtab;
- s += strlcpy(s, &mod->kallsyms->strtab[src[i].st_name],
- KSYM_NAME_LEN) + 1;
- }
- }
- mod->core_kallsyms.num_symtab = ndst;
-}
-#else
-static inline void layout_symtab(struct module *mod, struct load_info *info)
-{
-}
-
-static void add_kallsyms(struct module *mod, const struct load_info *info)
-{
-}
-#endif /* CONFIG_KALLSYMS */
-
-#if IS_ENABLED(CONFIG_KALLSYMS) && IS_ENABLED(CONFIG_STACKTRACE_BUILD_ID)
-static void init_build_id(struct module *mod, const struct load_info *info)
-{
- const Elf_Shdr *sechdr;
- unsigned int i;
-
- for (i = 0; i < info->hdr->e_shnum; i++) {
- sechdr = &info->sechdrs[i];
- if (!sect_empty(sechdr) && sechdr->sh_type == SHT_NOTE &&
- !build_id_parse_buf((void *)sechdr->sh_addr, mod->build_id,
- sechdr->sh_size))
- break;
- }
-}
-#else
-static void init_build_id(struct module *mod, const struct load_info *info)
-{
-}
-#endif
-
static void dynamic_debug_setup(struct module *mod, struct _ddebug *debug, unsigned int num)
{
if (!debug)
@@ -2849,97 +1623,6 @@ bool __weak module_exit_section(const char *name)
return strstarts(name, ".exit");
}
-#ifdef CONFIG_DEBUG_KMEMLEAK
-static void kmemleak_load_module(const struct module *mod,
- const struct load_info *info)
-{
- unsigned int i;
-
- /* only scan the sections containing data */
- kmemleak_scan_area(mod, sizeof(struct module), GFP_KERNEL);
-
- for (i = 1; i < info->hdr->e_shnum; i++) {
- /* Scan all writable sections that's not executable */
- if (!(info->sechdrs[i].sh_flags & SHF_ALLOC) ||
- !(info->sechdrs[i].sh_flags & SHF_WRITE) ||
- (info->sechdrs[i].sh_flags & SHF_EXECINSTR))
- continue;
-
- kmemleak_scan_area((void *)info->sechdrs[i].sh_addr,
- info->sechdrs[i].sh_size, GFP_KERNEL);
- }
-}
-#else
-static inline void kmemleak_load_module(const struct module *mod,
- const struct load_info *info)
-{
-}
-#endif
-
-#ifdef CONFIG_MODULE_SIG
-static int module_sig_check(struct load_info *info, int flags)
-{
- int err = -ENODATA;
- const unsigned long markerlen = sizeof(MODULE_SIG_STRING) - 1;
- const char *reason;
- const void *mod = info->hdr;
- bool mangled_module = flags & (MODULE_INIT_IGNORE_MODVERSIONS |
- MODULE_INIT_IGNORE_VERMAGIC);
- /*
- * Do not allow mangled modules as a module with version information
- * removed is no longer the module that was signed.
- */
- if (!mangled_module &&
- info->len > markerlen &&
- memcmp(mod + info->len - markerlen, MODULE_SIG_STRING, markerlen) == 0) {
- /* We truncate the module to discard the signature */
- info->len -= markerlen;
- err = mod_verify_sig(mod, info);
- if (!err) {
- info->sig_ok = true;
- return 0;
- }
- }
-
- /*
- * We don't permit modules to be loaded into the trusted kernels
- * without a valid signature on them, but if we're not enforcing,
- * certain errors are non-fatal.
- */
- switch (err) {
- case -ENODATA:
- reason = "unsigned module";
- break;
- case -ENOPKG:
- reason = "module with unsupported crypto";
- break;
- case -ENOKEY:
- reason = "module with unavailable key";
- break;
-
- default:
- /*
- * All other errors are fatal, including lack of memory,
- * unparseable signatures, and signature check failures --
- * even if signatures aren't required.
- */
- return err;
- }
-
- if (is_module_sig_enforced()) {
- pr_notice("Loading of %s is rejected\n", reason);
- return -EKEYREJECTED;
- }
-
- return security_locked_down(LOCKDOWN_MODULE_SIGNATURE);
-}
-#else /* !CONFIG_MODULE_SIG */
-static int module_sig_check(struct load_info *info, int flags)
-{
- return 0;
-}
-#endif /* !CONFIG_MODULE_SIG */
-
static int validate_section_offset(struct load_info *info, Elf_Shdr *shdr)
{
#if defined(CONFIG_64BIT)
@@ -3033,6 +1716,10 @@ static int elf_validity_check(struct load_info *info)
* strings in the section safe.
*/
info->secstrings = (void *)info->hdr + strhdr->sh_offset;
+ if (strhdr->sh_size == 0) {
+ pr_err("empty section name table\n");
+ goto no_exec;
+ }
if (info->secstrings[strhdr->sh_size - 1] != '\0') {
pr_err("ELF Spec violation: section name table isn't null terminated\n");
goto no_exec;
@@ -3107,30 +1794,23 @@ static int copy_chunked_from_user(void *dst, const void __user *usrc, unsigned l
return 0;
}
-#ifdef CONFIG_LIVEPATCH
static int check_modinfo_livepatch(struct module *mod, struct load_info *info)
{
- if (get_modinfo(info, "livepatch")) {
- mod->klp = true;
+ if (!get_modinfo(info, "livepatch"))
+ /* Nothing more to do */
+ return 0;
+
+ if (set_livepatch_module(mod)) {
add_taint_module(mod, TAINT_LIVEPATCH, LOCKDEP_STILL_OK);
pr_notice_once("%s: tainting kernel with TAINT_LIVEPATCH\n",
- mod->name);
- }
-
- return 0;
-}
-#else /* !CONFIG_LIVEPATCH */
-static int check_modinfo_livepatch(struct module *mod, struct load_info *info)
-{
- if (get_modinfo(info, "livepatch")) {
- pr_err("%s: module is marked as livepatch module, but livepatch support is disabled",
- mod->name);
- return -ENOEXEC;
+ mod->name);
+ return 0;
}
- return 0;
+ pr_err("%s: module is marked as livepatch module, but livepatch support is disabled",
+ mod->name);
+ return -ENOEXEC;
}
-#endif /* CONFIG_LIVEPATCH */
static void check_modinfo_retpoline(struct module *mod, struct load_info *info)
{
@@ -3456,6 +2136,24 @@ static int move_module(struct module *mod, struct load_info *info)
} else
mod->init_layout.base = NULL;
+#ifdef CONFIG_ARCH_WANTS_MODULES_DATA_IN_VMALLOC
+ /* Do the allocs. */
+ ptr = vmalloc(mod->data_layout.size);
+ /*
+ * The pointer to this block is stored in the module structure
+ * which is inside the block. Just mark it as not being a
+ * leak.
+ */
+ kmemleak_not_leak(ptr);
+ if (!ptr) {
+ module_memfree(mod->core_layout.base);
+ module_memfree(mod->init_layout.base);
+ return -ENOMEM;
+ }
+
+ memset(ptr, 0, mod->data_layout.size);
+ mod->data_layout.base = ptr;
+#endif
/* Transfer each section which specifies SHF_ALLOC */
pr_debug("final section addresses:\n");
for (i = 0; i < info->hdr->e_shnum; i++) {
@@ -3468,6 +2166,8 @@ static int move_module(struct module *mod, struct load_info *info)
if (shdr->sh_entsize & INIT_OFFSET_MASK)
dest = mod->init_layout.base
+ (shdr->sh_entsize & ~INIT_OFFSET_MASK);
+ else if (!(shdr->sh_flags & SHF_EXECINSTR))
+ dest = mod->data_layout.base + shdr->sh_entsize;
else
dest = mod->core_layout.base + shdr->sh_entsize;
@@ -3629,6 +2329,9 @@ static void module_deallocate(struct module *mod, struct load_info *info)
module_arch_freeing_init(mod);
module_memfree(mod->init_layout.base);
module_memfree(mod->core_layout.base);
+#ifdef CONFIG_ARCH_WANTS_MODULES_DATA_IN_VMALLOC
+ vfree(mod->data_layout.base);
+#endif
}
int __weak module_finalize(const Elf_Ehdr *hdr,
@@ -3879,6 +2582,9 @@ static int complete_formation(struct module *mod, struct load_info *info)
/* This relies on module_mutex for list integrity. */
module_bug_finalize(info->hdr, info->sechdrs, mod);
+ if (module_check_misalignment(mod))
+ goto out_misaligned;
+
module_enable_ro(mod, false);
module_enable_nx(mod);
module_enable_x(mod);
@@ -3892,6 +2598,8 @@ static int complete_formation(struct module *mod, struct load_info *info)
return 0;
+out_misaligned:
+ err = -EINVAL;
out:
mutex_unlock(&module_mutex);
return err;
@@ -4158,7 +2866,7 @@ static int load_module(struct load_info *info, const char __user *uargs,
mutex_unlock(&module_mutex);
free_module:
/* Free lock-classes; relies on the preceding sync_rcu() */
- lockdep_free_key_range(mod->core_layout.base, mod->core_layout.size);
+ lockdep_free_key_range(mod->data_layout.base, mod->data_layout.size);
module_deallocate(mod, info);
free_copy:
@@ -4227,287 +2935,6 @@ static inline int within(unsigned long addr, void *start, unsigned long size)
return ((void *)addr >= start && (void *)addr < start + size);
}
-#ifdef CONFIG_KALLSYMS
-/*
- * This ignores the intensely annoying "mapping symbols" found
- * in ARM ELF files: $a, $t and $d.
- */
-static inline int is_arm_mapping_symbol(const char *str)
-{
- if (str[0] == '.' && str[1] == 'L')
- return true;
- return str[0] == '$' && strchr("axtd", str[1])
- && (str[2] == '\0' || str[2] == '.');
-}
-
-static const char *kallsyms_symbol_name(struct mod_kallsyms *kallsyms, unsigned int symnum)
-{
- return kallsyms->strtab + kallsyms->symtab[symnum].st_name;
-}
-
-/*
- * Given a module and address, find the corresponding symbol and return its name
- * while providing its size and offset if needed.
- */
-static const char *find_kallsyms_symbol(struct module *mod,
- unsigned long addr,
- unsigned long *size,
- unsigned long *offset)
-{
- unsigned int i, best = 0;
- unsigned long nextval, bestval;
- struct mod_kallsyms *kallsyms = rcu_dereference_sched(mod->kallsyms);
-
- /* At worse, next value is at end of module */
- if (within_module_init(addr, mod))
- nextval = (unsigned long)mod->init_layout.base+mod->init_layout.text_size;
- else
- nextval = (unsigned long)mod->core_layout.base+mod->core_layout.text_size;
-
- bestval = kallsyms_symbol_value(&kallsyms->symtab[best]);
-
- /*
- * Scan for closest preceding symbol, and next symbol. (ELF
- * starts real symbols at 1).
- */
- for (i = 1; i < kallsyms->num_symtab; i++) {
- const Elf_Sym *sym = &kallsyms->symtab[i];
- unsigned long thisval = kallsyms_symbol_value(sym);
-
- if (sym->st_shndx == SHN_UNDEF)
- continue;
-
- /*
- * We ignore unnamed symbols: they're uninformative
- * and inserted at a whim.
- */
- if (*kallsyms_symbol_name(kallsyms, i) == '\0'
- || is_arm_mapping_symbol(kallsyms_symbol_name(kallsyms, i)))
- continue;
-
- if (thisval <= addr && thisval > bestval) {
- best = i;
- bestval = thisval;
- }
- if (thisval > addr && thisval < nextval)
- nextval = thisval;
- }
-
- if (!best)
- return NULL;
-
- if (size)
- *size = nextval - bestval;
- if (offset)
- *offset = addr - bestval;
-
- return kallsyms_symbol_name(kallsyms, best);
-}
-
-void * __weak dereference_module_function_descriptor(struct module *mod,
- void *ptr)
-{
- return ptr;
-}
-
-/*
- * For kallsyms to ask for address resolution. NULL means not found. Careful
- * not to lock to avoid deadlock on oopses, simply disable preemption.
- */
-const char *module_address_lookup(unsigned long addr,
- unsigned long *size,
- unsigned long *offset,
- char **modname,
- const unsigned char **modbuildid,
- char *namebuf)
-{
- const char *ret = NULL;
- struct module *mod;
-
- preempt_disable();
- mod = __module_address(addr);
- if (mod) {
- if (modname)
- *modname = mod->name;
- if (modbuildid) {
-#if IS_ENABLED(CONFIG_STACKTRACE_BUILD_ID)
- *modbuildid = mod->build_id;
-#else
- *modbuildid = NULL;
-#endif
- }
-
- ret = find_kallsyms_symbol(mod, addr, size, offset);
- }
- /* Make a copy in here where it's safe */
- if (ret) {
- strncpy(namebuf, ret, KSYM_NAME_LEN - 1);
- ret = namebuf;
- }
- preempt_enable();
-
- return ret;
-}
-
-int lookup_module_symbol_name(unsigned long addr, char *symname)
-{
- struct module *mod;
-
- preempt_disable();
- list_for_each_entry_rcu(mod, &modules, list) {
- if (mod->state == MODULE_STATE_UNFORMED)
- continue;
- if (within_module(addr, mod)) {
- const char *sym;
-
- sym = find_kallsyms_symbol(mod, addr, NULL, NULL);
- if (!sym)
- goto out;
-
- strlcpy(symname, sym, KSYM_NAME_LEN);
- preempt_enable();
- return 0;
- }
- }
-out:
- preempt_enable();
- return -ERANGE;
-}
-
-int lookup_module_symbol_attrs(unsigned long addr, unsigned long *size,
- unsigned long *offset, char *modname, char *name)
-{
- struct module *mod;
-
- preempt_disable();
- list_for_each_entry_rcu(mod, &modules, list) {
- if (mod->state == MODULE_STATE_UNFORMED)
- continue;
- if (within_module(addr, mod)) {
- const char *sym;
-
- sym = find_kallsyms_symbol(mod, addr, size, offset);
- if (!sym)
- goto out;
- if (modname)
- strlcpy(modname, mod->name, MODULE_NAME_LEN);
- if (name)
- strlcpy(name, sym, KSYM_NAME_LEN);
- preempt_enable();
- return 0;
- }
- }
-out:
- preempt_enable();
- return -ERANGE;
-}
-
-int module_get_kallsym(unsigned int symnum, unsigned long *value, char *type,
- char *name, char *module_name, int *exported)
-{
- struct module *mod;
-
- preempt_disable();
- list_for_each_entry_rcu(mod, &modules, list) {
- struct mod_kallsyms *kallsyms;
-
- if (mod->state == MODULE_STATE_UNFORMED)
- continue;
- kallsyms = rcu_dereference_sched(mod->kallsyms);
- if (symnum < kallsyms->num_symtab) {
- const Elf_Sym *sym = &kallsyms->symtab[symnum];
-
- *value = kallsyms_symbol_value(sym);
- *type = kallsyms->typetab[symnum];
- strlcpy(name, kallsyms_symbol_name(kallsyms, symnum), KSYM_NAME_LEN);
- strlcpy(module_name, mod->name, MODULE_NAME_LEN);
- *exported = is_exported(name, *value, mod);
- preempt_enable();
- return 0;
- }
- symnum -= kallsyms->num_symtab;
- }
- preempt_enable();
- return -ERANGE;
-}
-
-/* Given a module and name of symbol, find and return the symbol's value */
-static unsigned long find_kallsyms_symbol_value(struct module *mod, const char *name)
-{
- unsigned int i;
- struct mod_kallsyms *kallsyms = rcu_dereference_sched(mod->kallsyms);
-
- for (i = 0; i < kallsyms->num_symtab; i++) {
- const Elf_Sym *sym = &kallsyms->symtab[i];
-
- if (strcmp(name, kallsyms_symbol_name(kallsyms, i)) == 0 &&
- sym->st_shndx != SHN_UNDEF)
- return kallsyms_symbol_value(sym);
- }
- return 0;
-}
-
-/* Look for this name: can be of form module:name. */
-unsigned long module_kallsyms_lookup_name(const char *name)
-{
- struct module *mod;
- char *colon;
- unsigned long ret = 0;
-
- /* Don't lock: we're in enough trouble already. */
- preempt_disable();
- if ((colon = strnchr(name, MODULE_NAME_LEN, ':')) != NULL) {
- if ((mod = find_module_all(name, colon - name, false)) != NULL)
- ret = find_kallsyms_symbol_value(mod, colon+1);
- } else {
- list_for_each_entry_rcu(mod, &modules, list) {
- if (mod->state == MODULE_STATE_UNFORMED)
- continue;
- if ((ret = find_kallsyms_symbol_value(mod, name)) != 0)
- break;
- }
- }
- preempt_enable();
- return ret;
-}
-
-#ifdef CONFIG_LIVEPATCH
-int module_kallsyms_on_each_symbol(int (*fn)(void *, const char *,
- struct module *, unsigned long),
- void *data)
-{
- struct module *mod;
- unsigned int i;
- int ret = 0;
-
- mutex_lock(&module_mutex);
- list_for_each_entry(mod, &modules, list) {
- /* We hold module_mutex: no need for rcu_dereference_sched */
- struct mod_kallsyms *kallsyms = mod->kallsyms;
-
- if (mod->state == MODULE_STATE_UNFORMED)
- continue;
- for (i = 0; i < kallsyms->num_symtab; i++) {
- const Elf_Sym *sym = &kallsyms->symtab[i];
-
- if (sym->st_shndx == SHN_UNDEF)
- continue;
-
- ret = fn(data, kallsyms_symbol_name(kallsyms, i),
- mod, kallsyms_symbol_value(sym));
- if (ret != 0)
- goto out;
-
- cond_resched();
- }
- }
-out:
- mutex_unlock(&module_mutex);
- return ret;
-}
-#endif /* CONFIG_LIVEPATCH */
-#endif /* CONFIG_KALLSYMS */
-
static void cfi_init(struct module *mod)
{
#ifdef CONFIG_CFI_CLANG
@@ -4531,22 +2958,19 @@ static void cfi_init(struct module *mod)
mod->exit = *exit;
#endif
- cfi_module_add(mod, module_addr_min);
+ cfi_module_add(mod, mod_tree.addr_min);
#endif
}
static void cfi_cleanup(struct module *mod)
{
#ifdef CONFIG_CFI_CLANG
- cfi_module_remove(mod, module_addr_min);
+ cfi_module_remove(mod, mod_tree.addr_min);
#endif
}
-/* Maximum number of characters written by module_flags() */
-#define MODULE_FLAGS_BUF_SIZE (TAINT_FLAGS_COUNT + 4)
-
/* Keep in sync with MODULE_FLAGS_BUF_SIZE !!! */
-static char *module_flags(struct module *mod, char *buf)
+char *module_flags(struct module *mod, char *buf)
{
int bx = 0;
@@ -4555,7 +2979,7 @@ static char *module_flags(struct module *mod, char *buf)
mod->state == MODULE_STATE_GOING ||
mod->state == MODULE_STATE_COMING) {
buf[bx++] = '(';
- bx += module_flags_taint(mod, buf + bx);
+ bx += module_flags_taint(mod->taints, buf + bx);
/* Show a - for module-is-being-unloaded */
if (mod->state == MODULE_STATE_GOING)
buf[bx++] = '-';
@@ -4569,103 +2993,6 @@ static char *module_flags(struct module *mod, char *buf)
return buf;
}
-#ifdef CONFIG_PROC_FS
-/* Called by the /proc file system to return a list of modules. */
-static void *m_start(struct seq_file *m, loff_t *pos)
-{
- mutex_lock(&module_mutex);
- return seq_list_start(&modules, *pos);
-}
-
-static void *m_next(struct seq_file *m, void *p, loff_t *pos)
-{
- return seq_list_next(p, &modules, pos);
-}
-
-static void m_stop(struct seq_file *m, void *p)
-{
- mutex_unlock(&module_mutex);
-}
-
-static int m_show(struct seq_file *m, void *p)
-{
- struct module *mod = list_entry(p, struct module, list);
- char buf[MODULE_FLAGS_BUF_SIZE];
- void *value;
-
- /* We always ignore unformed modules. */
- if (mod->state == MODULE_STATE_UNFORMED)
- return 0;
-
- seq_printf(m, "%s %u",
- mod->name, mod->init_layout.size + mod->core_layout.size);
- print_unload_info(m, mod);
-
- /* Informative for users. */
- seq_printf(m, " %s",
- mod->state == MODULE_STATE_GOING ? "Unloading" :
- mod->state == MODULE_STATE_COMING ? "Loading" :
- "Live");
- /* Used by oprofile and other similar tools. */
- value = m->private ? NULL : mod->core_layout.base;
- seq_printf(m, " 0x%px", value);
-
- /* Taints info */
- if (mod->taints)
- seq_printf(m, " %s", module_flags(mod, buf));
-
- seq_puts(m, "\n");
- return 0;
-}
-
-/*
- * Format: modulename size refcount deps address
- *
- * Where refcount is a number or -, and deps is a comma-separated list
- * of depends or -.
- */
-static const struct seq_operations modules_op = {
- .start = m_start,
- .next = m_next,
- .stop = m_stop,
- .show = m_show
-};
-
-/*
- * This also sets the "private" pointer to non-NULL if the
- * kernel pointers should be hidden (so you can just test
- * "m->private" to see if you should keep the values private).
- *
- * We use the same logic as for /proc/kallsyms.
- */
-static int modules_open(struct inode *inode, struct file *file)
-{
- int err = seq_open(file, &modules_op);
-
- if (!err) {
- struct seq_file *m = file->private_data;
- m->private = kallsyms_show_value(file->f_cred) ? NULL : (void *)8ul;
- }
-
- return err;
-}
-
-static const struct proc_ops modules_proc_ops = {
- .proc_flags = PROC_ENTRY_PERMANENT,
- .proc_open = modules_open,
- .proc_read = seq_read,
- .proc_lseek = seq_lseek,
- .proc_release = seq_release,
-};
-
-static int __init proc_modules_init(void)
-{
- proc_create("modules", 0, NULL, &modules_proc_ops);
- return 0;
-}
-module_init(proc_modules_init);
-#endif
-
/* Given an address, look for it in the module exception tables. */
const struct exception_table_entry *search_module_extables(unsigned long addr)
{
@@ -4721,13 +3048,20 @@ bool is_module_address(unsigned long addr)
struct module *__module_address(unsigned long addr)
{
struct module *mod;
+ struct mod_tree_root *tree;
- if (addr < module_addr_min || addr > module_addr_max)
+ if (addr >= mod_tree.addr_min && addr <= mod_tree.addr_max)
+ tree = &mod_tree;
+#ifdef CONFIG_ARCH_WANTS_MODULES_DATA_IN_VMALLOC
+ else if (addr >= mod_data_tree.addr_min && addr <= mod_data_tree.addr_max)
+ tree = &mod_data_tree;
+#endif
+ else
return NULL;
module_assert_mutex_or_preempt();
- mod = mod_find(addr);
+ mod = mod_find(addr, tree);
if (mod) {
BUG_ON(!within_module(addr, mod));
if (mod->state == MODULE_STATE_UNFORMED)
@@ -4788,23 +3122,10 @@ void print_modules(void)
continue;
pr_cont(" %s%s", mod->name, module_flags(mod, buf));
}
+
+ print_unloaded_tainted_modules();
preempt_enable();
if (last_unloaded_module[0])
pr_cont(" [last unloaded: %s]", last_unloaded_module);
pr_cont("\n");
}
-
-#ifdef CONFIG_MODVERSIONS
-/*
- * Generate the signature for all relevant module structures here.
- * If these change, we don't want to try to parse the module.
- */
-void module_layout(struct module *mod,
- struct modversion_info *ver,
- struct kernel_param *kp,
- struct kernel_symbol *ks,
- struct tracepoint * const *tp)
-{
-}
-EXPORT_SYMBOL(module_layout);
-#endif
diff --git a/kernel/module/procfs.c b/kernel/module/procfs.c
new file mode 100644
index 000000000000..9a8f4f0f6329
--- /dev/null
+++ b/kernel/module/procfs.c
@@ -0,0 +1,146 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Module proc support
+ *
+ * Copyright (C) 2008 Alexey Dobriyan
+ */
+
+#include <linux/module.h>
+#include <linux/kallsyms.h>
+#include <linux/mutex.h>
+#include <linux/seq_file.h>
+#include <linux/proc_fs.h>
+#include "internal.h"
+
+#ifdef CONFIG_MODULE_UNLOAD
+static inline void print_unload_info(struct seq_file *m, struct module *mod)
+{
+ struct module_use *use;
+ int printed_something = 0;
+
+ seq_printf(m, " %i ", module_refcount(mod));
+
+ /*
+ * Always include a trailing , so userspace can differentiate
+ * between this and the old multi-field proc format.
+ */
+ list_for_each_entry(use, &mod->source_list, source_list) {
+ printed_something = 1;
+ seq_printf(m, "%s,", use->source->name);
+ }
+
+ if (mod->init && !mod->exit) {
+ printed_something = 1;
+ seq_puts(m, "[permanent],");
+ }
+
+ if (!printed_something)
+ seq_puts(m, "-");
+}
+#else /* !CONFIG_MODULE_UNLOAD */
+static inline void print_unload_info(struct seq_file *m, struct module *mod)
+{
+ /* We don't know the usage count, or what modules are using. */
+ seq_puts(m, " - -");
+}
+#endif /* CONFIG_MODULE_UNLOAD */
+
+/* Called by the /proc file system to return a list of modules. */
+static void *m_start(struct seq_file *m, loff_t *pos)
+{
+ mutex_lock(&module_mutex);
+ return seq_list_start(&modules, *pos);
+}
+
+static void *m_next(struct seq_file *m, void *p, loff_t *pos)
+{
+ return seq_list_next(p, &modules, pos);
+}
+
+static void m_stop(struct seq_file *m, void *p)
+{
+ mutex_unlock(&module_mutex);
+}
+
+static int m_show(struct seq_file *m, void *p)
+{
+ struct module *mod = list_entry(p, struct module, list);
+ char buf[MODULE_FLAGS_BUF_SIZE];
+ void *value;
+ unsigned int size;
+
+ /* We always ignore unformed modules. */
+ if (mod->state == MODULE_STATE_UNFORMED)
+ return 0;
+
+ size = mod->init_layout.size + mod->core_layout.size;
+#ifdef CONFIG_ARCH_WANTS_MODULES_DATA_IN_VMALLOC
+ size += mod->data_layout.size;
+#endif
+ seq_printf(m, "%s %u", mod->name, size);
+ print_unload_info(m, mod);
+
+ /* Informative for users. */
+ seq_printf(m, " %s",
+ mod->state == MODULE_STATE_GOING ? "Unloading" :
+ mod->state == MODULE_STATE_COMING ? "Loading" :
+ "Live");
+ /* Used by oprofile and other similar tools. */
+ value = m->private ? NULL : mod->core_layout.base;
+ seq_printf(m, " 0x%px", value);
+
+ /* Taints info */
+ if (mod->taints)
+ seq_printf(m, " %s", module_flags(mod, buf));
+
+ seq_puts(m, "\n");
+ return 0;
+}
+
+/*
+ * Format: modulename size refcount deps address
+ *
+ * Where refcount is a number or -, and deps is a comma-separated list
+ * of depends or -.
+ */
+static const struct seq_operations modules_op = {
+ .start = m_start,
+ .next = m_next,
+ .stop = m_stop,
+ .show = m_show
+};
+
+/*
+ * This also sets the "private" pointer to non-NULL if the
+ * kernel pointers should be hidden (so you can just test
+ * "m->private" to see if you should keep the values private).
+ *
+ * We use the same logic as for /proc/kallsyms.
+ */
+static int modules_open(struct inode *inode, struct file *file)
+{
+ int err = seq_open(file, &modules_op);
+
+ if (!err) {
+ struct seq_file *m = file->private_data;
+
+ m->private = kallsyms_show_value(file->f_cred) ? NULL : (void *)8ul;
+ }
+
+ return err;
+}
+
+static const struct proc_ops modules_proc_ops = {
+ .proc_flags = PROC_ENTRY_PERMANENT,
+ .proc_open = modules_open,
+ .proc_read = seq_read,
+ .proc_lseek = seq_lseek,
+ .proc_release = seq_release,
+};
+
+static int __init proc_modules_init(void)
+{
+ proc_create("modules", 0, NULL, &modules_proc_ops);
+ return 0;
+}
+module_init(proc_modules_init);
diff --git a/kernel/module/signing.c b/kernel/module/signing.c
new file mode 100644
index 000000000000..a2ff4242e623
--- /dev/null
+++ b/kernel/module/signing.c
@@ -0,0 +1,125 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/* Module signature checker
+ *
+ * Copyright (C) 2012 Red Hat, Inc. All Rights Reserved.
+ * Written by David Howells (dhowells@redhat.com)
+ */
+
+#include <linux/kernel.h>
+#include <linux/errno.h>
+#include <linux/module.h>
+#include <linux/module_signature.h>
+#include <linux/string.h>
+#include <linux/verification.h>
+#include <linux/security.h>
+#include <crypto/public_key.h>
+#include <uapi/linux/module.h>
+#include "internal.h"
+
+#undef MODULE_PARAM_PREFIX
+#define MODULE_PARAM_PREFIX "module."
+
+static bool sig_enforce = IS_ENABLED(CONFIG_MODULE_SIG_FORCE);
+module_param(sig_enforce, bool_enable_only, 0644);
+
+/*
+ * Export sig_enforce kernel cmdline parameter to allow other subsystems rely
+ * on that instead of directly to CONFIG_MODULE_SIG_FORCE config.
+ */
+bool is_module_sig_enforced(void)
+{
+ return sig_enforce;
+}
+EXPORT_SYMBOL(is_module_sig_enforced);
+
+void set_module_sig_enforced(void)
+{
+ sig_enforce = true;
+}
+
+/*
+ * Verify the signature on a module.
+ */
+int mod_verify_sig(const void *mod, struct load_info *info)
+{
+ struct module_signature ms;
+ size_t sig_len, modlen = info->len;
+ int ret;
+
+ pr_devel("==>%s(,%zu)\n", __func__, modlen);
+
+ if (modlen <= sizeof(ms))
+ return -EBADMSG;
+
+ memcpy(&ms, mod + (modlen - sizeof(ms)), sizeof(ms));
+
+ ret = mod_check_sig(&ms, modlen, "module");
+ if (ret)
+ return ret;
+
+ sig_len = be32_to_cpu(ms.sig_len);
+ modlen -= sig_len + sizeof(ms);
+ info->len = modlen;
+
+ return verify_pkcs7_signature(mod, modlen, mod + modlen, sig_len,
+ VERIFY_USE_SECONDARY_KEYRING,
+ VERIFYING_MODULE_SIGNATURE,
+ NULL, NULL);
+}
+
+int module_sig_check(struct load_info *info, int flags)
+{
+ int err = -ENODATA;
+ const unsigned long markerlen = sizeof(MODULE_SIG_STRING) - 1;
+ const char *reason;
+ const void *mod = info->hdr;
+ bool mangled_module = flags & (MODULE_INIT_IGNORE_MODVERSIONS |
+ MODULE_INIT_IGNORE_VERMAGIC);
+ /*
+ * Do not allow mangled modules as a module with version information
+ * removed is no longer the module that was signed.
+ */
+ if (!mangled_module &&
+ info->len > markerlen &&
+ memcmp(mod + info->len - markerlen, MODULE_SIG_STRING, markerlen) == 0) {
+ /* We truncate the module to discard the signature */
+ info->len -= markerlen;
+ err = mod_verify_sig(mod, info);
+ if (!err) {
+ info->sig_ok = true;
+ return 0;
+ }
+ }
+
+ /*
+ * We don't permit modules to be loaded into the trusted kernels
+ * without a valid signature on them, but if we're not enforcing,
+ * certain errors are non-fatal.
+ */
+ switch (err) {
+ case -ENODATA:
+ reason = "unsigned module";
+ break;
+ case -ENOPKG:
+ reason = "module with unsupported crypto";
+ break;
+ case -ENOKEY:
+ reason = "module with unavailable key";
+ break;
+
+ default:
+ /*
+ * All other errors are fatal, including lack of memory,
+ * unparseable signatures, and signature check failures --
+ * even if signatures aren't required.
+ */
+ return err;
+ }
+
+ if (is_module_sig_enforced()) {
+ pr_notice("Loading of %s is rejected\n", reason);
+ return -EKEYREJECTED;
+ }
+
+ return security_locked_down(LOCKDOWN_MODULE_SIGNATURE);
+}
diff --git a/kernel/module/strict_rwx.c b/kernel/module/strict_rwx.c
new file mode 100644
index 000000000000..14fbea66f12f
--- /dev/null
+++ b/kernel/module/strict_rwx.c
@@ -0,0 +1,143 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Module strict rwx
+ *
+ * Copyright (C) 2015 Rusty Russell
+ */
+
+#include <linux/module.h>
+#include <linux/mm.h>
+#include <linux/vmalloc.h>
+#include <linux/set_memory.h>
+#include "internal.h"
+
+/*
+ * LKM RO/NX protection: protect module's text/ro-data
+ * from modification and any data from execution.
+ *
+ * General layout of module is:
+ * [text] [read-only-data] [ro-after-init] [writable data]
+ * text_size -----^ ^ ^ ^
+ * ro_size ------------------------| | |
+ * ro_after_init_size -----------------------------| |
+ * size -----------------------------------------------------------|
+ *
+ * These values are always page-aligned (as is base) when
+ * CONFIG_STRICT_MODULE_RWX is set.
+ */
+
+/*
+ * Since some arches are moving towards PAGE_KERNEL module allocations instead
+ * of PAGE_KERNEL_EXEC, keep frob_text() and module_enable_x() independent of
+ * CONFIG_STRICT_MODULE_RWX because they are needed regardless of whether we
+ * are strict.
+ */
+static void frob_text(const struct module_layout *layout,
+ int (*set_memory)(unsigned long start, int num_pages))
+{
+ set_memory((unsigned long)layout->base,
+ PAGE_ALIGN(layout->text_size) >> PAGE_SHIFT);
+}
+
+static void frob_rodata(const struct module_layout *layout,
+ int (*set_memory)(unsigned long start, int num_pages))
+{
+ set_memory((unsigned long)layout->base + layout->text_size,
+ (layout->ro_size - layout->text_size) >> PAGE_SHIFT);
+}
+
+static void frob_ro_after_init(const struct module_layout *layout,
+ int (*set_memory)(unsigned long start, int num_pages))
+{
+ set_memory((unsigned long)layout->base + layout->ro_size,
+ (layout->ro_after_init_size - layout->ro_size) >> PAGE_SHIFT);
+}
+
+static void frob_writable_data(const struct module_layout *layout,
+ int (*set_memory)(unsigned long start, int num_pages))
+{
+ set_memory((unsigned long)layout->base + layout->ro_after_init_size,
+ (layout->size - layout->ro_after_init_size) >> PAGE_SHIFT);
+}
+
+static bool layout_check_misalignment(const struct module_layout *layout)
+{
+ return WARN_ON(!PAGE_ALIGNED(layout->base)) ||
+ WARN_ON(!PAGE_ALIGNED(layout->text_size)) ||
+ WARN_ON(!PAGE_ALIGNED(layout->ro_size)) ||
+ WARN_ON(!PAGE_ALIGNED(layout->ro_after_init_size)) ||
+ WARN_ON(!PAGE_ALIGNED(layout->size));
+}
+
+bool module_check_misalignment(const struct module *mod)
+{
+ if (!IS_ENABLED(CONFIG_STRICT_MODULE_RWX))
+ return false;
+
+ return layout_check_misalignment(&mod->core_layout) ||
+ layout_check_misalignment(&mod->data_layout) ||
+ layout_check_misalignment(&mod->init_layout);
+}
+
+void module_enable_x(const struct module *mod)
+{
+ if (!PAGE_ALIGNED(mod->core_layout.base) ||
+ !PAGE_ALIGNED(mod->init_layout.base))
+ return;
+
+ frob_text(&mod->core_layout, set_memory_x);
+ frob_text(&mod->init_layout, set_memory_x);
+}
+
+void module_enable_ro(const struct module *mod, bool after_init)
+{
+ if (!IS_ENABLED(CONFIG_STRICT_MODULE_RWX))
+ return;
+#ifdef CONFIG_STRICT_MODULE_RWX
+ if (!rodata_enabled)
+ return;
+#endif
+
+ set_vm_flush_reset_perms(mod->core_layout.base);
+ set_vm_flush_reset_perms(mod->init_layout.base);
+ frob_text(&mod->core_layout, set_memory_ro);
+
+ frob_rodata(&mod->data_layout, set_memory_ro);
+ frob_text(&mod->init_layout, set_memory_ro);
+ frob_rodata(&mod->init_layout, set_memory_ro);
+
+ if (after_init)
+ frob_ro_after_init(&mod->data_layout, set_memory_ro);
+}
+
+void module_enable_nx(const struct module *mod)
+{
+ if (!IS_ENABLED(CONFIG_STRICT_MODULE_RWX))
+ return;
+
+ frob_rodata(&mod->data_layout, set_memory_nx);
+ frob_ro_after_init(&mod->data_layout, set_memory_nx);
+ frob_writable_data(&mod->data_layout, set_memory_nx);
+ frob_rodata(&mod->init_layout, set_memory_nx);
+ frob_writable_data(&mod->init_layout, set_memory_nx);
+}
+
+int module_enforce_rwx_sections(Elf_Ehdr *hdr, Elf_Shdr *sechdrs,
+ char *secstrings, struct module *mod)
+{
+ const unsigned long shf_wx = SHF_WRITE | SHF_EXECINSTR;
+ int i;
+
+ if (!IS_ENABLED(CONFIG_STRICT_MODULE_RWX))
+ return 0;
+
+ for (i = 0; i < hdr->e_shnum; i++) {
+ if ((sechdrs[i].sh_flags & shf_wx) == shf_wx) {
+ pr_err("%s: section %s (index %d) has invalid WRITE|EXEC flags\n",
+ mod->name, secstrings + sechdrs[i].sh_name, i);
+ return -ENOEXEC;
+ }
+ }
+
+ return 0;
+}
diff --git a/kernel/module/sysfs.c b/kernel/module/sysfs.c
new file mode 100644
index 000000000000..ce68f821dcd1
--- /dev/null
+++ b/kernel/module/sysfs.c
@@ -0,0 +1,436 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Module sysfs support
+ *
+ * Copyright (C) 2008 Rusty Russell
+ */
+
+#include <linux/module.h>
+#include <linux/kernel.h>
+#include <linux/fs.h>
+#include <linux/sysfs.h>
+#include <linux/slab.h>
+#include <linux/kallsyms.h>
+#include <linux/mutex.h>
+#include "internal.h"
+
+/*
+ * /sys/module/foo/sections stuff
+ * J. Corbet <corbet@lwn.net>
+ */
+#ifdef CONFIG_KALLSYMS
+struct module_sect_attr {
+ struct bin_attribute battr;
+ unsigned long address;
+};
+
+struct module_sect_attrs {
+ struct attribute_group grp;
+ unsigned int nsections;
+ struct module_sect_attr attrs[];
+};
+
+#define MODULE_SECT_READ_SIZE (3 /* "0x", "\n" */ + (BITS_PER_LONG / 4))
+static ssize_t module_sect_read(struct file *file, struct kobject *kobj,
+ struct bin_attribute *battr,
+ char *buf, loff_t pos, size_t count)
+{
+ struct module_sect_attr *sattr =
+ container_of(battr, struct module_sect_attr, battr);
+ char bounce[MODULE_SECT_READ_SIZE + 1];
+ size_t wrote;
+
+ if (pos != 0)
+ return -EINVAL;
+
+ /*
+ * Since we're a binary read handler, we must account for the
+ * trailing NUL byte that sprintf will write: if "buf" is
+ * too small to hold the NUL, or the NUL is exactly the last
+ * byte, the read will look like it got truncated by one byte.
+ * Since there is no way to ask sprintf nicely to not write
+ * the NUL, we have to use a bounce buffer.
+ */
+ wrote = scnprintf(bounce, sizeof(bounce), "0x%px\n",
+ kallsyms_show_value(file->f_cred)
+ ? (void *)sattr->address : NULL);
+ count = min(count, wrote);
+ memcpy(buf, bounce, count);
+
+ return count;
+}
+
+static void free_sect_attrs(struct module_sect_attrs *sect_attrs)
+{
+ unsigned int section;
+
+ for (section = 0; section < sect_attrs->nsections; section++)
+ kfree(sect_attrs->attrs[section].battr.attr.name);
+ kfree(sect_attrs);
+}
+
+static void add_sect_attrs(struct module *mod, const struct load_info *info)
+{
+ unsigned int nloaded = 0, i, size[2];
+ struct module_sect_attrs *sect_attrs;
+ struct module_sect_attr *sattr;
+ struct bin_attribute **gattr;
+
+ /* Count loaded sections and allocate structures */
+ for (i = 0; i < info->hdr->e_shnum; i++)
+ if (!sect_empty(&info->sechdrs[i]))
+ nloaded++;
+ size[0] = ALIGN(struct_size(sect_attrs, attrs, nloaded),
+ sizeof(sect_attrs->grp.bin_attrs[0]));
+ size[1] = (nloaded + 1) * sizeof(sect_attrs->grp.bin_attrs[0]);
+ sect_attrs = kzalloc(size[0] + size[1], GFP_KERNEL);
+ if (!sect_attrs)
+ return;
+
+ /* Setup section attributes. */
+ sect_attrs->grp.name = "sections";
+ sect_attrs->grp.bin_attrs = (void *)sect_attrs + size[0];
+
+ sect_attrs->nsections = 0;
+ sattr = &sect_attrs->attrs[0];
+ gattr = &sect_attrs->grp.bin_attrs[0];
+ for (i = 0; i < info->hdr->e_shnum; i++) {
+ Elf_Shdr *sec = &info->sechdrs[i];
+
+ if (sect_empty(sec))
+ continue;
+ sysfs_bin_attr_init(&sattr->battr);
+ sattr->address = sec->sh_addr;
+ sattr->battr.attr.name =
+ kstrdup(info->secstrings + sec->sh_name, GFP_KERNEL);
+ if (!sattr->battr.attr.name)
+ goto out;
+ sect_attrs->nsections++;
+ sattr->battr.read = module_sect_read;
+ sattr->battr.size = MODULE_SECT_READ_SIZE;
+ sattr->battr.attr.mode = 0400;
+ *(gattr++) = &(sattr++)->battr;
+ }
+ *gattr = NULL;
+
+ if (sysfs_create_group(&mod->mkobj.kobj, &sect_attrs->grp))
+ goto out;
+
+ mod->sect_attrs = sect_attrs;
+ return;
+out:
+ free_sect_attrs(sect_attrs);
+}
+
+static void remove_sect_attrs(struct module *mod)
+{
+ if (mod->sect_attrs) {
+ sysfs_remove_group(&mod->mkobj.kobj,
+ &mod->sect_attrs->grp);
+ /*
+ * We are positive that no one is using any sect attrs
+ * at this point. Deallocate immediately.
+ */
+ free_sect_attrs(mod->sect_attrs);
+ mod->sect_attrs = NULL;
+ }
+}
+
+/*
+ * /sys/module/foo/notes/.section.name gives contents of SHT_NOTE sections.
+ */
+
+struct module_notes_attrs {
+ struct kobject *dir;
+ unsigned int notes;
+ struct bin_attribute attrs[];
+};
+
+static ssize_t module_notes_read(struct file *filp, struct kobject *kobj,
+ struct bin_attribute *bin_attr,
+ char *buf, loff_t pos, size_t count)
+{
+ /*
+ * The caller checked the pos and count against our size.
+ */
+ memcpy(buf, bin_attr->private + pos, count);
+ return count;
+}
+
+static void free_notes_attrs(struct module_notes_attrs *notes_attrs,
+ unsigned int i)
+{
+ if (notes_attrs->dir) {
+ while (i-- > 0)
+ sysfs_remove_bin_file(notes_attrs->dir,
+ &notes_attrs->attrs[i]);
+ kobject_put(notes_attrs->dir);
+ }
+ kfree(notes_attrs);
+}
+
+static void add_notes_attrs(struct module *mod, const struct load_info *info)
+{
+ unsigned int notes, loaded, i;
+ struct module_notes_attrs *notes_attrs;
+ struct bin_attribute *nattr;
+
+ /* failed to create section attributes, so can't create notes */
+ if (!mod->sect_attrs)
+ return;
+
+ /* Count notes sections and allocate structures. */
+ notes = 0;
+ for (i = 0; i < info->hdr->e_shnum; i++)
+ if (!sect_empty(&info->sechdrs[i]) &&
+ info->sechdrs[i].sh_type == SHT_NOTE)
+ ++notes;
+
+ if (notes == 0)
+ return;
+
+ notes_attrs = kzalloc(struct_size(notes_attrs, attrs, notes),
+ GFP_KERNEL);
+ if (!notes_attrs)
+ return;
+
+ notes_attrs->notes = notes;
+ nattr = &notes_attrs->attrs[0];
+ for (loaded = i = 0; i < info->hdr->e_shnum; ++i) {
+ if (sect_empty(&info->sechdrs[i]))
+ continue;
+ if (info->sechdrs[i].sh_type == SHT_NOTE) {
+ sysfs_bin_attr_init(nattr);
+ nattr->attr.name = mod->sect_attrs->attrs[loaded].battr.attr.name;
+ nattr->attr.mode = 0444;
+ nattr->size = info->sechdrs[i].sh_size;
+ nattr->private = (void *)info->sechdrs[i].sh_addr;
+ nattr->read = module_notes_read;
+ ++nattr;
+ }
+ ++loaded;
+ }
+
+ notes_attrs->dir = kobject_create_and_add("notes", &mod->mkobj.kobj);
+ if (!notes_attrs->dir)
+ goto out;
+
+ for (i = 0; i < notes; ++i)
+ if (sysfs_create_bin_file(notes_attrs->dir,
+ &notes_attrs->attrs[i]))
+ goto out;
+
+ mod->notes_attrs = notes_attrs;
+ return;
+
+out:
+ free_notes_attrs(notes_attrs, i);
+}
+
+static void remove_notes_attrs(struct module *mod)
+{
+ if (mod->notes_attrs)
+ free_notes_attrs(mod->notes_attrs, mod->notes_attrs->notes);
+}
+
+#else /* !CONFIG_KALLSYMS */
+static inline void add_sect_attrs(struct module *mod, const struct load_info *info) { }
+static inline void remove_sect_attrs(struct module *mod) { }
+static inline void add_notes_attrs(struct module *mod, const struct load_info *info) { }
+static inline void remove_notes_attrs(struct module *mod) { }
+#endif /* CONFIG_KALLSYMS */
+
+static void del_usage_links(struct module *mod)
+{
+#ifdef CONFIG_MODULE_UNLOAD
+ struct module_use *use;
+
+ mutex_lock(&module_mutex);
+ list_for_each_entry(use, &mod->target_list, target_list)
+ sysfs_remove_link(use->target->holders_dir, mod->name);
+ mutex_unlock(&module_mutex);
+#endif
+}
+
+static int add_usage_links(struct module *mod)
+{
+ int ret = 0;
+#ifdef CONFIG_MODULE_UNLOAD
+ struct module_use *use;
+
+ mutex_lock(&module_mutex);
+ list_for_each_entry(use, &mod->target_list, target_list) {
+ ret = sysfs_create_link(use->target->holders_dir,
+ &mod->mkobj.kobj, mod->name);
+ if (ret)
+ break;
+ }
+ mutex_unlock(&module_mutex);
+ if (ret)
+ del_usage_links(mod);
+#endif
+ return ret;
+}
+
+static void module_remove_modinfo_attrs(struct module *mod, int end)
+{
+ struct module_attribute *attr;
+ int i;
+
+ for (i = 0; (attr = &mod->modinfo_attrs[i]); i++) {
+ if (end >= 0 && i > end)
+ break;
+ /* pick a field to test for end of list */
+ if (!attr->attr.name)
+ break;
+ sysfs_remove_file(&mod->mkobj.kobj, &attr->attr);
+ if (attr->free)
+ attr->free(mod);
+ }
+ kfree(mod->modinfo_attrs);
+}
+
+static int module_add_modinfo_attrs(struct module *mod)
+{
+ struct module_attribute *attr;
+ struct module_attribute *temp_attr;
+ int error = 0;
+ int i;
+
+ mod->modinfo_attrs = kzalloc((sizeof(struct module_attribute) *
+ (modinfo_attrs_count + 1)),
+ GFP_KERNEL);
+ if (!mod->modinfo_attrs)
+ return -ENOMEM;
+
+ temp_attr = mod->modinfo_attrs;
+ for (i = 0; (attr = modinfo_attrs[i]); i++) {
+ if (!attr->test || attr->test(mod)) {
+ memcpy(temp_attr, attr, sizeof(*temp_attr));
+ sysfs_attr_init(&temp_attr->attr);
+ error = sysfs_create_file(&mod->mkobj.kobj,
+ &temp_attr->attr);
+ if (error)
+ goto error_out;
+ ++temp_attr;
+ }
+ }
+
+ return 0;
+
+error_out:
+ if (i > 0)
+ module_remove_modinfo_attrs(mod, --i);
+ else
+ kfree(mod->modinfo_attrs);
+ return error;
+}
+
+static void mod_kobject_put(struct module *mod)
+{
+ DECLARE_COMPLETION_ONSTACK(c);
+
+ mod->mkobj.kobj_completion = &c;
+ kobject_put(&mod->mkobj.kobj);
+ wait_for_completion(&c);
+}
+
+static int mod_sysfs_init(struct module *mod)
+{
+ int err;
+ struct kobject *kobj;
+
+ if (!module_sysfs_initialized) {
+ pr_err("%s: module sysfs not initialized\n", mod->name);
+ err = -EINVAL;
+ goto out;
+ }
+
+ kobj = kset_find_obj(module_kset, mod->name);
+ if (kobj) {
+ pr_err("%s: module is already loaded\n", mod->name);
+ kobject_put(kobj);
+ err = -EINVAL;
+ goto out;
+ }
+
+ mod->mkobj.mod = mod;
+
+ memset(&mod->mkobj.kobj, 0, sizeof(mod->mkobj.kobj));
+ mod->mkobj.kobj.kset = module_kset;
+ err = kobject_init_and_add(&mod->mkobj.kobj, &module_ktype, NULL,
+ "%s", mod->name);
+ if (err)
+ mod_kobject_put(mod);
+
+out:
+ return err;
+}
+
+int mod_sysfs_setup(struct module *mod,
+ const struct load_info *info,
+ struct kernel_param *kparam,
+ unsigned int num_params)
+{
+ int err;
+
+ err = mod_sysfs_init(mod);
+ if (err)
+ goto out;
+
+ mod->holders_dir = kobject_create_and_add("holders", &mod->mkobj.kobj);
+ if (!mod->holders_dir) {
+ err = -ENOMEM;
+ goto out_unreg;
+ }
+
+ err = module_param_sysfs_setup(mod, kparam, num_params);
+ if (err)
+ goto out_unreg_holders;
+
+ err = module_add_modinfo_attrs(mod);
+ if (err)
+ goto out_unreg_param;
+
+ err = add_usage_links(mod);
+ if (err)
+ goto out_unreg_modinfo_attrs;
+
+ add_sect_attrs(mod, info);
+ add_notes_attrs(mod, info);
+
+ return 0;
+
+out_unreg_modinfo_attrs:
+ module_remove_modinfo_attrs(mod, -1);
+out_unreg_param:
+ module_param_sysfs_remove(mod);
+out_unreg_holders:
+ kobject_put(mod->holders_dir);
+out_unreg:
+ mod_kobject_put(mod);
+out:
+ return err;
+}
+
+static void mod_sysfs_fini(struct module *mod)
+{
+ remove_notes_attrs(mod);
+ remove_sect_attrs(mod);
+ mod_kobject_put(mod);
+}
+
+void mod_sysfs_teardown(struct module *mod)
+{
+ del_usage_links(mod);
+ module_remove_modinfo_attrs(mod, -1);
+ module_param_sysfs_remove(mod);
+ kobject_put(mod->mkobj.drivers_dir);
+ kobject_put(mod->holders_dir);
+ mod_sysfs_fini(mod);
+}
+
+void init_param_lock(struct module *mod)
+{
+ mutex_init(&mod->param_lock);
+}
diff --git a/kernel/module/tracking.c b/kernel/module/tracking.c
new file mode 100644
index 000000000000..7f8133044d09
--- /dev/null
+++ b/kernel/module/tracking.c
@@ -0,0 +1,61 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Module taint unload tracking support
+ *
+ * Copyright (C) 2022 Aaron Tomlin
+ */
+
+#include <linux/module.h>
+#include <linux/string.h>
+#include <linux/printk.h>
+#include <linux/slab.h>
+#include <linux/list.h>
+#include <linux/rculist.h>
+#include "internal.h"
+
+static LIST_HEAD(unloaded_tainted_modules);
+
+int try_add_tainted_module(struct module *mod)
+{
+ struct mod_unload_taint *mod_taint;
+
+ module_assert_mutex_or_preempt();
+
+ list_for_each_entry_rcu(mod_taint, &unloaded_tainted_modules, list,
+ lockdep_is_held(&module_mutex)) {
+ if (!strcmp(mod_taint->name, mod->name) &&
+ mod_taint->taints & mod->taints) {
+ mod_taint->count++;
+ goto out;
+ }
+ }
+
+ mod_taint = kmalloc(sizeof(*mod_taint), GFP_KERNEL);
+ if (unlikely(!mod_taint))
+ return -ENOMEM;
+ strscpy(mod_taint->name, mod->name, MODULE_NAME_LEN);
+ mod_taint->taints = mod->taints;
+ list_add_rcu(&mod_taint->list, &unloaded_tainted_modules);
+ mod_taint->count = 1;
+out:
+ return 0;
+}
+
+void print_unloaded_tainted_modules(void)
+{
+ struct mod_unload_taint *mod_taint;
+ char buf[MODULE_FLAGS_BUF_SIZE];
+
+ if (!list_empty(&unloaded_tainted_modules)) {
+ printk(KERN_DEFAULT "Unloaded tainted modules:");
+ list_for_each_entry_rcu(mod_taint, &unloaded_tainted_modules,
+ list) {
+ size_t l;
+
+ l = module_flags_taint(mod_taint->taints, buf);
+ buf[l++] = '\0';
+ pr_cont(" %s(%s):%llu", mod_taint->name, buf,
+ mod_taint->count);
+ }
+ }
+}
diff --git a/kernel/module/tree_lookup.c b/kernel/module/tree_lookup.c
new file mode 100644
index 000000000000..8ec5cfd60496
--- /dev/null
+++ b/kernel/module/tree_lookup.c
@@ -0,0 +1,117 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Modules tree lookup
+ *
+ * Copyright (C) 2015 Peter Zijlstra
+ * Copyright (C) 2015 Rusty Russell
+ */
+
+#include <linux/module.h>
+#include <linux/rbtree_latch.h>
+#include "internal.h"
+
+/*
+ * Use a latched RB-tree for __module_address(); this allows us to use
+ * RCU-sched lookups of the address from any context.
+ *
+ * This is conditional on PERF_EVENTS || TRACING because those can really hit
+ * __module_address() hard by doing a lot of stack unwinding; potentially from
+ * NMI context.
+ */
+
+static __always_inline unsigned long __mod_tree_val(struct latch_tree_node *n)
+{
+ struct module_layout *layout = container_of(n, struct module_layout, mtn.node);
+
+ return (unsigned long)layout->base;
+}
+
+static __always_inline unsigned long __mod_tree_size(struct latch_tree_node *n)
+{
+ struct module_layout *layout = container_of(n, struct module_layout, mtn.node);
+
+ return (unsigned long)layout->size;
+}
+
+static __always_inline bool
+mod_tree_less(struct latch_tree_node *a, struct latch_tree_node *b)
+{
+ return __mod_tree_val(a) < __mod_tree_val(b);
+}
+
+static __always_inline int
+mod_tree_comp(void *key, struct latch_tree_node *n)
+{
+ unsigned long val = (unsigned long)key;
+ unsigned long start, end;
+
+ start = __mod_tree_val(n);
+ if (val < start)
+ return -1;
+
+ end = start + __mod_tree_size(n);
+ if (val >= end)
+ return 1;
+
+ return 0;
+}
+
+static const struct latch_tree_ops mod_tree_ops = {
+ .less = mod_tree_less,
+ .comp = mod_tree_comp,
+};
+
+static noinline void __mod_tree_insert(struct mod_tree_node *node, struct mod_tree_root *tree)
+{
+ latch_tree_insert(&node->node, &tree->root, &mod_tree_ops);
+}
+
+static void __mod_tree_remove(struct mod_tree_node *node, struct mod_tree_root *tree)
+{
+ latch_tree_erase(&node->node, &tree->root, &mod_tree_ops);
+}
+
+/*
+ * These modifications: insert, remove_init and remove; are serialized by the
+ * module_mutex.
+ */
+void mod_tree_insert(struct module *mod)
+{
+ mod->core_layout.mtn.mod = mod;
+ mod->init_layout.mtn.mod = mod;
+
+ __mod_tree_insert(&mod->core_layout.mtn, &mod_tree);
+ if (mod->init_layout.size)
+ __mod_tree_insert(&mod->init_layout.mtn, &mod_tree);
+
+#ifdef CONFIG_ARCH_WANTS_MODULES_DATA_IN_VMALLOC
+ mod->data_layout.mtn.mod = mod;
+ __mod_tree_insert(&mod->data_layout.mtn, &mod_data_tree);
+#endif
+}
+
+void mod_tree_remove_init(struct module *mod)
+{
+ if (mod->init_layout.size)
+ __mod_tree_remove(&mod->init_layout.mtn, &mod_tree);
+}
+
+void mod_tree_remove(struct module *mod)
+{
+ __mod_tree_remove(&mod->core_layout.mtn, &mod_tree);
+ mod_tree_remove_init(mod);
+#ifdef CONFIG_ARCH_WANTS_MODULES_DATA_IN_VMALLOC
+ __mod_tree_remove(&mod->data_layout.mtn, &mod_data_tree);
+#endif
+}
+
+struct module *mod_find(unsigned long addr, struct mod_tree_root *tree)
+{
+ struct latch_tree_node *ltn;
+
+ ltn = latch_tree_find((void *)addr, &tree->root, &mod_tree_ops);
+ if (!ltn)
+ return NULL;
+
+ return container_of(ltn, struct mod_tree_node, node)->mod;
+}
diff --git a/kernel/module/version.c b/kernel/module/version.c
new file mode 100644
index 000000000000..53f43ac5a73e
--- /dev/null
+++ b/kernel/module/version.c
@@ -0,0 +1,101 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Module version support
+ *
+ * Copyright (C) 2008 Rusty Russell
+ */
+
+#include <linux/module.h>
+#include <linux/string.h>
+#include <linux/printk.h>
+#include "internal.h"
+
+int check_version(const struct load_info *info,
+ const char *symname,
+ struct module *mod,
+ const s32 *crc)
+{
+ Elf_Shdr *sechdrs = info->sechdrs;
+ unsigned int versindex = info->index.vers;
+ unsigned int i, num_versions;
+ struct modversion_info *versions;
+
+ /* Exporting module didn't supply crcs? OK, we're already tainted. */
+ if (!crc)
+ return 1;
+
+ /* No versions at all? modprobe --force does this. */
+ if (versindex == 0)
+ return try_to_force_load(mod, symname) == 0;
+
+ versions = (void *)sechdrs[versindex].sh_addr;
+ num_versions = sechdrs[versindex].sh_size
+ / sizeof(struct modversion_info);
+
+ for (i = 0; i < num_versions; i++) {
+ u32 crcval;
+
+ if (strcmp(versions[i].name, symname) != 0)
+ continue;
+
+ crcval = *crc;
+ if (versions[i].crc == crcval)
+ return 1;
+ pr_debug("Found checksum %X vs module %lX\n",
+ crcval, versions[i].crc);
+ goto bad_version;
+ }
+
+ /* Broken toolchain. Warn once, then let it go.. */
+ pr_warn_once("%s: no symbol version for %s\n", info->name, symname);
+ return 1;
+
+bad_version:
+ pr_warn("%s: disagrees about version of symbol %s\n", info->name, symname);
+ return 0;
+}
+
+int check_modstruct_version(const struct load_info *info,
+ struct module *mod)
+{
+ struct find_symbol_arg fsa = {
+ .name = "module_layout",
+ .gplok = true,
+ };
+
+ /*
+ * Since this should be found in kernel (which can't be removed), no
+ * locking is necessary -- use preempt_disable() to placate lockdep.
+ */
+ preempt_disable();
+ if (!find_symbol(&fsa)) {
+ preempt_enable();
+ BUG();
+ }
+ preempt_enable();
+ return check_version(info, "module_layout", mod, fsa.crc);
+}
+
+/* First part is kernel version, which we ignore if module has crcs. */
+int same_magic(const char *amagic, const char *bmagic,
+ bool has_crcs)
+{
+ if (has_crcs) {
+ amagic += strcspn(amagic, " ");
+ bmagic += strcspn(bmagic, " ");
+ }
+ return strcmp(amagic, bmagic) == 0;
+}
+
+/*
+ * Generate the signature for all relevant module structures here.
+ * If these change, we don't want to try to parse the module.
+ */
+void module_layout(struct module *mod,
+ struct modversion_info *ver,
+ struct kernel_param *kp,
+ struct kernel_symbol *ks,
+ struct tracepoint * const *tp)
+{
+}
+EXPORT_SYMBOL(module_layout);
diff --git a/kernel/module_signing.c b/kernel/module_signing.c
deleted file mode 100644
index 8723ae70ea1f..000000000000
--- a/kernel/module_signing.c
+++ /dev/null
@@ -1,45 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-or-later
-/* Module signature checker
- *
- * Copyright (C) 2012 Red Hat, Inc. All Rights Reserved.
- * Written by David Howells (dhowells@redhat.com)
- */
-
-#include <linux/kernel.h>
-#include <linux/errno.h>
-#include <linux/module.h>
-#include <linux/module_signature.h>
-#include <linux/string.h>
-#include <linux/verification.h>
-#include <crypto/public_key.h>
-#include "module-internal.h"
-
-/*
- * Verify the signature on a module.
- */
-int mod_verify_sig(const void *mod, struct load_info *info)
-{
- struct module_signature ms;
- size_t sig_len, modlen = info->len;
- int ret;
-
- pr_devel("==>%s(,%zu)\n", __func__, modlen);
-
- if (modlen <= sizeof(ms))
- return -EBADMSG;
-
- memcpy(&ms, mod + (modlen - sizeof(ms)), sizeof(ms));
-
- ret = mod_check_sig(&ms, modlen, "module");
- if (ret)
- return ret;
-
- sig_len = be32_to_cpu(ms.sig_len);
- modlen -= sig_len + sizeof(ms);
- info->len = modlen;
-
- return verify_pkcs7_signature(mod, modlen, mod + modlen, sig_len,
- VERIFY_USE_SECONDARY_KEYRING,
- VERIFYING_MODULE_SIGNATURE,
- NULL, NULL);
-}
diff --git a/kernel/notifier.c b/kernel/notifier.c
index ba005ebf4730..0d5bd62c480e 100644
--- a/kernel/notifier.c
+++ b/kernel/notifier.c
@@ -20,7 +20,8 @@ BLOCKING_NOTIFIER_HEAD(reboot_notifier_list);
*/
static int notifier_chain_register(struct notifier_block **nl,
- struct notifier_block *n)
+ struct notifier_block *n,
+ bool unique_priority)
{
while ((*nl) != NULL) {
if (unlikely((*nl) == n)) {
@@ -30,6 +31,8 @@ static int notifier_chain_register(struct notifier_block **nl,
}
if (n->priority > (*nl)->priority)
break;
+ if (n->priority == (*nl)->priority && unique_priority)
+ return -EBUSY;
nl = &((*nl)->next);
}
n->next = *nl;
@@ -144,13 +147,36 @@ int atomic_notifier_chain_register(struct atomic_notifier_head *nh,
int ret;
spin_lock_irqsave(&nh->lock, flags);
- ret = notifier_chain_register(&nh->head, n);
+ ret = notifier_chain_register(&nh->head, n, false);
spin_unlock_irqrestore(&nh->lock, flags);
return ret;
}
EXPORT_SYMBOL_GPL(atomic_notifier_chain_register);
/**
+ * atomic_notifier_chain_register_unique_prio - Add notifier to an atomic notifier chain
+ * @nh: Pointer to head of the atomic notifier chain
+ * @n: New entry in notifier chain
+ *
+ * Adds a notifier to an atomic notifier chain if there is no other
+ * notifier registered using the same priority.
+ *
+ * Returns 0 on success, %-EEXIST or %-EBUSY on error.
+ */
+int atomic_notifier_chain_register_unique_prio(struct atomic_notifier_head *nh,
+ struct notifier_block *n)
+{
+ unsigned long flags;
+ int ret;
+
+ spin_lock_irqsave(&nh->lock, flags);
+ ret = notifier_chain_register(&nh->head, n, true);
+ spin_unlock_irqrestore(&nh->lock, flags);
+ return ret;
+}
+EXPORT_SYMBOL_GPL(atomic_notifier_chain_register_unique_prio);
+
+/**
* atomic_notifier_chain_unregister - Remove notifier from an atomic notifier chain
* @nh: Pointer to head of the atomic notifier chain
* @n: Entry to remove from notifier chain
@@ -204,23 +230,27 @@ int atomic_notifier_call_chain(struct atomic_notifier_head *nh,
EXPORT_SYMBOL_GPL(atomic_notifier_call_chain);
NOKPROBE_SYMBOL(atomic_notifier_call_chain);
+/**
+ * atomic_notifier_call_chain_is_empty - Check whether notifier chain is empty
+ * @nh: Pointer to head of the atomic notifier chain
+ *
+ * Checks whether notifier chain is empty.
+ *
+ * Returns true is notifier chain is empty, false otherwise.
+ */
+bool atomic_notifier_call_chain_is_empty(struct atomic_notifier_head *nh)
+{
+ return !rcu_access_pointer(nh->head);
+}
+
/*
* Blocking notifier chain routines. All access to the chain is
* synchronized by an rwsem.
*/
-/**
- * blocking_notifier_chain_register - Add notifier to a blocking notifier chain
- * @nh: Pointer to head of the blocking notifier chain
- * @n: New entry in notifier chain
- *
- * Adds a notifier to a blocking notifier chain.
- * Must be called in process context.
- *
- * Returns 0 on success, %-EEXIST on error.
- */
-int blocking_notifier_chain_register(struct blocking_notifier_head *nh,
- struct notifier_block *n)
+static int __blocking_notifier_chain_register(struct blocking_notifier_head *nh,
+ struct notifier_block *n,
+ bool unique_priority)
{
int ret;
@@ -230,16 +260,49 @@ int blocking_notifier_chain_register(struct blocking_notifier_head *nh,
* such times we must not call down_write().
*/
if (unlikely(system_state == SYSTEM_BOOTING))
- return notifier_chain_register(&nh->head, n);
+ return notifier_chain_register(&nh->head, n, unique_priority);
down_write(&nh->rwsem);
- ret = notifier_chain_register(&nh->head, n);
+ ret = notifier_chain_register(&nh->head, n, unique_priority);
up_write(&nh->rwsem);
return ret;
}
+
+/**
+ * blocking_notifier_chain_register - Add notifier to a blocking notifier chain
+ * @nh: Pointer to head of the blocking notifier chain
+ * @n: New entry in notifier chain
+ *
+ * Adds a notifier to a blocking notifier chain.
+ * Must be called in process context.
+ *
+ * Returns 0 on success, %-EEXIST on error.
+ */
+int blocking_notifier_chain_register(struct blocking_notifier_head *nh,
+ struct notifier_block *n)
+{
+ return __blocking_notifier_chain_register(nh, n, false);
+}
EXPORT_SYMBOL_GPL(blocking_notifier_chain_register);
/**
+ * blocking_notifier_chain_register_unique_prio - Add notifier to a blocking notifier chain
+ * @nh: Pointer to head of the blocking notifier chain
+ * @n: New entry in notifier chain
+ *
+ * Adds a notifier to an blocking notifier chain if there is no other
+ * notifier registered using the same priority.
+ *
+ * Returns 0 on success, %-EEXIST or %-EBUSY on error.
+ */
+int blocking_notifier_chain_register_unique_prio(struct blocking_notifier_head *nh,
+ struct notifier_block *n)
+{
+ return __blocking_notifier_chain_register(nh, n, true);
+}
+EXPORT_SYMBOL_GPL(blocking_notifier_chain_register_unique_prio);
+
+/**
* blocking_notifier_chain_unregister - Remove notifier from a blocking notifier chain
* @nh: Pointer to head of the blocking notifier chain
* @n: Entry to remove from notifier chain
@@ -341,7 +404,7 @@ EXPORT_SYMBOL_GPL(blocking_notifier_call_chain);
int raw_notifier_chain_register(struct raw_notifier_head *nh,
struct notifier_block *n)
{
- return notifier_chain_register(&nh->head, n);
+ return notifier_chain_register(&nh->head, n, false);
}
EXPORT_SYMBOL_GPL(raw_notifier_chain_register);
@@ -420,10 +483,10 @@ int srcu_notifier_chain_register(struct srcu_notifier_head *nh,
* such times we must not call mutex_lock().
*/
if (unlikely(system_state == SYSTEM_BOOTING))
- return notifier_chain_register(&nh->head, n);
+ return notifier_chain_register(&nh->head, n, false);
mutex_lock(&nh->mutex);
- ret = notifier_chain_register(&nh->head, n);
+ ret = notifier_chain_register(&nh->head, n, false);
mutex_unlock(&nh->mutex);
return ret;
}
diff --git a/kernel/panic.c b/kernel/panic.c
index eb4dfb932c85..a3c758dba15a 100644
--- a/kernel/panic.c
+++ b/kernel/panic.c
@@ -43,12 +43,14 @@
* Should we dump all CPUs backtraces in an oops event?
* Defaults to 0, can be changed via sysctl.
*/
-unsigned int __read_mostly sysctl_oops_all_cpu_backtrace;
+static unsigned int __read_mostly sysctl_oops_all_cpu_backtrace;
+#else
+#define sysctl_oops_all_cpu_backtrace 0
#endif /* CONFIG_SMP */
int panic_on_oops = CONFIG_PANIC_ON_OOPS_VALUE;
static unsigned long tainted_mask =
- IS_ENABLED(CONFIG_GCC_PLUGIN_RANDSTRUCT) ? (1 << TAINT_RANDSTRUCT) : 0;
+ IS_ENABLED(CONFIG_RANDSTRUCT) ? (1 << TAINT_RANDSTRUCT) : 0;
static int pause_on_oops;
static int pause_on_oops_flag;
static DEFINE_SPINLOCK(pause_on_oops_lock);
@@ -73,6 +75,28 @@ ATOMIC_NOTIFIER_HEAD(panic_notifier_list);
EXPORT_SYMBOL(panic_notifier_list);
+#if defined(CONFIG_SMP) && defined(CONFIG_SYSCTL)
+static struct ctl_table kern_panic_table[] = {
+ {
+ .procname = "oops_all_cpu_backtrace",
+ .data = &sysctl_oops_all_cpu_backtrace,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = SYSCTL_ZERO,
+ .extra2 = SYSCTL_ONE,
+ },
+ { }
+};
+
+static __init int kernel_panic_sysctls_init(void)
+{
+ register_sysctl_init("kernel", kern_panic_table);
+ return 0;
+}
+late_initcall(kernel_panic_sysctls_init);
+#endif
+
static long no_blink(int state)
{
return 0;
@@ -579,6 +603,8 @@ void __warn(const char *file, int line, void *caller, unsigned taint,
{
disable_trace_on_warning();
+ printk_prefer_direct_enter();
+
if (file)
pr_warn("WARNING: CPU: %d PID: %d at %s:%d %pS\n",
raw_smp_processor_id(), current->pid, file, line,
@@ -608,6 +634,8 @@ void __warn(const char *file, int line, void *caller, unsigned taint,
/* Just a warning, don't kill lockdep. */
add_taint(taint, LOCKDEP_STILL_OK);
+
+ printk_prefer_direct_exit();
}
#ifndef __WARN_FLAGS
diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c
index a46a3723bc66..f4f8cb0435b4 100644
--- a/kernel/pid_namespace.c
+++ b/kernel/pid_namespace.c
@@ -52,7 +52,7 @@ static struct kmem_cache *create_pid_cachep(unsigned int level)
/* Name collision forces to do allocation under mutex. */
if (!*pkc)
*pkc = kmem_cache_create(name, len, 0,
- SLAB_HWCACHE_ALIGN | SLAB_ACCOUNT, 0);
+ SLAB_HWCACHE_ALIGN | SLAB_ACCOUNT, NULL);
mutex_unlock(&pid_caches_mutex);
/* current can fail, but someone else can succeed. */
return READ_ONCE(*pkc);
diff --git a/kernel/power/Makefile b/kernel/power/Makefile
index 5899260a8bef..874ad834dc8d 100644
--- a/kernel/power/Makefile
+++ b/kernel/power/Makefile
@@ -1,6 +1,10 @@
# SPDX-License-Identifier: GPL-2.0
-ccflags-$(CONFIG_PM_DEBUG) := -DDEBUG
+ifeq ($(CONFIG_DYNAMIC_DEBUG), y)
+CFLAGS_swap.o := -DDEBUG
+CFLAGS_snapshot.o := -DDEBUG
+CFLAGS_energy_model.o := -DDEBUG
+endif
KASAN_SANITIZE_snapshot.o := n
diff --git a/kernel/power/energy_model.c b/kernel/power/energy_model.c
index 0153b0ca7b23..6c373f2960e7 100644
--- a/kernel/power/energy_model.c
+++ b/kernel/power/energy_model.c
@@ -54,28 +54,15 @@ static int em_debug_cpus_show(struct seq_file *s, void *unused)
}
DEFINE_SHOW_ATTRIBUTE(em_debug_cpus);
-static int em_debug_units_show(struct seq_file *s, void *unused)
+static int em_debug_flags_show(struct seq_file *s, void *unused)
{
struct em_perf_domain *pd = s->private;
- char *units = (pd->flags & EM_PERF_DOMAIN_MILLIWATTS) ?
- "milliWatts" : "bogoWatts";
- seq_printf(s, "%s\n", units);
+ seq_printf(s, "%#lx\n", pd->flags);
return 0;
}
-DEFINE_SHOW_ATTRIBUTE(em_debug_units);
-
-static int em_debug_skip_inefficiencies_show(struct seq_file *s, void *unused)
-{
- struct em_perf_domain *pd = s->private;
- int enabled = (pd->flags & EM_PERF_DOMAIN_SKIP_INEFFICIENCIES) ? 1 : 0;
-
- seq_printf(s, "%d\n", enabled);
-
- return 0;
-}
-DEFINE_SHOW_ATTRIBUTE(em_debug_skip_inefficiencies);
+DEFINE_SHOW_ATTRIBUTE(em_debug_flags);
static void em_debug_create_pd(struct device *dev)
{
@@ -89,9 +76,8 @@ static void em_debug_create_pd(struct device *dev)
debugfs_create_file("cpus", 0444, d, dev->em_pd->cpus,
&em_debug_cpus_fops);
- debugfs_create_file("units", 0444, d, dev->em_pd, &em_debug_units_fops);
- debugfs_create_file("skip-inefficiencies", 0444, d, dev->em_pd,
- &em_debug_skip_inefficiencies_fops);
+ debugfs_create_file("flags", 0444, d, dev->em_pd,
+ &em_debug_flags_fops);
/* Create a sub-directory for each performance state */
for (i = 0; i < dev->em_pd->nr_perf_states; i++)
@@ -121,7 +107,8 @@ static void em_debug_remove_pd(struct device *dev) {}
#endif
static int em_create_perf_table(struct device *dev, struct em_perf_domain *pd,
- int nr_states, struct em_data_callback *cb)
+ int nr_states, struct em_data_callback *cb,
+ unsigned long flags)
{
unsigned long power, freq, prev_freq = 0, prev_cost = ULONG_MAX;
struct em_perf_state *table;
@@ -139,7 +126,7 @@ static int em_create_perf_table(struct device *dev, struct em_perf_domain *pd,
* lowest performance state of 'dev' above 'freq' and updates
* 'power' and 'freq' accordingly.
*/
- ret = cb->active_power(&power, &freq, dev);
+ ret = cb->active_power(dev, &power, &freq);
if (ret) {
dev_err(dev, "EM: invalid perf. state: %d\n",
ret);
@@ -173,10 +160,22 @@ static int em_create_perf_table(struct device *dev, struct em_perf_domain *pd,
/* Compute the cost of each performance state. */
fmax = (u64) table[nr_states - 1].frequency;
for (i = nr_states - 1; i >= 0; i--) {
- unsigned long power_res = em_scale_power(table[i].power);
+ unsigned long power_res, cost;
+
+ if (flags & EM_PERF_DOMAIN_ARTIFICIAL) {
+ ret = cb->get_cost(dev, table[i].frequency, &cost);
+ if (ret || !cost || cost > EM_MAX_POWER) {
+ dev_err(dev, "EM: invalid cost %lu %d\n",
+ cost, ret);
+ goto free_ps_table;
+ }
+ } else {
+ power_res = em_scale_power(table[i].power);
+ cost = div64_u64(fmax * power_res, table[i].frequency);
+ }
+
+ table[i].cost = cost;
- table[i].cost = div64_u64(fmax * power_res,
- table[i].frequency);
if (table[i].cost >= prev_cost) {
table[i].flags = EM_PERF_STATE_INEFFICIENT;
dev_dbg(dev, "EM: OPP:%lu is inefficient\n",
@@ -197,7 +196,8 @@ free_ps_table:
}
static int em_create_pd(struct device *dev, int nr_states,
- struct em_data_callback *cb, cpumask_t *cpus)
+ struct em_data_callback *cb, cpumask_t *cpus,
+ unsigned long flags)
{
struct em_perf_domain *pd;
struct device *cpu_dev;
@@ -215,7 +215,7 @@ static int em_create_pd(struct device *dev, int nr_states,
return -ENOMEM;
}
- ret = em_create_perf_table(dev, pd, nr_states, cb);
+ ret = em_create_perf_table(dev, pd, nr_states, cb, flags);
if (ret) {
kfree(pd);
return ret;
@@ -259,6 +259,8 @@ static void em_cpufreq_update_efficiencies(struct device *dev)
found++;
}
+ cpufreq_cpu_put(policy);
+
if (!found)
return;
@@ -332,6 +334,7 @@ int em_dev_register_perf_domain(struct device *dev, unsigned int nr_states,
bool milliwatts)
{
unsigned long cap, prev_cap = 0;
+ unsigned long flags = 0;
int cpu, ret;
if (!dev || !nr_states || !cb)
@@ -378,12 +381,16 @@ int em_dev_register_perf_domain(struct device *dev, unsigned int nr_states,
}
}
- ret = em_create_pd(dev, nr_states, cb, cpus);
+ if (milliwatts)
+ flags |= EM_PERF_DOMAIN_MILLIWATTS;
+ else if (cb->get_cost)
+ flags |= EM_PERF_DOMAIN_ARTIFICIAL;
+
+ ret = em_create_pd(dev, nr_states, cb, cpus, flags);
if (ret)
goto unlock;
- if (milliwatts)
- dev->em_pd->flags |= EM_PERF_DOMAIN_MILLIWATTS;
+ dev->em_pd->flags |= flags;
em_cpufreq_update_efficiencies(dev);
diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c
index 938d5c78b421..20a66bf9f465 100644
--- a/kernel/power/hibernate.c
+++ b/kernel/power/hibernate.c
@@ -83,7 +83,7 @@ bool hibernation_available(void)
{
return nohibernate == 0 &&
!security_locked_down(LOCKDOWN_HIBERNATION) &&
- !secretmem_active();
+ !secretmem_active() && !cxl_mem_active();
}
/**
diff --git a/kernel/power/main.c b/kernel/power/main.c
index 7e646079fbeb..e3694034b753 100644
--- a/kernel/power/main.c
+++ b/kernel/power/main.c
@@ -127,7 +127,9 @@ static ssize_t mem_sleep_show(struct kobject *kobj, struct kobj_attribute *attr,
char *s = buf;
suspend_state_t i;
- for (i = PM_SUSPEND_MIN; i < PM_SUSPEND_MAX; i++)
+ for (i = PM_SUSPEND_MIN; i < PM_SUSPEND_MAX; i++) {
+ if (i >= PM_SUSPEND_MEM && cxl_mem_active())
+ continue;
if (mem_sleep_states[i]) {
const char *label = mem_sleep_states[i];
@@ -136,6 +138,7 @@ static ssize_t mem_sleep_show(struct kobject *kobj, struct kobj_attribute *attr,
else
s += sprintf(s, "%s ", label);
}
+ }
/* Convert the last space to a newline if needed. */
if (s != buf)
@@ -545,35 +548,6 @@ static int __init pm_debug_messages_setup(char *str)
}
__setup("pm_debug_messages", pm_debug_messages_setup);
-/**
- * __pm_pr_dbg - Print a suspend debug message to the kernel log.
- * @defer: Whether or not to use printk_deferred() to print the message.
- * @fmt: Message format.
- *
- * The message will be emitted if enabled through the pm_debug_messages
- * sysfs attribute.
- */
-void __pm_pr_dbg(bool defer, const char *fmt, ...)
-{
- struct va_format vaf;
- va_list args;
-
- if (!pm_debug_messages_on)
- return;
-
- va_start(args, fmt);
-
- vaf.fmt = fmt;
- vaf.va = &args;
-
- if (defer)
- printk_deferred(KERN_DEBUG "PM: %pV", &vaf);
- else
- printk(KERN_DEBUG "PM: %pV", &vaf);
-
- va_end(args);
-}
-
#else /* !CONFIG_PM_SLEEP_DEBUG */
static inline void pm_print_times_init(void) {}
#endif /* CONFIG_PM_SLEEP_DEBUG */
diff --git a/kernel/power/process.c b/kernel/power/process.c
index 11b570fcf049..3068601e585a 100644
--- a/kernel/power/process.c
+++ b/kernel/power/process.c
@@ -6,9 +6,6 @@
* Originally from swsusp.
*/
-
-#undef DEBUG
-
#include <linux/interrupt.h>
#include <linux/oom.h>
#include <linux/suspend.h>
diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c
index 330d49937692..2a406753af90 100644
--- a/kernel/power/snapshot.c
+++ b/kernel/power/snapshot.c
@@ -326,7 +326,7 @@ static void *chain_alloc(struct chain_allocator *ca, unsigned int size)
return ret;
}
-/**
+/*
* Data types related to memory bitmaps.
*
* Memory bitmap is a structure consisting of many linked lists of
@@ -427,6 +427,10 @@ struct memory_bitmap {
/**
* alloc_rtree_node - Allocate a new node and add it to the radix tree.
+ * @gfp_mask: GFP mask for the allocation.
+ * @safe_needed: Get pages not used before hibernation (restore only)
+ * @ca: Pointer to a linked list of pages ("a chain") to allocate from
+ * @list: Radix Tree node to add.
*
* This function is used to allocate inner nodes as well as the
* leave nodes of the radix tree. It also adds the node to the
@@ -902,7 +906,7 @@ static bool rtree_next_node(struct memory_bitmap *bm)
}
/**
- * memory_bm_rtree_next_pfn - Find the next set bit in a memory bitmap.
+ * memory_bm_next_pfn - Find the next set bit in a memory bitmap.
* @bm: Memory bitmap.
*
* Starting from the last returned position this function searches for the next
@@ -1937,7 +1941,7 @@ static inline int get_highmem_buffer(int safe_needed)
}
/**
- * alloc_highmem_image_pages - Allocate some highmem pages for the image.
+ * alloc_highmem_pages - Allocate some highmem pages for the image.
*
* Try to allocate as many pages as needed, but if the number of free highmem
* pages is less than that, allocate them all.
@@ -2224,7 +2228,7 @@ static int check_header(struct swsusp_info *info)
}
/**
- * load header - Check the image header and copy the data from it.
+ * load_header - Check the image header and copy the data from it.
*/
static int load_header(struct swsusp_info *info)
{
diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c
index 6fcdee7e87a5..827075944d28 100644
--- a/kernel/power/suspend.c
+++ b/kernel/power/suspend.c
@@ -236,7 +236,8 @@ EXPORT_SYMBOL_GPL(suspend_valid_only_mem);
static bool sleep_state_supported(suspend_state_t state)
{
- return state == PM_SUSPEND_TO_IDLE || valid_state(state);
+ return state == PM_SUSPEND_TO_IDLE ||
+ (valid_state(state) && !cxl_mem_active());
}
static int platform_suspend_prepare(suspend_state_t state)
diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c
index da03c15ecc89..ea3dd55709e7 100644
--- a/kernel/printk/printk.c
+++ b/kernel/printk/printk.c
@@ -224,6 +224,33 @@ int devkmsg_sysctl_set_loglvl(struct ctl_table *table, int write,
static int nr_ext_console_drivers;
/*
+ * Used to synchronize printing kthreads against direct printing via
+ * console_trylock/console_unlock.
+ *
+ * Values:
+ * -1 = console kthreads atomically blocked (via global trylock)
+ * 0 = no kthread printing, console not locked (via trylock)
+ * >0 = kthread(s) actively printing
+ *
+ * Note: For synchronizing against direct printing via
+ * console_lock/console_unlock, see the @lock variable in
+ * struct console.
+ */
+static atomic_t console_kthreads_active = ATOMIC_INIT(0);
+
+#define console_kthreads_atomic_tryblock() \
+ (atomic_cmpxchg(&console_kthreads_active, 0, -1) == 0)
+#define console_kthreads_atomic_unblock() \
+ atomic_cmpxchg(&console_kthreads_active, -1, 0)
+#define console_kthreads_atomically_blocked() \
+ (atomic_read(&console_kthreads_active) == -1)
+
+#define console_kthread_printing_tryenter() \
+ atomic_inc_unless_negative(&console_kthreads_active)
+#define console_kthread_printing_exit() \
+ atomic_dec(&console_kthreads_active)
+
+/*
* Helper macros to handle lockdep when locking/unlocking console_sem. We use
* macros instead of functions so that _RET_IP_ contains useful information.
*/
@@ -271,19 +298,49 @@ static bool panic_in_progress(void)
}
/*
- * This is used for debugging the mess that is the VT code by
- * keeping track if we have the console semaphore held. It's
- * definitely not the perfect debug tool (we don't know if _WE_
- * hold it and are racing, but it helps tracking those weird code
- * paths in the console code where we end up in places I want
- * locked without the console semaphore held).
+ * Tracks whether kthread printers are all blocked. A value of true implies
+ * that the console is locked via console_lock() or the console is suspended.
+ * Writing to this variable requires holding @console_sem.
*/
-static int console_locked, console_suspended;
+static bool console_kthreads_blocked;
+
+/*
+ * Block all kthread printers from a schedulable context.
+ *
+ * Requires holding @console_sem.
+ */
+static void console_kthreads_block(void)
+{
+ struct console *con;
+
+ for_each_console(con) {
+ mutex_lock(&con->lock);
+ con->blocked = true;
+ mutex_unlock(&con->lock);
+ }
+
+ console_kthreads_blocked = true;
+}
/*
- * If exclusive_console is non-NULL then only this console is to be printed to.
+ * Unblock all kthread printers from a schedulable context.
+ *
+ * Requires holding @console_sem.
*/
-static struct console *exclusive_console;
+static void console_kthreads_unblock(void)
+{
+ struct console *con;
+
+ for_each_console(con) {
+ mutex_lock(&con->lock);
+ con->blocked = false;
+ mutex_unlock(&con->lock);
+ }
+
+ console_kthreads_blocked = false;
+}
+
+static int console_suspended;
/*
* Array of consoles built from command line options (console=)
@@ -366,7 +423,75 @@ static int console_msg_format = MSG_FORMAT_DEFAULT;
/* syslog_lock protects syslog_* variables and write access to clear_seq. */
static DEFINE_MUTEX(syslog_lock);
+/*
+ * A flag to signify if printk_activate_kthreads() has already started the
+ * kthread printers. If true, any later registered consoles must start their
+ * own kthread directly. The flag is write protected by the console_lock.
+ */
+static bool printk_kthreads_available;
+
#ifdef CONFIG_PRINTK
+static atomic_t printk_prefer_direct = ATOMIC_INIT(0);
+
+/**
+ * printk_prefer_direct_enter - cause printk() calls to attempt direct
+ * printing to all enabled consoles
+ *
+ * Since it is not possible to call into the console printing code from any
+ * context, there is no guarantee that direct printing will occur.
+ *
+ * This globally effects all printk() callers.
+ *
+ * Context: Any context.
+ */
+void printk_prefer_direct_enter(void)
+{
+ atomic_inc(&printk_prefer_direct);
+}
+
+/**
+ * printk_prefer_direct_exit - restore printk() behavior
+ *
+ * Context: Any context.
+ */
+void printk_prefer_direct_exit(void)
+{
+ WARN_ON(atomic_dec_if_positive(&printk_prefer_direct) < 0);
+}
+
+/*
+ * Calling printk() always wakes kthread printers so that they can
+ * flush the new message to their respective consoles. Also, if direct
+ * printing is allowed, printk() tries to flush the messages directly.
+ *
+ * Direct printing is allowed in situations when the kthreads
+ * are not available or the system is in a problematic state.
+ *
+ * See the implementation about possible races.
+ */
+static inline bool allow_direct_printing(void)
+{
+ /*
+ * Checking kthread availability is a possible race because the
+ * kthread printers can become permanently disabled during runtime.
+ * However, doing that requires holding the console_lock, so any
+ * pending messages will be direct printed by console_unlock().
+ */
+ if (!printk_kthreads_available)
+ return true;
+
+ /*
+ * Prefer direct printing when the system is in a problematic state.
+ * The context that sets this state will always see the updated value.
+ * The other contexts do not care. Anyway, direct printing is just a
+ * best effort. The direct output is only possible when console_lock
+ * is not already taken and no kthread printers are actively printing.
+ */
+ return (system_state > SYSTEM_RUNNING ||
+ oops_in_progress ||
+ atomic_read(&printk_prefer_direct));
+}
+
DECLARE_WAIT_QUEUE_HEAD(log_wait);
/* All 3 protected by @syslog_lock. */
/* the next printk record to read by syslog(READ) or /proc/kmsg */
@@ -374,12 +499,6 @@ static u64 syslog_seq;
static size_t syslog_partial;
static bool syslog_time;
-/* All 3 protected by @console_sem. */
-/* the next printk record to write to the console */
-static u64 console_seq;
-static u64 exclusive_console_stop_seq;
-static unsigned long console_dropped;
-
struct latched_seq {
seqcount_latch_t latch;
u64 val[2];
@@ -405,6 +524,9 @@ static struct latched_seq clear_seq = {
/* the maximum size of a formatted record (i.e. with prefix added per line) */
#define CONSOLE_LOG_MAX 1024
+/* the maximum size for a dropped text message */
+#define DROPPED_TEXT_MAX 64
+
/* the maximum size allowed to be reserved for a record */
#define LOG_LINE_MAX (CONSOLE_LOG_MAX - PREFIX_MAX)
@@ -746,8 +868,19 @@ static ssize_t devkmsg_read(struct file *file, char __user *buf,
goto out;
}
+ /*
+ * Guarantee this task is visible on the waitqueue before
+ * checking the wake condition.
+ *
+ * The full memory barrier within set_current_state() of
+ * prepare_to_wait_event() pairs with the full memory barrier
+ * within wq_has_sleeper().
+ *
+ * This pairs with __wake_up_klogd:A.
+ */
ret = wait_event_interruptible(log_wait,
- prb_read_valid(prb, atomic64_read(&user->seq), r));
+ prb_read_valid(prb,
+ atomic64_read(&user->seq), r)); /* LMM(devkmsg_read:A) */
if (ret)
goto out;
}
@@ -1513,7 +1646,18 @@ static int syslog_print(char __user *buf, int size)
seq = syslog_seq;
mutex_unlock(&syslog_lock);
- len = wait_event_interruptible(log_wait, prb_read_valid(prb, seq, NULL));
+ /*
+ * Guarantee this task is visible on the waitqueue before
+ * checking the wake condition.
+ *
+ * The full memory barrier within set_current_state() of
+ * prepare_to_wait_event() pairs with the full memory barrier
+ * within wq_has_sleeper().
+ *
+ * This pairs with __wake_up_klogd:A.
+ */
+ len = wait_event_interruptible(log_wait,
+ prb_read_valid(prb, seq, NULL)); /* LMM(syslog_print:A) */
mutex_lock(&syslog_lock);
if (len)
@@ -1911,47 +2055,24 @@ static int console_trylock_spinning(void)
}
/*
- * Call the console drivers, asking them to write out
- * log_buf[start] to log_buf[end - 1].
- * The console_lock must be held.
+ * Call the specified console driver, asking it to write out the specified
+ * text and length. If @dropped_text is non-NULL and any records have been
+ * dropped, a dropped message will be written out first.
*/
-static void call_console_drivers(const char *ext_text, size_t ext_len,
- const char *text, size_t len)
+static void call_console_driver(struct console *con, const char *text, size_t len,
+ char *dropped_text)
{
- static char dropped_text[64];
- size_t dropped_len = 0;
- struct console *con;
-
- trace_console_rcuidle(text, len);
+ size_t dropped_len;
- if (!console_drivers)
- return;
-
- if (console_dropped) {
- dropped_len = snprintf(dropped_text, sizeof(dropped_text),
+ if (con->dropped && dropped_text) {
+ dropped_len = snprintf(dropped_text, DROPPED_TEXT_MAX,
"** %lu printk messages dropped **\n",
- console_dropped);
- console_dropped = 0;
+ con->dropped);
+ con->dropped = 0;
+ con->write(con, dropped_text, dropped_len);
}
- for_each_console(con) {
- if (exclusive_console && con != exclusive_console)
- continue;
- if (!(con->flags & CON_ENABLED))
- continue;
- if (!con->write)
- continue;
- if (!cpu_online(smp_processor_id()) &&
- !(con->flags & CON_ANYTIME))
- continue;
- if (con->flags & CON_EXTENDED)
- con->write(con, ext_text, ext_len);
- else {
- if (dropped_len)
- con->write(con, dropped_text, dropped_len);
- con->write(con, text, len);
- }
- }
+ con->write(con, text, len);
}
/*
@@ -2026,8 +2147,10 @@ static u8 *__printk_recursion_counter(void)
int printk_delay_msec __read_mostly;
-static inline void printk_delay(void)
+static inline void printk_delay(int level)
{
+ boot_delay_msec(level);
+
if (unlikely(printk_delay_msec)) {
int m = printk_delay_msec;
@@ -2041,7 +2164,7 @@ static inline void printk_delay(void)
static inline u32 printk_caller_id(void)
{
return in_task() ? task_pid_nr(current) :
- 0x80000000 + raw_smp_processor_id();
+ 0x80000000 + smp_processor_id();
}
/**
@@ -2115,6 +2238,8 @@ static u16 printk_sprint(char *text, u16 size, int facility,
}
}
+ trace_console_rcuidle(text, text_len);
+
return text_len;
}
@@ -2123,7 +2248,6 @@ int vprintk_store(int facility, int level,
const struct dev_printk_info *dev_info,
const char *fmt, va_list args)
{
- const u32 caller_id = printk_caller_id();
struct prb_reserved_entry e;
enum printk_info_flags flags = 0;
struct printk_record r;
@@ -2133,10 +2257,14 @@ int vprintk_store(int facility, int level,
u8 *recursion_ptr;
u16 reserve_size;
va_list args2;
+ u32 caller_id;
u16 text_len;
int ret = 0;
u64 ts_nsec;
+ if (!printk_enter_irqsave(recursion_ptr, irqflags))
+ return 0;
+
/*
* Since the duration of printk() can vary depending on the message
* and state of the ringbuffer, grab the timestamp now so that it is
@@ -2145,8 +2273,7 @@ int vprintk_store(int facility, int level,
*/
ts_nsec = local_clock();
- if (!printk_enter_irqsave(recursion_ptr, irqflags))
- return 0;
+ caller_id = printk_caller_id();
/*
* The sprintf needs to come first since the syslog prefix might be
@@ -2250,23 +2377,25 @@ asmlinkage int vprintk_emit(int facility, int level,
in_sched = true;
}
- boot_delay_msec(level);
- printk_delay();
+ printk_delay(level);
printed_len = vprintk_store(facility, level, dev_info, fmt, args);
/* If called from the scheduler, we can not call up(). */
- if (!in_sched) {
+ if (!in_sched && allow_direct_printing()) {
/*
- * Disable preemption to avoid being preempted while holding
- * console_sem which would prevent anyone from printing to
- * console
+ * The caller may be holding system-critical or
+ * timing-sensitive locks. Disable preemption during direct
+ * printing of all remaining records to all consoles so that
+ * this context can return as soon as possible. Hopefully
+ * another printk() caller will take over the printing.
*/
preempt_disable();
/*
* Try to acquire and then immediately release the console
- * semaphore. The release will print out buffers and wake up
- * /dev/kmsg and syslog() users.
+ * semaphore. The release will print out buffers. With the
+ * spinning variant, this context tries to take over the
+ * printing from another printing context.
*/
if (console_trylock_spinning())
console_unlock();
@@ -2297,18 +2426,21 @@ asmlinkage __visible int _printk(const char *fmt, ...)
}
EXPORT_SYMBOL(_printk);
+static bool __pr_flush(struct console *con, int timeout_ms, bool reset_on_progress);
+
+static void printk_start_kthread(struct console *con);
+
#else /* CONFIG_PRINTK */
#define CONSOLE_LOG_MAX 0
+#define DROPPED_TEXT_MAX 0
#define printk_time false
#define prb_read_valid(rb, seq, r) false
#define prb_first_valid_seq(rb) 0
+#define prb_next_seq(rb) 0
static u64 syslog_seq;
-static u64 console_seq;
-static u64 exclusive_console_stop_seq;
-static unsigned long console_dropped;
static size_t record_print_text(const struct printk_record *r,
bool syslog, bool time)
@@ -2325,9 +2457,14 @@ static ssize_t msg_print_ext_body(char *buf, size_t size,
struct dev_printk_info *dev_info) { return 0; }
static void console_lock_spinning_enable(void) { }
static int console_lock_spinning_disable_and_check(void) { return 0; }
-static void call_console_drivers(const char *ext_text, size_t ext_len,
- const char *text, size_t len) {}
+static void call_console_driver(struct console *con, const char *text, size_t len,
+ char *dropped_text)
+{
+}
static bool suppress_message_printing(int level) { return false; }
+static bool __pr_flush(struct console *con, int timeout_ms, bool reset_on_progress) { return true; }
+static void printk_start_kthread(struct console *con) { }
+static bool allow_direct_printing(void) { return true; }
#endif /* CONFIG_PRINTK */
@@ -2515,6 +2652,7 @@ void suspend_console(void)
if (!console_suspend_enabled)
return;
pr_info("Suspending console(s) (use no_console_suspend to debug)\n");
+ pr_flush(1000, true);
console_lock();
console_suspended = 1;
up_console_sem();
@@ -2527,6 +2665,7 @@ void resume_console(void)
down_console_sem();
console_suspended = 0;
console_unlock();
+ pr_flush(1000, true);
}
/**
@@ -2544,6 +2683,14 @@ static int console_cpu_notify(unsigned int cpu)
/* If trylock fails, someone else is doing the printing */
if (console_trylock())
console_unlock();
+ else {
+ /*
+ * If a new CPU comes online, the conditions for
+ * printer_should_wake() may have changed for some
+ * kthread printer with !CON_ANYTIME.
+ */
+ wake_up_klogd();
+ }
}
return 0;
}
@@ -2563,7 +2710,7 @@ void console_lock(void)
down_console_sem();
if (console_suspended)
return;
- console_locked = 1;
+ console_kthreads_block();
console_may_schedule = 1;
}
EXPORT_SYMBOL(console_lock);
@@ -2584,33 +2731,32 @@ int console_trylock(void)
up_console_sem();
return 0;
}
- console_locked = 1;
+ if (!console_kthreads_atomic_tryblock()) {
+ up_console_sem();
+ return 0;
+ }
console_may_schedule = 0;
return 1;
}
EXPORT_SYMBOL(console_trylock);
-int is_console_locked(void)
-{
- return console_locked;
-}
-EXPORT_SYMBOL(is_console_locked);
-
/*
- * Check if we have any console that is capable of printing while cpu is
- * booting or shutting down. Requires console_sem.
+ * This is used to help to make sure that certain paths within the VT code are
+ * running with the console lock held. It is definitely not the perfect debug
+ * tool (it is not known if the VT code is the task holding the console lock),
+ * but it helps tracking those weird code paths in the console code such as
+ * when the console is suspended: where the console is not locked but no
+ * console printing may occur.
+ *
+ * Note: This returns true when the console is suspended but is not locked.
+ * This is intentional because the VT code must consider that situation
+ * the same as if the console was locked.
*/
-static int have_callable_console(void)
+int is_console_locked(void)
{
- struct console *con;
-
- for_each_console(con)
- if ((con->flags & CON_ENABLED) &&
- (con->flags & CON_ANYTIME))
- return 1;
-
- return 0;
+ return (console_kthreads_blocked || atomic_read(&console_kthreads_active));
}
+EXPORT_SYMBOL(is_console_locked);
/*
* Return true when this CPU should unlock console_sem without pushing all
@@ -2631,132 +2777,125 @@ static bool abandon_console_lock_in_panic(void)
return atomic_read(&panic_cpu) != raw_smp_processor_id();
}
+static inline bool __console_is_usable(short flags)
+{
+ if (!(flags & CON_ENABLED))
+ return false;
+
+ /*
+ * Console drivers may assume that per-cpu resources have been
+ * allocated. So unless they're explicitly marked as being able to
+ * cope (CON_ANYTIME) don't call them until this CPU is officially up.
+ */
+ if (!cpu_online(raw_smp_processor_id()) &&
+ !(flags & CON_ANYTIME))
+ return false;
+
+ return true;
+}
+
/*
- * Can we actually use the console at this time on this cpu?
+ * Check if the given console is currently capable and allowed to print
+ * records.
*
- * Console drivers may assume that per-cpu resources have been allocated. So
- * unless they're explicitly marked as being able to cope (CON_ANYTIME) don't
- * call them until this CPU is officially up.
+ * Requires holding the console_lock.
*/
-static inline int can_use_console(void)
+static inline bool console_is_usable(struct console *con)
{
- return cpu_online(raw_smp_processor_id()) || have_callable_console();
+ if (!con->write)
+ return false;
+
+ return __console_is_usable(con->flags);
}
-/**
- * console_unlock - unlock the console system
+static void __console_unlock(void)
+{
+ /*
+ * Depending on whether console_lock() or console_trylock() was used,
+ * appropriately allow the kthread printers to continue.
+ */
+ if (console_kthreads_blocked)
+ console_kthreads_unblock();
+ else
+ console_kthreads_atomic_unblock();
+
+ /*
+ * New records may have arrived while the console was locked.
+ * Wake the kthread printers to print them.
+ */
+ wake_up_klogd();
+
+ up_console_sem();
+}
+
+/*
+ * Print one record for the given console. The record printed is whatever
+ * record is the next available record for the given console.
*
- * Releases the console_lock which the caller holds on the console system
- * and the console driver list.
+ * @text is a buffer of size CONSOLE_LOG_MAX.
*
- * While the console_lock was held, console output may have been buffered
- * by printk(). If this is the case, console_unlock(); emits
- * the output prior to releasing the lock.
+ * If extended messages should be printed, @ext_text is a buffer of size
+ * CONSOLE_EXT_LOG_MAX. Otherwise @ext_text must be NULL.
*
- * If there is output waiting, we wake /dev/kmsg and syslog() users.
+ * If dropped messages should be printed, @dropped_text is a buffer of size
+ * DROPPED_TEXT_MAX. Otherwise @dropped_text must be NULL.
*
- * console_unlock(); may be called from any context.
+ * @handover will be set to true if a printk waiter has taken over the
+ * console_lock, in which case the caller is no longer holding the
+ * console_lock. Otherwise it is set to false. A NULL pointer may be provided
+ * to disable allowing the console_lock to be taken over by a printk waiter.
+ *
+ * Returns false if the given console has no next record to print, otherwise
+ * true.
+ *
+ * Requires the console_lock if @handover is non-NULL.
+ * Requires con->lock otherwise.
*/
-void console_unlock(void)
+static bool __console_emit_next_record(struct console *con, char *text, char *ext_text,
+ char *dropped_text, bool *handover)
{
- static char ext_text[CONSOLE_EXT_LOG_MAX];
- static char text[CONSOLE_LOG_MAX];
- static int panic_console_dropped;
- unsigned long flags;
- bool do_cond_resched, retry;
+ static atomic_t panic_console_dropped = ATOMIC_INIT(0);
struct printk_info info;
struct printk_record r;
- u64 __maybe_unused next_seq;
-
- if (console_suspended) {
- up_console_sem();
- return;
- }
-
- prb_rec_init_rd(&r, &info, text, sizeof(text));
-
- /*
- * Console drivers are called with interrupts disabled, so
- * @console_may_schedule should be cleared before; however, we may
- * end up dumping a lot of lines, for example, if called from
- * console registration path, and should invoke cond_resched()
- * between lines if allowable. Not doing so can cause a very long
- * scheduling stall on a slow console leading to RCU stall and
- * softlockup warnings which exacerbate the issue with more
- * messages practically incapacitating the system.
- *
- * console_trylock() is not able to detect the preemptive
- * context reliably. Therefore the value must be stored before
- * and cleared after the "again" goto label.
- */
- do_cond_resched = console_may_schedule;
-again:
- console_may_schedule = 0;
-
- /*
- * We released the console_sem lock, so we need to recheck if
- * cpu is online and (if not) is there at least one CON_ANYTIME
- * console.
- */
- if (!can_use_console()) {
- console_locked = 0;
- up_console_sem();
- return;
- }
+ unsigned long flags;
+ char *write_text;
+ size_t len;
- for (;;) {
- size_t ext_len = 0;
- int handover;
- size_t len;
+ prb_rec_init_rd(&r, &info, text, CONSOLE_LOG_MAX);
-skip:
- if (!prb_read_valid(prb, console_seq, &r))
- break;
+ if (handover)
+ *handover = false;
- if (console_seq != r.info->seq) {
- console_dropped += r.info->seq - console_seq;
- console_seq = r.info->seq;
- if (panic_in_progress() && panic_console_dropped++ > 10) {
- suppress_panic_printk = 1;
- pr_warn_once("Too many dropped messages. Suppress messages on non-panic CPUs to prevent livelock.\n");
- }
- }
+ if (!prb_read_valid(prb, con->seq, &r))
+ return false;
- if (suppress_message_printing(r.info->level)) {
- /*
- * Skip record we have buffered and already printed
- * directly to the console when we received it, and
- * record that has level above the console loglevel.
- */
- console_seq++;
- goto skip;
+ if (con->seq != r.info->seq) {
+ con->dropped += r.info->seq - con->seq;
+ con->seq = r.info->seq;
+ if (panic_in_progress() &&
+ atomic_fetch_inc_relaxed(&panic_console_dropped) > 10) {
+ suppress_panic_printk = 1;
+ pr_warn_once("Too many dropped messages. Suppress messages on non-panic CPUs to prevent livelock.\n");
}
+ }
- /* Output to all consoles once old messages replayed. */
- if (unlikely(exclusive_console &&
- console_seq >= exclusive_console_stop_seq)) {
- exclusive_console = NULL;
- }
+ /* Skip record that has level above the console loglevel. */
+ if (suppress_message_printing(r.info->level)) {
+ con->seq++;
+ goto skip;
+ }
- /*
- * Handle extended console text first because later
- * record_print_text() will modify the record buffer in-place.
- */
- if (nr_ext_console_drivers) {
- ext_len = info_print_ext_header(ext_text,
- sizeof(ext_text),
- r.info);
- ext_len += msg_print_ext_body(ext_text + ext_len,
- sizeof(ext_text) - ext_len,
- &r.text_buf[0],
- r.info->text_len,
- &r.info->dev_info);
- }
- len = record_print_text(&r,
- console_msg_format & MSG_FORMAT_SYSLOG,
- printk_time);
- console_seq++;
+ if (ext_text) {
+ write_text = ext_text;
+ len = info_print_ext_header(ext_text, CONSOLE_EXT_LOG_MAX, r.info);
+ len += msg_print_ext_body(ext_text + len, CONSOLE_EXT_LOG_MAX - len,
+ &r.text_buf[0], r.info->text_len, &r.info->dev_info);
+ } else {
+ write_text = text;
+ len = record_print_text(&r, console_msg_format & MSG_FORMAT_SYSLOG, printk_time);
+ }
+ if (handover) {
/*
* While actively printing out messages, if another printk()
* were to occur on another CPU, it may wait for this one to
@@ -2770,38 +2909,187 @@ skip:
printk_safe_enter_irqsave(flags);
console_lock_spinning_enable();
- stop_critical_timings(); /* don't trace print latency */
- call_console_drivers(ext_text, ext_len, text, len);
- start_critical_timings();
+ /* don't trace irqsoff print latency */
+ stop_critical_timings();
+ }
- handover = console_lock_spinning_disable_and_check();
- printk_safe_exit_irqrestore(flags);
- if (handover)
- return;
+ call_console_driver(con, write_text, len, dropped_text);
- /* Allow panic_cpu to take over the consoles safely */
- if (abandon_console_lock_in_panic())
- break;
+ con->seq++;
+
+ if (handover) {
+ start_critical_timings();
+ *handover = console_lock_spinning_disable_and_check();
+ printk_safe_exit_irqrestore(flags);
+ }
+skip:
+ return true;
+}
- if (do_cond_resched)
- cond_resched();
+/*
+ * Print a record for a given console, but allow another printk() caller to
+ * take over the console_lock and continue printing.
+ *
+ * Requires the console_lock, but depending on @handover after the call, the
+ * caller may no longer have the console_lock.
+ *
+ * See __console_emit_next_record() for argument and return details.
+ */
+static bool console_emit_next_record_transferable(struct console *con, char *text, char *ext_text,
+ char *dropped_text, bool *handover)
+{
+ /*
+ * Handovers are only supported if threaded printers are atomically
+ * blocked. The context taking over the console_lock may be atomic.
+ */
+ if (!console_kthreads_atomically_blocked()) {
+ *handover = false;
+ handover = NULL;
}
- /* Get consistent value of the next-to-be-used sequence number. */
- next_seq = console_seq;
+ return __console_emit_next_record(con, text, ext_text, dropped_text, handover);
+}
- console_locked = 0;
- up_console_sem();
+/*
+ * Print out all remaining records to all consoles.
+ *
+ * @do_cond_resched is set by the caller. It can be true only in schedulable
+ * context.
+ *
+ * @next_seq is set to the sequence number after the last available record.
+ * The value is valid only when this function returns true. It means that all
+ * usable consoles are completely flushed.
+ *
+ * @handover will be set to true if a printk waiter has taken over the
+ * console_lock, in which case the caller is no longer holding the
+ * console_lock. Otherwise it is set to false.
+ *
+ * Returns true when there was at least one usable console and all messages
+ * were flushed to all usable consoles. A returned false informs the caller
+ * that everything was not flushed (either there were no usable consoles or
+ * another context has taken over printing or it is a panic situation and this
+ * is not the panic CPU or direct printing is not preferred). Regardless the
+ * reason, the caller should assume it is not useful to immediately try again.
+ *
+ * Requires the console_lock.
+ */
+static bool console_flush_all(bool do_cond_resched, u64 *next_seq, bool *handover)
+{
+ static char dropped_text[DROPPED_TEXT_MAX];
+ static char ext_text[CONSOLE_EXT_LOG_MAX];
+ static char text[CONSOLE_LOG_MAX];
+ bool any_usable = false;
+ struct console *con;
+ bool any_progress;
+
+ *next_seq = 0;
+ *handover = false;
+
+ do {
+ /* Let the kthread printers do the work if they can. */
+ if (!allow_direct_printing())
+ return false;
+
+ any_progress = false;
+
+ for_each_console(con) {
+ bool progress;
+
+ if (!console_is_usable(con))
+ continue;
+ any_usable = true;
+
+ if (con->flags & CON_EXTENDED) {
+ /* Extended consoles do not print "dropped messages". */
+ progress = console_emit_next_record_transferable(con, &text[0],
+ &ext_text[0], NULL, handover);
+ } else {
+ progress = console_emit_next_record_transferable(con, &text[0],
+ NULL, &dropped_text[0], handover);
+ }
+ if (*handover)
+ return false;
+
+ /* Track the next of the highest seq flushed. */
+ if (con->seq > *next_seq)
+ *next_seq = con->seq;
+
+ if (!progress)
+ continue;
+ any_progress = true;
+
+ /* Allow panic_cpu to take over the consoles safely. */
+ if (abandon_console_lock_in_panic())
+ return false;
+
+ if (do_cond_resched)
+ cond_resched();
+ }
+ } while (any_progress);
+
+ return any_usable;
+}
+
+/**
+ * console_unlock - unlock the console system
+ *
+ * Releases the console_lock which the caller holds on the console system
+ * and the console driver list.
+ *
+ * While the console_lock was held, console output may have been buffered
+ * by printk(). If this is the case, console_unlock(); emits
+ * the output prior to releasing the lock.
+ *
+ * console_unlock(); may be called from any context.
+ */
+void console_unlock(void)
+{
+ bool do_cond_resched;
+ bool handover;
+ bool flushed;
+ u64 next_seq;
+
+ if (console_suspended) {
+ up_console_sem();
+ return;
+ }
/*
- * Someone could have filled up the buffer again, so re-check if there's
- * something to flush. In case we cannot trylock the console_sem again,
- * there's a new owner and the console_unlock() from them will do the
- * flush, no worries.
+ * Console drivers are called with interrupts disabled, so
+ * @console_may_schedule should be cleared before; however, we may
+ * end up dumping a lot of lines, for example, if called from
+ * console registration path, and should invoke cond_resched()
+ * between lines if allowable. Not doing so can cause a very long
+ * scheduling stall on a slow console leading to RCU stall and
+ * softlockup warnings which exacerbate the issue with more
+ * messages practically incapacitating the system. Therefore, create
+ * a local to use for the printing loop.
*/
- retry = prb_read_valid(prb, next_seq, NULL);
- if (retry && !abandon_console_lock_in_panic() && console_trylock())
- goto again;
+ do_cond_resched = console_may_schedule;
+
+ do {
+ console_may_schedule = 0;
+
+ flushed = console_flush_all(do_cond_resched, &next_seq, &handover);
+ if (!handover)
+ __console_unlock();
+
+ /*
+ * Abort if there was a failure to flush all messages to all
+ * usable consoles. Either it is not possible to flush (in
+ * which case it would be an infinite loop of retrying) or
+ * another context has taken over printing.
+ */
+ if (!flushed)
+ break;
+
+ /*
+ * Some context may have added new records after
+ * console_flush_all() but before unlocking the console.
+ * Re-check if there is a new record to flush. If the trylock
+ * fails, another context is already handling the printing.
+ */
+ } while (prb_read_valid(prb, next_seq, NULL) && console_trylock());
}
EXPORT_SYMBOL(console_unlock);
@@ -2832,15 +3120,21 @@ void console_unblank(void)
if (oops_in_progress) {
if (down_trylock_console_sem() != 0)
return;
+ if (!console_kthreads_atomic_tryblock()) {
+ up_console_sem();
+ return;
+ }
} else
console_lock();
- console_locked = 1;
console_may_schedule = 0;
for_each_console(c)
if ((c->flags & CON_ENABLED) && c->unblank)
c->unblank();
console_unlock();
+
+ if (!oops_in_progress)
+ pr_flush(1000, true);
}
/**
@@ -2861,8 +3155,14 @@ void console_flush_on_panic(enum con_flush_mode mode)
console_trylock();
console_may_schedule = 0;
- if (mode == CONSOLE_REPLAY_ALL)
- console_seq = prb_first_valid_seq(prb);
+ if (mode == CONSOLE_REPLAY_ALL) {
+ struct console *c;
+ u64 seq;
+
+ seq = prb_first_valid_seq(prb);
+ for_each_console(c)
+ c->seq = seq;
+ }
console_unlock();
}
@@ -2893,6 +3193,7 @@ struct tty_driver *console_device(int *index)
*/
void console_stop(struct console *console)
{
+ __pr_flush(console, 1000, true);
console_lock();
console->flags &= ~CON_ENABLED;
console_unlock();
@@ -2904,6 +3205,7 @@ void console_start(struct console *console)
console_lock();
console->flags |= CON_ENABLED;
console_unlock();
+ __pr_flush(console, 1000, true);
}
EXPORT_SYMBOL(console_start);
@@ -2990,6 +3292,11 @@ static void try_enable_default_console(struct console *newcon)
newcon->flags |= CON_CONSDEV;
}
+#define con_printk(lvl, con, fmt, ...) \
+ printk(lvl pr_fmt("%sconsole [%s%d] " fmt), \
+ (con->flags & CON_BOOT) ? "boot" : "", \
+ con->name, con->index, ##__VA_ARGS__)
+
/*
* The console driver calls this routine during kernel initialization
* to register the console printing procedure with printk() and to
@@ -3097,27 +3404,24 @@ void register_console(struct console *newcon)
if (newcon->flags & CON_EXTENDED)
nr_ext_console_drivers++;
- if (newcon->flags & CON_PRINTBUFFER) {
- /*
- * console_unlock(); will print out the buffered messages
- * for us.
- *
- * We're about to replay the log buffer. Only do this to the
- * just-registered console to avoid excessive message spam to
- * the already-registered consoles.
- *
- * Set exclusive_console with disabled interrupts to reduce
- * race window with eventual console_flush_on_panic() that
- * ignores console_lock.
- */
- exclusive_console = newcon;
- exclusive_console_stop_seq = console_seq;
+ newcon->dropped = 0;
+ newcon->thread = NULL;
+ newcon->blocked = true;
+ mutex_init(&newcon->lock);
+ if (newcon->flags & CON_PRINTBUFFER) {
/* Get a consistent copy of @syslog_seq. */
mutex_lock(&syslog_lock);
- console_seq = syslog_seq;
+ newcon->seq = syslog_seq;
mutex_unlock(&syslog_lock);
+ } else {
+ /* Begin with next message. */
+ newcon->seq = prb_next_seq(prb);
}
+
+ if (printk_kthreads_available)
+ printk_start_kthread(newcon);
+
console_unlock();
console_sysfs_notify();
@@ -3128,9 +3432,7 @@ void register_console(struct console *newcon)
* users know there might be something in the kernel's log buffer that
* went to the bootconsole (that they do not see on the real console)
*/
- pr_info("%sconsole [%s%d] enabled\n",
- (newcon->flags & CON_BOOT) ? "boot" : "" ,
- newcon->name, newcon->index);
+ con_printk(KERN_INFO, newcon, "enabled\n");
if (bootcon_enabled &&
((newcon->flags & (CON_CONSDEV | CON_BOOT)) == CON_CONSDEV) &&
!keep_bootcon) {
@@ -3146,12 +3448,11 @@ EXPORT_SYMBOL(register_console);
int unregister_console(struct console *console)
{
+ struct task_struct *thd;
struct console *con;
int res;
- pr_info("%sconsole [%s%d] disabled\n",
- (console->flags & CON_BOOT) ? "boot" : "" ,
- console->name, console->index);
+ con_printk(KERN_INFO, console, "disabled\n");
res = _braille_unregister_console(console);
if (res < 0)
@@ -3188,7 +3489,20 @@ int unregister_console(struct console *console)
console_drivers->flags |= CON_CONSDEV;
console->flags &= ~CON_ENABLED;
+
+ /*
+ * console->thread can only be cleared under the console lock. But
+ * stopping the thread must be done without the console lock. The
+ * task that clears @thread is the task that stops the kthread.
+ */
+ thd = console->thread;
+ console->thread = NULL;
+
console_unlock();
+
+ if (thd)
+ kthread_stop(thd);
+
console_sysfs_notify();
if (console->exit)
@@ -3284,12 +3598,294 @@ static int __init printk_late_init(void)
}
late_initcall(printk_late_init);
+static int __init printk_activate_kthreads(void)
+{
+ struct console *con;
+
+ console_lock();
+ printk_kthreads_available = true;
+ for_each_console(con)
+ printk_start_kthread(con);
+ console_unlock();
+
+ return 0;
+}
+early_initcall(printk_activate_kthreads);
+
#if defined CONFIG_PRINTK
+/* If @con is specified, only wait for that console. Otherwise wait for all. */
+static bool __pr_flush(struct console *con, int timeout_ms, bool reset_on_progress)
+{
+ int remaining = timeout_ms;
+ struct console *c;
+ u64 last_diff = 0;
+ u64 printk_seq;
+ u64 diff;
+ u64 seq;
+
+ might_sleep();
+
+ seq = prb_next_seq(prb);
+
+ for (;;) {
+ diff = 0;
+
+ console_lock();
+ for_each_console(c) {
+ if (con && con != c)
+ continue;
+ if (!console_is_usable(c))
+ continue;
+ printk_seq = c->seq;
+ if (printk_seq < seq)
+ diff += seq - printk_seq;
+ }
+ console_unlock();
+
+ if (diff != last_diff && reset_on_progress)
+ remaining = timeout_ms;
+
+ if (diff == 0 || remaining == 0)
+ break;
+
+ if (remaining < 0) {
+ /* no timeout limit */
+ msleep(100);
+ } else if (remaining < 100) {
+ msleep(remaining);
+ remaining = 0;
+ } else {
+ msleep(100);
+ remaining -= 100;
+ }
+
+ last_diff = diff;
+ }
+
+ return (diff == 0);
+}
+
+/**
+ * pr_flush() - Wait for printing threads to catch up.
+ *
+ * @timeout_ms: The maximum time (in ms) to wait.
+ * @reset_on_progress: Reset the timeout if forward progress is seen.
+ *
+ * A value of 0 for @timeout_ms means no waiting will occur. A value of -1
+ * represents infinite waiting.
+ *
+ * If @reset_on_progress is true, the timeout will be reset whenever any
+ * printer has been seen to make some forward progress.
+ *
+ * Context: Process context. May sleep while acquiring console lock.
+ * Return: true if all enabled printers are caught up.
+ */
+bool pr_flush(int timeout_ms, bool reset_on_progress)
+{
+ return __pr_flush(NULL, timeout_ms, reset_on_progress);
+}
+EXPORT_SYMBOL(pr_flush);
+
+static void __printk_fallback_preferred_direct(void)
+{
+ printk_prefer_direct_enter();
+ pr_err("falling back to preferred direct printing\n");
+ printk_kthreads_available = false;
+}
+
+/*
+ * Enter preferred direct printing, but never exit. Mark console threads as
+ * unavailable. The system is then forever in preferred direct printing and
+ * any printing threads will exit.
+ *
+ * Must *not* be called under console_lock. Use
+ * __printk_fallback_preferred_direct() if already holding console_lock.
+ */
+static void printk_fallback_preferred_direct(void)
+{
+ console_lock();
+ __printk_fallback_preferred_direct();
+ console_unlock();
+}
+
+/*
+ * Print a record for a given console, not allowing another printk() caller
+ * to take over. This is appropriate for contexts that do not have the
+ * console_lock.
+ *
+ * See __console_emit_next_record() for argument and return details.
+ */
+static bool console_emit_next_record(struct console *con, char *text, char *ext_text,
+ char *dropped_text)
+{
+ return __console_emit_next_record(con, text, ext_text, dropped_text, NULL);
+}
+
+static bool printer_should_wake(struct console *con, u64 seq)
+{
+ short flags;
+
+ if (kthread_should_stop() || !printk_kthreads_available)
+ return true;
+
+ if (con->blocked ||
+ console_kthreads_atomically_blocked()) {
+ return false;
+ }
+
+ /*
+ * This is an unsafe read from con->flags, but a false positive is
+ * not a problem. Worst case it would allow the printer to wake up
+ * although it is disabled. But the printer will notice that when
+ * attempting to print and instead go back to sleep.
+ */
+ flags = data_race(READ_ONCE(con->flags));
+
+ if (!__console_is_usable(flags))
+ return false;
+
+ return prb_read_valid(prb, seq, NULL);
+}
+
+static int printk_kthread_func(void *data)
+{
+ struct console *con = data;
+ char *dropped_text = NULL;
+ char *ext_text = NULL;
+ u64 seq = 0;
+ char *text;
+ int error;
+
+ text = kmalloc(CONSOLE_LOG_MAX, GFP_KERNEL);
+ if (!text) {
+ con_printk(KERN_ERR, con, "failed to allocate text buffer\n");
+ printk_fallback_preferred_direct();
+ goto out;
+ }
+
+ if (con->flags & CON_EXTENDED) {
+ ext_text = kmalloc(CONSOLE_EXT_LOG_MAX, GFP_KERNEL);
+ if (!ext_text) {
+ con_printk(KERN_ERR, con, "failed to allocate ext_text buffer\n");
+ printk_fallback_preferred_direct();
+ goto out;
+ }
+ } else {
+ dropped_text = kmalloc(DROPPED_TEXT_MAX, GFP_KERNEL);
+ if (!dropped_text) {
+ con_printk(KERN_ERR, con, "failed to allocate dropped_text buffer\n");
+ printk_fallback_preferred_direct();
+ goto out;
+ }
+ }
+
+ con_printk(KERN_INFO, con, "printing thread started\n");
+
+ for (;;) {
+ /*
+ * Guarantee this task is visible on the waitqueue before
+ * checking the wake condition.
+ *
+ * The full memory barrier within set_current_state() of
+ * prepare_to_wait_event() pairs with the full memory barrier
+ * within wq_has_sleeper().
+ *
+ * This pairs with __wake_up_klogd:A.
+ */
+ error = wait_event_interruptible(log_wait,
+ printer_should_wake(con, seq)); /* LMM(printk_kthread_func:A) */
+
+ if (kthread_should_stop() || !printk_kthreads_available)
+ break;
+
+ if (error)
+ continue;
+
+ error = mutex_lock_interruptible(&con->lock);
+ if (error)
+ continue;
+
+ if (con->blocked ||
+ !console_kthread_printing_tryenter()) {
+ /* Another context has locked the console_lock. */
+ mutex_unlock(&con->lock);
+ continue;
+ }
+
+ /*
+ * Although this context has not locked the console_lock, it
+ * is known that the console_lock is not locked and it is not
+ * possible for any other context to lock the console_lock.
+ * Therefore it is safe to read con->flags.
+ */
+
+ if (!__console_is_usable(con->flags)) {
+ console_kthread_printing_exit();
+ mutex_unlock(&con->lock);
+ continue;
+ }
+
+ /*
+ * Even though the printk kthread is always preemptible, it is
+ * still not allowed to call cond_resched() from within
+ * console drivers. The task may become non-preemptible in the
+ * console driver call chain. For example, vt_console_print()
+ * takes a spinlock and then can call into fbcon_redraw(),
+ * which can conditionally invoke cond_resched().
+ */
+ console_may_schedule = 0;
+ console_emit_next_record(con, text, ext_text, dropped_text);
+
+ seq = con->seq;
+
+ console_kthread_printing_exit();
+
+ mutex_unlock(&con->lock);
+ }
+
+ con_printk(KERN_INFO, con, "printing thread stopped\n");
+out:
+ kfree(dropped_text);
+ kfree(ext_text);
+ kfree(text);
+
+ console_lock();
+ /*
+ * If this kthread is being stopped by another task, con->thread will
+ * already be NULL. That is fine. The important thing is that it is
+ * NULL after the kthread exits.
+ */
+ con->thread = NULL;
+ console_unlock();
+
+ return 0;
+}
+
+/* Must be called under console_lock. */
+static void printk_start_kthread(struct console *con)
+{
+ /*
+ * Do not start a kthread if there is no write() callback. The
+ * kthreads assume the write() callback exists.
+ */
+ if (!con->write)
+ return;
+
+ con->thread = kthread_run(printk_kthread_func, con,
+ "pr/%s%d", con->name, con->index);
+ if (IS_ERR(con->thread)) {
+ con->thread = NULL;
+ con_printk(KERN_ERR, con, "unable to start printing thread\n");
+ __printk_fallback_preferred_direct();
+ return;
+ }
+}
+
/*
* Delayed printk version, for scheduler-internal messages:
*/
-#define PRINTK_PENDING_WAKEUP 0x01
-#define PRINTK_PENDING_OUTPUT 0x02
+#define PRINTK_PENDING_WAKEUP 0x01
+#define PRINTK_PENDING_DIRECT_OUTPUT 0x02
static DEFINE_PER_CPU(int, printk_pending);
@@ -3297,10 +3893,14 @@ static void wake_up_klogd_work_func(struct irq_work *irq_work)
{
int pending = this_cpu_xchg(printk_pending, 0);
- if (pending & PRINTK_PENDING_OUTPUT) {
+ if (pending & PRINTK_PENDING_DIRECT_OUTPUT) {
+ printk_prefer_direct_enter();
+
/* If trylock fails, someone else is doing the printing */
if (console_trylock())
console_unlock();
+
+ printk_prefer_direct_exit();
}
if (pending & PRINTK_PENDING_WAKEUP)
@@ -3310,28 +3910,54 @@ static void wake_up_klogd_work_func(struct irq_work *irq_work)
static DEFINE_PER_CPU(struct irq_work, wake_up_klogd_work) =
IRQ_WORK_INIT_LAZY(wake_up_klogd_work_func);
-void wake_up_klogd(void)
+static void __wake_up_klogd(int val)
{
if (!printk_percpu_data_ready())
return;
preempt_disable();
- if (waitqueue_active(&log_wait)) {
- this_cpu_or(printk_pending, PRINTK_PENDING_WAKEUP);
+ /*
+ * Guarantee any new records can be seen by tasks preparing to wait
+ * before this context checks if the wait queue is empty.
+ *
+ * The full memory barrier within wq_has_sleeper() pairs with the full
+ * memory barrier within set_current_state() of
+ * prepare_to_wait_event(), which is called after ___wait_event() adds
+ * the waiter but before it has checked the wait condition.
+ *
+ * This pairs with devkmsg_read:A, syslog_print:A, and
+ * printk_kthread_func:A.
+ */
+ if (wq_has_sleeper(&log_wait) || /* LMM(__wake_up_klogd:A) */
+ (val & PRINTK_PENDING_DIRECT_OUTPUT)) {
+ this_cpu_or(printk_pending, val);
irq_work_queue(this_cpu_ptr(&wake_up_klogd_work));
}
preempt_enable();
}
+void wake_up_klogd(void)
+{
+ __wake_up_klogd(PRINTK_PENDING_WAKEUP);
+}
+
void defer_console_output(void)
{
- if (!printk_percpu_data_ready())
- return;
+ /*
+ * New messages may have been added directly to the ringbuffer
+ * using vprintk_store(), so wake any waiters as well.
+ */
+ int val = PRINTK_PENDING_WAKEUP;
- preempt_disable();
- this_cpu_or(printk_pending, PRINTK_PENDING_OUTPUT);
- irq_work_queue(this_cpu_ptr(&wake_up_klogd_work));
- preempt_enable();
+ /*
+ * Make sure that some context will print the messages when direct
+ * printing is allowed. This happens in situations when the kthreads
+ * may not be as reliable or perhaps unusable.
+ */
+ if (allow_direct_printing())
+ val |= PRINTK_PENDING_DIRECT_OUTPUT;
+
+ __wake_up_klogd(val);
}
void printk_trigger_flush(void)
@@ -3667,26 +4293,26 @@ EXPORT_SYMBOL_GPL(kmsg_dump_rewind);
#endif
#ifdef CONFIG_SMP
-static atomic_t printk_cpulock_owner = ATOMIC_INIT(-1);
-static atomic_t printk_cpulock_nested = ATOMIC_INIT(0);
+static atomic_t printk_cpu_sync_owner = ATOMIC_INIT(-1);
+static atomic_t printk_cpu_sync_nested = ATOMIC_INIT(0);
/**
- * __printk_wait_on_cpu_lock() - Busy wait until the printk cpu-reentrant
- * spinning lock is not owned by any CPU.
+ * __printk_cpu_sync_wait() - Busy wait until the printk cpu-reentrant
+ * spinning lock is not owned by any CPU.
*
* Context: Any context.
*/
-void __printk_wait_on_cpu_lock(void)
+void __printk_cpu_sync_wait(void)
{
do {
cpu_relax();
- } while (atomic_read(&printk_cpulock_owner) != -1);
+ } while (atomic_read(&printk_cpu_sync_owner) != -1);
}
-EXPORT_SYMBOL(__printk_wait_on_cpu_lock);
+EXPORT_SYMBOL(__printk_cpu_sync_wait);
/**
- * __printk_cpu_trylock() - Try to acquire the printk cpu-reentrant
- * spinning lock.
+ * __printk_cpu_sync_try_get() - Try to acquire the printk cpu-reentrant
+ * spinning lock.
*
* If no processor has the lock, the calling processor takes the lock and
* becomes the owner. If the calling processor is already the owner of the
@@ -3695,7 +4321,7 @@ EXPORT_SYMBOL(__printk_wait_on_cpu_lock);
* Context: Any context. Expects interrupts to be disabled.
* Return: 1 on success, otherwise 0.
*/
-int __printk_cpu_trylock(void)
+int __printk_cpu_sync_try_get(void)
{
int cpu;
int old;
@@ -3705,79 +4331,80 @@ int __printk_cpu_trylock(void)
/*
* Guarantee loads and stores from this CPU when it is the lock owner
* are _not_ visible to the previous lock owner. This pairs with
- * __printk_cpu_unlock:B.
+ * __printk_cpu_sync_put:B.
*
* Memory barrier involvement:
*
- * If __printk_cpu_trylock:A reads from __printk_cpu_unlock:B, then
- * __printk_cpu_unlock:A can never read from __printk_cpu_trylock:B.
+ * If __printk_cpu_sync_try_get:A reads from __printk_cpu_sync_put:B,
+ * then __printk_cpu_sync_put:A can never read from
+ * __printk_cpu_sync_try_get:B.
*
* Relies on:
*
- * RELEASE from __printk_cpu_unlock:A to __printk_cpu_unlock:B
+ * RELEASE from __printk_cpu_sync_put:A to __printk_cpu_sync_put:B
* of the previous CPU
* matching
- * ACQUIRE from __printk_cpu_trylock:A to __printk_cpu_trylock:B
- * of this CPU
+ * ACQUIRE from __printk_cpu_sync_try_get:A to
+ * __printk_cpu_sync_try_get:B of this CPU
*/
- old = atomic_cmpxchg_acquire(&printk_cpulock_owner, -1,
- cpu); /* LMM(__printk_cpu_trylock:A) */
+ old = atomic_cmpxchg_acquire(&printk_cpu_sync_owner, -1,
+ cpu); /* LMM(__printk_cpu_sync_try_get:A) */
if (old == -1) {
/*
* This CPU is now the owner and begins loading/storing
- * data: LMM(__printk_cpu_trylock:B)
+ * data: LMM(__printk_cpu_sync_try_get:B)
*/
return 1;
} else if (old == cpu) {
/* This CPU is already the owner. */
- atomic_inc(&printk_cpulock_nested);
+ atomic_inc(&printk_cpu_sync_nested);
return 1;
}
return 0;
}
-EXPORT_SYMBOL(__printk_cpu_trylock);
+EXPORT_SYMBOL(__printk_cpu_sync_try_get);
/**
- * __printk_cpu_unlock() - Release the printk cpu-reentrant spinning lock.
+ * __printk_cpu_sync_put() - Release the printk cpu-reentrant spinning lock.
*
* The calling processor must be the owner of the lock.
*
* Context: Any context. Expects interrupts to be disabled.
*/
-void __printk_cpu_unlock(void)
+void __printk_cpu_sync_put(void)
{
- if (atomic_read(&printk_cpulock_nested)) {
- atomic_dec(&printk_cpulock_nested);
+ if (atomic_read(&printk_cpu_sync_nested)) {
+ atomic_dec(&printk_cpu_sync_nested);
return;
}
/*
* This CPU is finished loading/storing data:
- * LMM(__printk_cpu_unlock:A)
+ * LMM(__printk_cpu_sync_put:A)
*/
/*
* Guarantee loads and stores from this CPU when it was the
* lock owner are visible to the next lock owner. This pairs
- * with __printk_cpu_trylock:A.
+ * with __printk_cpu_sync_try_get:A.
*
* Memory barrier involvement:
*
- * If __printk_cpu_trylock:A reads from __printk_cpu_unlock:B,
- * then __printk_cpu_trylock:B reads from __printk_cpu_unlock:A.
+ * If __printk_cpu_sync_try_get:A reads from __printk_cpu_sync_put:B,
+ * then __printk_cpu_sync_try_get:B reads from __printk_cpu_sync_put:A.
*
* Relies on:
*
- * RELEASE from __printk_cpu_unlock:A to __printk_cpu_unlock:B
+ * RELEASE from __printk_cpu_sync_put:A to __printk_cpu_sync_put:B
* of this CPU
* matching
- * ACQUIRE from __printk_cpu_trylock:A to __printk_cpu_trylock:B
- * of the next CPU
+ * ACQUIRE from __printk_cpu_sync_try_get:A to
+ * __printk_cpu_sync_try_get:B of the next CPU
*/
- atomic_set_release(&printk_cpulock_owner,
- -1); /* LMM(__printk_cpu_unlock:B) */
+ atomic_set_release(&printk_cpu_sync_owner,
+ -1); /* LMM(__printk_cpu_sync_put:B) */
}
-EXPORT_SYMBOL(__printk_cpu_unlock);
+EXPORT_SYMBOL(__printk_cpu_sync_put);
#endif /* CONFIG_SMP */
diff --git a/kernel/ptrace.c b/kernel/ptrace.c
index ccc4b465775b..156a99283b11 100644
--- a/kernel/ptrace.c
+++ b/kernel/ptrace.c
@@ -185,7 +185,12 @@ static bool looks_like_a_spurious_pid(struct task_struct *task)
return true;
}
-/* Ensure that nothing can wake it up, even SIGKILL */
+/*
+ * Ensure that nothing can wake it up, even SIGKILL
+ *
+ * A task is switched to this state while a ptrace operation is in progress;
+ * such that the ptrace operation is uninterruptible.
+ */
static bool ptrace_freeze_traced(struct task_struct *task)
{
bool ret = false;
@@ -197,7 +202,7 @@ static bool ptrace_freeze_traced(struct task_struct *task)
spin_lock_irq(&task->sighand->siglock);
if (task_is_traced(task) && !looks_like_a_spurious_pid(task) &&
!__fatal_signal_pending(task)) {
- WRITE_ONCE(task->__state, __TASK_TRACED);
+ task->jobctl |= JOBCTL_PTRACE_FROZEN;
ret = true;
}
spin_unlock_irq(&task->sighand->siglock);
@@ -207,23 +212,21 @@ static bool ptrace_freeze_traced(struct task_struct *task)
static void ptrace_unfreeze_traced(struct task_struct *task)
{
- if (READ_ONCE(task->__state) != __TASK_TRACED)
- return;
-
- WARN_ON(!task->ptrace || task->parent != current);
+ unsigned long flags;
/*
- * PTRACE_LISTEN can allow ptrace_trap_notify to wake us up remotely.
- * Recheck state under the lock to close this race.
+ * The child may be awake and may have cleared
+ * JOBCTL_PTRACE_FROZEN (see ptrace_resume). The child will
+ * not set JOBCTL_PTRACE_FROZEN or enter __TASK_TRACED anew.
*/
- spin_lock_irq(&task->sighand->siglock);
- if (READ_ONCE(task->__state) == __TASK_TRACED) {
- if (__fatal_signal_pending(task))
+ if (lock_task_sighand(task, &flags)) {
+ task->jobctl &= ~JOBCTL_PTRACE_FROZEN;
+ if (__fatal_signal_pending(task)) {
+ task->jobctl &= ~TASK_TRACED;
wake_up_state(task, __TASK_TRACED);
- else
- WRITE_ONCE(task->__state, TASK_TRACED);
+ }
+ unlock_task_sighand(task, &flags);
}
- spin_unlock_irq(&task->sighand->siglock);
}
/**
@@ -256,7 +259,6 @@ static int ptrace_check_attach(struct task_struct *child, bool ignore_state)
*/
read_lock(&tasklist_lock);
if (child->ptrace && child->parent == current) {
- WARN_ON(READ_ONCE(child->__state) == __TASK_TRACED);
/*
* child->sighand can't be NULL, release_task()
* does ptrace_unlink() before __exit_signal().
@@ -266,17 +268,9 @@ static int ptrace_check_attach(struct task_struct *child, bool ignore_state)
}
read_unlock(&tasklist_lock);
- if (!ret && !ignore_state) {
- if (!wait_task_inactive(child, __TASK_TRACED)) {
- /*
- * This can only happen if may_ptrace_stop() fails and
- * ptrace_stop() changes ->state back to TASK_RUNNING,
- * so we should not worry about leaking __TASK_TRACED.
- */
- WARN_ON(READ_ONCE(child->__state) == __TASK_TRACED);
- ret = -ESRCH;
- }
- }
+ if (!ret && !ignore_state &&
+ WARN_ON_ONCE(!wait_task_inactive(child, __TASK_TRACED)))
+ ret = -ESRCH;
return ret;
}
@@ -475,8 +469,10 @@ static int ptrace_attach(struct task_struct *task, long request,
* in and out of STOPPED are protected by siglock.
*/
if (task_is_stopped(task) &&
- task_set_jobctl_pending(task, JOBCTL_TRAP_STOP | JOBCTL_TRAPPING))
+ task_set_jobctl_pending(task, JOBCTL_TRAP_STOP | JOBCTL_TRAPPING)) {
+ task->jobctl &= ~JOBCTL_STOPPED;
signal_wake_up_state(task, __TASK_STOPPED);
+ }
spin_unlock(&task->sighand->siglock);
@@ -829,11 +825,7 @@ static long ptrace_get_rseq_configuration(struct task_struct *task,
}
#endif
-#ifdef PTRACE_SINGLESTEP
#define is_singlestep(request) ((request) == PTRACE_SINGLESTEP)
-#else
-#define is_singlestep(request) 0
-#endif
#ifdef PTRACE_SINGLEBLOCK
#define is_singleblock(request) ((request) == PTRACE_SINGLEBLOCK)
@@ -850,8 +842,6 @@ static long ptrace_get_rseq_configuration(struct task_struct *task,
static int ptrace_resume(struct task_struct *child, long request,
unsigned long data)
{
- bool need_siglock;
-
if (!valid_signal(data))
return -EIO;
@@ -887,18 +877,12 @@ static int ptrace_resume(struct task_struct *child, long request,
* Note that we need siglock even if ->exit_code == data and/or this
* status was not reported yet, the new status must not be cleared by
* wait_task_stopped() after resume.
- *
- * If data == 0 we do not care if wait_task_stopped() reports the old
- * status and clears the code too; this can't race with the tracee, it
- * takes siglock after resume.
*/
- need_siglock = data && !thread_group_empty(current);
- if (need_siglock)
- spin_lock_irq(&child->sighand->siglock);
+ spin_lock_irq(&child->sighand->siglock);
child->exit_code = data;
+ child->jobctl &= ~JOBCTL_TRACED;
wake_up_state(child, __TASK_TRACED);
- if (need_siglock)
- spin_unlock_irq(&child->sighand->siglock);
+ spin_unlock_irq(&child->sighand->siglock);
return 0;
}
@@ -1221,9 +1205,7 @@ int ptrace_request(struct task_struct *child, long request,
}
#endif
-#ifdef PTRACE_SINGLESTEP
case PTRACE_SINGLESTEP:
-#endif
#ifdef PTRACE_SINGLEBLOCK
case PTRACE_SINGLEBLOCK:
#endif
@@ -1236,9 +1218,8 @@ int ptrace_request(struct task_struct *child, long request,
return ptrace_resume(child, request, data);
case PTRACE_KILL:
- if (child->exit_state) /* already dead */
- return 0;
- return ptrace_resume(child, request, SIGKILL);
+ send_sig_info(SIGKILL, SEND_SIG_NOINFO, child);
+ return 0;
#ifdef CONFIG_HAVE_ARCH_TRACEHOOK
case PTRACE_GETREGSET:
@@ -1285,10 +1266,6 @@ int ptrace_request(struct task_struct *child, long request,
return ret;
}
-#ifndef arch_ptrace_attach
-#define arch_ptrace_attach(child) do { } while (0)
-#endif
-
SYSCALL_DEFINE4(ptrace, long, request, long, pid, unsigned long, addr,
unsigned long, data)
{
@@ -1297,8 +1274,6 @@ SYSCALL_DEFINE4(ptrace, long, request, long, pid, unsigned long, addr,
if (request == PTRACE_TRACEME) {
ret = ptrace_traceme();
- if (!ret)
- arch_ptrace_attach(current);
goto out;
}
@@ -1310,12 +1285,6 @@ SYSCALL_DEFINE4(ptrace, long, request, long, pid, unsigned long, addr,
if (request == PTRACE_ATTACH || request == PTRACE_SEIZE) {
ret = ptrace_attach(child, request, addr, data);
- /*
- * Some architectures need to do book-keeping after
- * a ptrace attach.
- */
- if (!ret)
- arch_ptrace_attach(child);
goto out_put_task_struct;
}
@@ -1455,12 +1424,6 @@ COMPAT_SYSCALL_DEFINE4(ptrace, compat_long_t, request, compat_long_t, pid,
if (request == PTRACE_ATTACH || request == PTRACE_SEIZE) {
ret = ptrace_attach(child, request, addr, data);
- /*
- * Some architectures need to do book-keeping after
- * a ptrace attach.
- */
- if (!ret)
- arch_ptrace_attach(child);
goto out_put_task_struct;
}
diff --git a/kernel/rcu/rcu.h b/kernel/rcu/rcu.h
index 152492d52715..4916077119f3 100644
--- a/kernel/rcu/rcu.h
+++ b/kernel/rcu/rcu.h
@@ -23,6 +23,8 @@
#define RCU_SEQ_CTR_SHIFT 2
#define RCU_SEQ_STATE_MASK ((1 << RCU_SEQ_CTR_SHIFT) - 1)
+extern int sysctl_sched_rt_runtime;
+
/*
* Return the counter portion of a sequence number previously returned
* by rcu_seq_snap() or rcu_seq_current().
diff --git a/kernel/rcu/tree_stall.h b/kernel/rcu/tree_stall.h
index a001e1e7a992..4995c078cff9 100644
--- a/kernel/rcu/tree_stall.h
+++ b/kernel/rcu/tree_stall.h
@@ -647,6 +647,7 @@ static void print_cpu_stall(unsigned long gps)
* See Documentation/RCU/stallwarn.rst for info on how to debug
* RCU CPU stall warnings.
*/
+ printk_prefer_direct_enter();
trace_rcu_stall_warning(rcu_state.name, TPS("SelfDetected"));
pr_err("INFO: %s self-detected stall on CPU\n", rcu_state.name);
raw_spin_lock_irqsave_rcu_node(rdp->mynode, flags);
@@ -684,6 +685,7 @@ static void print_cpu_stall(unsigned long gps)
*/
set_tsk_need_resched(current);
set_preempt_need_resched();
+ printk_prefer_direct_exit();
}
static void check_cpu_stall(struct rcu_data *rdp)
diff --git a/kernel/reboot.c b/kernel/reboot.c
index 6bcc5d6a6572..a091145ee710 100644
--- a/kernel/reboot.c
+++ b/kernel/reboot.c
@@ -23,7 +23,7 @@
* this indicates whether you can reboot with ctrl-alt-del: the default is yes
*/
-int C_A_D = 1;
+static int C_A_D = 1;
struct pid *cad_pid;
EXPORT_SYMBOL(cad_pid);
@@ -48,12 +48,20 @@ int reboot_cpu;
enum reboot_type reboot_type = BOOT_ACPI;
int reboot_force;
+struct sys_off_handler {
+ struct notifier_block nb;
+ int (*sys_off_cb)(struct sys_off_data *data);
+ void *cb_data;
+ enum sys_off_mode mode;
+ bool blocking;
+ void *list;
+};
+
/*
- * If set, this is used for preparing the system to power off.
+ * Temporary stub that prevents linkage failure while we're in process
+ * of removing all uses of legacy pm_power_off() around the kernel.
*/
-
-void (*pm_power_off_prepare)(void);
-EXPORT_SYMBOL_GPL(pm_power_off_prepare);
+void __weak (*pm_power_off)(void);
/**
* emergency_restart - reboot the system
@@ -281,6 +289,316 @@ void kernel_halt(void)
}
EXPORT_SYMBOL_GPL(kernel_halt);
+/*
+ * Notifier list for kernel code which wants to be called
+ * to prepare system for power off.
+ */
+static BLOCKING_NOTIFIER_HEAD(power_off_prep_handler_list);
+
+/*
+ * Notifier list for kernel code which wants to be called
+ * to power off system.
+ */
+static ATOMIC_NOTIFIER_HEAD(power_off_handler_list);
+
+static int sys_off_notify(struct notifier_block *nb,
+ unsigned long mode, void *cmd)
+{
+ struct sys_off_handler *handler;
+ struct sys_off_data data = {};
+
+ handler = container_of(nb, struct sys_off_handler, nb);
+ data.cb_data = handler->cb_data;
+ data.mode = mode;
+ data.cmd = cmd;
+
+ return handler->sys_off_cb(&data);
+}
+
+/**
+ * register_sys_off_handler - Register sys-off handler
+ * @mode: Sys-off mode
+ * @priority: Handler priority
+ * @callback: Callback function
+ * @cb_data: Callback argument
+ *
+ * Registers system power-off or restart handler that will be invoked
+ * at the step corresponding to the given sys-off mode. Handler's callback
+ * should return NOTIFY_DONE to permit execution of the next handler in
+ * the call chain or NOTIFY_STOP to break the chain (in error case for
+ * example).
+ *
+ * Multiple handlers can be registered at the default priority level.
+ *
+ * Only one handler can be registered at the non-default priority level,
+ * otherwise ERR_PTR(-EBUSY) is returned.
+ *
+ * Returns a new instance of struct sys_off_handler on success, or
+ * an ERR_PTR()-encoded error code otherwise.
+ */
+struct sys_off_handler *
+register_sys_off_handler(enum sys_off_mode mode,
+ int priority,
+ int (*callback)(struct sys_off_data *data),
+ void *cb_data)
+{
+ struct sys_off_handler *handler;
+ int err;
+
+ handler = kzalloc(sizeof(*handler), GFP_KERNEL);
+ if (!handler)
+ return ERR_PTR(-ENOMEM);
+
+ switch (mode) {
+ case SYS_OFF_MODE_POWER_OFF_PREPARE:
+ handler->list = &power_off_prep_handler_list;
+ handler->blocking = true;
+ break;
+
+ case SYS_OFF_MODE_POWER_OFF:
+ handler->list = &power_off_handler_list;
+ break;
+
+ case SYS_OFF_MODE_RESTART:
+ handler->list = &restart_handler_list;
+ break;
+
+ default:
+ kfree(handler);
+ return ERR_PTR(-EINVAL);
+ }
+
+ handler->nb.notifier_call = sys_off_notify;
+ handler->nb.priority = priority;
+ handler->sys_off_cb = callback;
+ handler->cb_data = cb_data;
+ handler->mode = mode;
+
+ if (handler->blocking) {
+ if (priority == SYS_OFF_PRIO_DEFAULT)
+ err = blocking_notifier_chain_register(handler->list,
+ &handler->nb);
+ else
+ err = blocking_notifier_chain_register_unique_prio(handler->list,
+ &handler->nb);
+ } else {
+ if (priority == SYS_OFF_PRIO_DEFAULT)
+ err = atomic_notifier_chain_register(handler->list,
+ &handler->nb);
+ else
+ err = atomic_notifier_chain_register_unique_prio(handler->list,
+ &handler->nb);
+ }
+
+ if (err) {
+ kfree(handler);
+ return ERR_PTR(err);
+ }
+
+ return handler;
+}
+EXPORT_SYMBOL_GPL(register_sys_off_handler);
+
+/**
+ * unregister_sys_off_handler - Unregister sys-off handler
+ * @handler: Sys-off handler
+ *
+ * Unregisters given sys-off handler.
+ */
+void unregister_sys_off_handler(struct sys_off_handler *handler)
+{
+ int err;
+
+ if (!handler)
+ return;
+
+ if (handler->blocking)
+ err = blocking_notifier_chain_unregister(handler->list,
+ &handler->nb);
+ else
+ err = atomic_notifier_chain_unregister(handler->list,
+ &handler->nb);
+
+ /* sanity check, shall never happen */
+ WARN_ON(err);
+
+ kfree(handler);
+}
+EXPORT_SYMBOL_GPL(unregister_sys_off_handler);
+
+static void devm_unregister_sys_off_handler(void *data)
+{
+ struct sys_off_handler *handler = data;
+
+ unregister_sys_off_handler(handler);
+}
+
+/**
+ * devm_register_sys_off_handler - Register sys-off handler
+ * @dev: Device that registers handler
+ * @mode: Sys-off mode
+ * @priority: Handler priority
+ * @callback: Callback function
+ * @cb_data: Callback argument
+ *
+ * Registers resource-managed sys-off handler.
+ *
+ * Returns zero on success, or error code on failure.
+ */
+int devm_register_sys_off_handler(struct device *dev,
+ enum sys_off_mode mode,
+ int priority,
+ int (*callback)(struct sys_off_data *data),
+ void *cb_data)
+{
+ struct sys_off_handler *handler;
+
+ handler = register_sys_off_handler(mode, priority, callback, cb_data);
+ if (IS_ERR(handler))
+ return PTR_ERR(handler);
+
+ return devm_add_action_or_reset(dev, devm_unregister_sys_off_handler,
+ handler);
+}
+EXPORT_SYMBOL_GPL(devm_register_sys_off_handler);
+
+/**
+ * devm_register_power_off_handler - Register power-off handler
+ * @dev: Device that registers callback
+ * @callback: Callback function
+ * @cb_data: Callback's argument
+ *
+ * Registers resource-managed sys-off handler with a default priority
+ * and using power-off mode.
+ *
+ * Returns zero on success, or error code on failure.
+ */
+int devm_register_power_off_handler(struct device *dev,
+ int (*callback)(struct sys_off_data *data),
+ void *cb_data)
+{
+ return devm_register_sys_off_handler(dev,
+ SYS_OFF_MODE_POWER_OFF,
+ SYS_OFF_PRIO_DEFAULT,
+ callback, cb_data);
+}
+EXPORT_SYMBOL_GPL(devm_register_power_off_handler);
+
+/**
+ * devm_register_restart_handler - Register restart handler
+ * @dev: Device that registers callback
+ * @callback: Callback function
+ * @cb_data: Callback's argument
+ *
+ * Registers resource-managed sys-off handler with a default priority
+ * and using restart mode.
+ *
+ * Returns zero on success, or error code on failure.
+ */
+int devm_register_restart_handler(struct device *dev,
+ int (*callback)(struct sys_off_data *data),
+ void *cb_data)
+{
+ return devm_register_sys_off_handler(dev,
+ SYS_OFF_MODE_RESTART,
+ SYS_OFF_PRIO_DEFAULT,
+ callback, cb_data);
+}
+EXPORT_SYMBOL_GPL(devm_register_restart_handler);
+
+static struct sys_off_handler *platform_power_off_handler;
+
+static int platform_power_off_notify(struct sys_off_data *data)
+{
+ void (*platform_power_power_off_cb)(void) = data->cb_data;
+
+ platform_power_power_off_cb();
+
+ return NOTIFY_DONE;
+}
+
+/**
+ * register_platform_power_off - Register platform-level power-off callback
+ * @power_off: Power-off callback
+ *
+ * Registers power-off callback that will be called as last step
+ * of the power-off sequence. This callback is expected to be invoked
+ * for the last resort. Only one platform power-off callback is allowed
+ * to be registered at a time.
+ *
+ * Returns zero on success, or error code on failure.
+ */
+int register_platform_power_off(void (*power_off)(void))
+{
+ struct sys_off_handler *handler;
+
+ handler = register_sys_off_handler(SYS_OFF_MODE_POWER_OFF,
+ SYS_OFF_PRIO_PLATFORM,
+ platform_power_off_notify,
+ power_off);
+ if (IS_ERR(handler))
+ return PTR_ERR(handler);
+
+ platform_power_off_handler = handler;
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(register_platform_power_off);
+
+/**
+ * unregister_platform_power_off - Unregister platform-level power-off callback
+ * @power_off: Power-off callback
+ *
+ * Unregisters previously registered platform power-off callback.
+ */
+void unregister_platform_power_off(void (*power_off)(void))
+{
+ if (platform_power_off_handler &&
+ platform_power_off_handler->cb_data == power_off) {
+ unregister_sys_off_handler(platform_power_off_handler);
+ platform_power_off_handler = NULL;
+ }
+}
+EXPORT_SYMBOL_GPL(unregister_platform_power_off);
+
+static int legacy_pm_power_off(struct sys_off_data *data)
+{
+ if (pm_power_off)
+ pm_power_off();
+
+ return NOTIFY_DONE;
+}
+
+static void do_kernel_power_off_prepare(void)
+{
+ blocking_notifier_call_chain(&power_off_prep_handler_list, 0, NULL);
+}
+
+/**
+ * do_kernel_power_off - Execute kernel power-off handler call chain
+ *
+ * Expected to be called as last step of the power-off sequence.
+ *
+ * Powers off the system immediately if a power-off handler function has
+ * been registered. Otherwise does nothing.
+ */
+void do_kernel_power_off(void)
+{
+ atomic_notifier_call_chain(&power_off_handler_list, 0, NULL);
+}
+
+/**
+ * kernel_can_power_off - check whether system can be powered off
+ *
+ * Returns true if power-off handler is registered and system can be
+ * powered off, false otherwise.
+ */
+bool kernel_can_power_off(void)
+{
+ return !atomic_notifier_call_chain_is_empty(&power_off_handler_list);
+}
+EXPORT_SYMBOL_GPL(kernel_can_power_off);
+
/**
* kernel_power_off - power_off the system
*
@@ -289,8 +607,7 @@ EXPORT_SYMBOL_GPL(kernel_halt);
void kernel_power_off(void)
{
kernel_shutdown_prepare(SYSTEM_POWER_OFF);
- if (pm_power_off_prepare)
- pm_power_off_prepare();
+ do_kernel_power_off_prepare();
migrate_to_reboot_cpu();
syscore_shutdown();
pr_emerg("Power down\n");
@@ -313,6 +630,7 @@ SYSCALL_DEFINE4(reboot, int, magic1, int, magic2, unsigned int, cmd,
void __user *, arg)
{
struct pid_namespace *pid_ns = task_active_pid_ns(current);
+ struct sys_off_handler *sys_off = NULL;
char buffer[256];
int ret = 0;
@@ -337,10 +655,25 @@ SYSCALL_DEFINE4(reboot, int, magic1, int, magic2, unsigned int, cmd,
if (ret)
return ret;
+ /*
+ * Register sys-off handlers for legacy PM callback. This allows
+ * legacy PM callbacks temporary co-exist with the new sys-off API.
+ *
+ * TODO: Remove legacy handlers once all legacy PM users will be
+ * switched to the sys-off based APIs.
+ */
+ if (pm_power_off) {
+ sys_off = register_sys_off_handler(SYS_OFF_MODE_POWER_OFF,
+ SYS_OFF_PRIO_DEFAULT,
+ legacy_pm_power_off, NULL);
+ if (IS_ERR(sys_off))
+ return PTR_ERR(sys_off);
+ }
+
/* Instead of trying to make the power_off code look like
* halt when pm_power_off is not set do it the easy way.
*/
- if ((cmd == LINUX_REBOOT_CMD_POWER_OFF) && !pm_power_off)
+ if ((cmd == LINUX_REBOOT_CMD_POWER_OFF) && !kernel_can_power_off())
cmd = LINUX_REBOOT_CMD_HALT;
mutex_lock(&system_transition_mutex);
@@ -394,6 +727,7 @@ SYSCALL_DEFINE4(reboot, int, magic1, int, magic2, unsigned int, cmd,
break;
}
mutex_unlock(&system_transition_mutex);
+ unregister_sys_off_handler(sys_off);
return ret;
}
@@ -417,7 +751,8 @@ void ctrl_alt_del(void)
kill_cad_pid(SIGINT, 1);
}
-char poweroff_cmd[POWEROFF_CMD_PATH_LEN] = "/sbin/poweroff";
+#define POWEROFF_CMD_PATH_LEN 256
+static char poweroff_cmd[POWEROFF_CMD_PATH_LEN] = "/sbin/poweroff";
static const char reboot_cmd[] = "/sbin/reboot";
static int run_cmd(const char *cmd)
@@ -447,9 +782,11 @@ static int __orderly_reboot(void)
ret = run_cmd(reboot_cmd);
if (ret) {
+ printk_prefer_direct_enter();
pr_warn("Failed to start orderly reboot: forcing the issue\n");
emergency_sync();
kernel_restart(NULL);
+ printk_prefer_direct_exit();
}
return ret;
@@ -462,6 +799,7 @@ static int __orderly_poweroff(bool force)
ret = run_cmd(poweroff_cmd);
if (ret && force) {
+ printk_prefer_direct_enter();
pr_warn("Failed to start orderly shutdown: forcing the issue\n");
/*
@@ -471,6 +809,7 @@ static int __orderly_poweroff(bool force)
*/
emergency_sync();
kernel_power_off();
+ printk_prefer_direct_exit();
}
return ret;
@@ -528,6 +867,8 @@ EXPORT_SYMBOL_GPL(orderly_reboot);
*/
static void hw_failure_emergency_poweroff_func(struct work_struct *work)
{
+ printk_prefer_direct_enter();
+
/*
* We have reached here after the emergency shutdown waiting period has
* expired. This means orderly_poweroff has not been able to shut off
@@ -544,6 +885,8 @@ static void hw_failure_emergency_poweroff_func(struct work_struct *work)
*/
pr_emerg("Hardware protection shutdown failed. Trying emergency restart\n");
emergency_restart();
+
+ printk_prefer_direct_exit();
}
static DECLARE_DELAYED_WORK(hw_failure_emergency_poweroff_work,
@@ -582,11 +925,13 @@ void hw_protection_shutdown(const char *reason, int ms_until_forced)
{
static atomic_t allow_proceed = ATOMIC_INIT(1);
+ printk_prefer_direct_enter();
+
pr_emerg("HARDWARE PROTECTION shutdown (%s)\n", reason);
/* Shutdown should be initiated only once. */
if (!atomic_dec_and_test(&allow_proceed))
- return;
+ goto out;
/*
* Queue a backup emergency shutdown in the event of
@@ -594,6 +939,8 @@ void hw_protection_shutdown(const char *reason, int ms_until_forced)
*/
hw_failure_emergency_poweroff(ms_until_forced);
orderly_poweroff(true);
+out:
+ printk_prefer_direct_exit();
}
EXPORT_SYMBOL_GPL(hw_protection_shutdown);
@@ -867,6 +1214,33 @@ static struct attribute *reboot_attrs[] = {
NULL,
};
+#ifdef CONFIG_SYSCTL
+static struct ctl_table kern_reboot_table[] = {
+ {
+ .procname = "poweroff_cmd",
+ .data = &poweroff_cmd,
+ .maxlen = POWEROFF_CMD_PATH_LEN,
+ .mode = 0644,
+ .proc_handler = proc_dostring,
+ },
+ {
+ .procname = "ctrl-alt-del",
+ .data = &C_A_D,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+ },
+ { }
+};
+
+static void __init kernel_reboot_sysctls_init(void)
+{
+ register_sysctl_init("kernel", kern_reboot_table);
+}
+#else
+#define kernel_reboot_sysctls_init() do { } while (0)
+#endif /* CONFIG_SYSCTL */
+
static const struct attribute_group reboot_attr_group = {
.attrs = reboot_attrs,
};
@@ -886,6 +1260,8 @@ static int __init reboot_ksysfs_init(void)
return ret;
}
+ kernel_reboot_sysctls_init();
+
return 0;
}
late_initcall(reboot_ksysfs_init);
diff --git a/kernel/relay.c b/kernel/relay.c
index d1a67fbb819d..6a611e779e95 100644
--- a/kernel/relay.c
+++ b/kernel/relay.c
@@ -440,7 +440,7 @@ int relay_prepare_cpu(unsigned int cpu)
mutex_lock(&relay_channels_mutex);
list_for_each_entry(chan, &relay_channels, list) {
- if ((buf = *per_cpu_ptr(chan->buf, cpu)))
+ if (*per_cpu_ptr(chan->buf, cpu))
continue;
buf = relay_open_buf(chan, cpu);
if (!buf) {
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index a247f8d9d417..bfa7452ca92e 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -148,12 +148,6 @@ const_debug unsigned int sysctl_sched_nr_migrate = 8;
const_debug unsigned int sysctl_sched_nr_migrate = 32;
#endif
-/*
- * period over which we measure -rt task CPU usage in us.
- * default: 1s
- */
-unsigned int sysctl_sched_rt_period = 1000000;
-
__read_mostly int scheduler_running;
#ifdef CONFIG_SCHED_CORE
@@ -448,13 +442,6 @@ sched_core_dequeue(struct rq *rq, struct task_struct *p, int flags) { }
#endif /* CONFIG_SCHED_CORE */
/*
- * part of the period that we allow rt tasks to run in us.
- * default: 0.95s
- */
-int sysctl_sched_rt_runtime = 950000;
-
-
-/*
* Serialization rules:
*
* Lock order:
@@ -1322,10 +1309,10 @@ static void set_load_weight(struct task_struct *p, bool update_load)
static DEFINE_MUTEX(uclamp_mutex);
/* Max allowed minimum utilization */
-unsigned int sysctl_sched_uclamp_util_min = SCHED_CAPACITY_SCALE;
+static unsigned int __maybe_unused sysctl_sched_uclamp_util_min = SCHED_CAPACITY_SCALE;
/* Max allowed maximum utilization */
-unsigned int sysctl_sched_uclamp_util_max = SCHED_CAPACITY_SCALE;
+static unsigned int __maybe_unused sysctl_sched_uclamp_util_max = SCHED_CAPACITY_SCALE;
/*
* By default RT tasks run at the maximum performance point/capacity of the
@@ -1342,7 +1329,7 @@ unsigned int sysctl_sched_uclamp_util_max = SCHED_CAPACITY_SCALE;
* This knob will not override the system default sched_util_clamp_min defined
* above.
*/
-unsigned int sysctl_sched_uclamp_util_min_rt_default = SCHED_CAPACITY_SCALE;
+static unsigned int sysctl_sched_uclamp_util_min_rt_default = SCHED_CAPACITY_SCALE;
/* All clamps are required to be less or equal than these values */
static struct uclamp_se uclamp_default[UCLAMP_CNT];
@@ -1472,33 +1459,6 @@ static void uclamp_update_util_min_rt_default(struct task_struct *p)
task_rq_unlock(rq, p, &rf);
}
-static void uclamp_sync_util_min_rt_default(void)
-{
- struct task_struct *g, *p;
-
- /*
- * copy_process() sysctl_uclamp
- * uclamp_min_rt = X;
- * write_lock(&tasklist_lock) read_lock(&tasklist_lock)
- * // link thread smp_mb__after_spinlock()
- * write_unlock(&tasklist_lock) read_unlock(&tasklist_lock);
- * sched_post_fork() for_each_process_thread()
- * __uclamp_sync_rt() __uclamp_sync_rt()
- *
- * Ensures that either sched_post_fork() will observe the new
- * uclamp_min_rt or for_each_process_thread() will observe the new
- * task.
- */
- read_lock(&tasklist_lock);
- smp_mb__after_spinlock();
- read_unlock(&tasklist_lock);
-
- rcu_read_lock();
- for_each_process_thread(g, p)
- uclamp_update_util_min_rt_default(p);
- rcu_read_unlock();
-}
-
static inline struct uclamp_se
uclamp_tg_restrict(struct task_struct *p, enum uclamp_id clamp_id)
{
@@ -1778,6 +1738,11 @@ uclamp_update_active_tasks(struct cgroup_subsys_state *css)
}
static void cpu_util_update_eff(struct cgroup_subsys_state *css);
+#endif
+
+#ifdef CONFIG_SYSCTL
+#ifdef CONFIG_UCLAMP_TASK
+#ifdef CONFIG_UCLAMP_TASK_GROUP
static void uclamp_update_root_tg(void)
{
struct task_group *tg = &root_task_group;
@@ -1795,7 +1760,34 @@ static void uclamp_update_root_tg(void)
static void uclamp_update_root_tg(void) { }
#endif
-int sysctl_sched_uclamp_handler(struct ctl_table *table, int write,
+static void uclamp_sync_util_min_rt_default(void)
+{
+ struct task_struct *g, *p;
+
+ /*
+ * copy_process() sysctl_uclamp
+ * uclamp_min_rt = X;
+ * write_lock(&tasklist_lock) read_lock(&tasklist_lock)
+ * // link thread smp_mb__after_spinlock()
+ * write_unlock(&tasklist_lock) read_unlock(&tasklist_lock);
+ * sched_post_fork() for_each_process_thread()
+ * __uclamp_sync_rt() __uclamp_sync_rt()
+ *
+ * Ensures that either sched_post_fork() will observe the new
+ * uclamp_min_rt or for_each_process_thread() will observe the new
+ * task.
+ */
+ read_lock(&tasklist_lock);
+ smp_mb__after_spinlock();
+ read_unlock(&tasklist_lock);
+
+ rcu_read_lock();
+ for_each_process_thread(g, p)
+ uclamp_update_util_min_rt_default(p);
+ rcu_read_unlock();
+}
+
+static int sysctl_sched_uclamp_handler(struct ctl_table *table, int write,
void *buffer, size_t *lenp, loff_t *ppos)
{
bool update_root_tg = false;
@@ -1859,6 +1851,8 @@ done:
return result;
}
+#endif
+#endif
static int uclamp_validate(struct task_struct *p,
const struct sched_attr *attr)
@@ -4433,7 +4427,7 @@ out:
__setup("schedstats=", setup_schedstats);
#ifdef CONFIG_PROC_SYSCTL
-int sysctl_schedstats(struct ctl_table *table, int write, void *buffer,
+static int sysctl_schedstats(struct ctl_table *table, int write, void *buffer,
size_t *lenp, loff_t *ppos)
{
struct ctl_table t;
@@ -4455,6 +4449,52 @@ int sysctl_schedstats(struct ctl_table *table, int write, void *buffer,
#endif /* CONFIG_PROC_SYSCTL */
#endif /* CONFIG_SCHEDSTATS */
+#ifdef CONFIG_SYSCTL
+static struct ctl_table sched_core_sysctls[] = {
+#ifdef CONFIG_SCHEDSTATS
+ {
+ .procname = "sched_schedstats",
+ .data = NULL,
+ .maxlen = sizeof(unsigned int),
+ .mode = 0644,
+ .proc_handler = sysctl_schedstats,
+ .extra1 = SYSCTL_ZERO,
+ .extra2 = SYSCTL_ONE,
+ },
+#endif /* CONFIG_SCHEDSTATS */
+#ifdef CONFIG_UCLAMP_TASK
+ {
+ .procname = "sched_util_clamp_min",
+ .data = &sysctl_sched_uclamp_util_min,
+ .maxlen = sizeof(unsigned int),
+ .mode = 0644,
+ .proc_handler = sysctl_sched_uclamp_handler,
+ },
+ {
+ .procname = "sched_util_clamp_max",
+ .data = &sysctl_sched_uclamp_util_max,
+ .maxlen = sizeof(unsigned int),
+ .mode = 0644,
+ .proc_handler = sysctl_sched_uclamp_handler,
+ },
+ {
+ .procname = "sched_util_clamp_min_rt_default",
+ .data = &sysctl_sched_uclamp_util_min_rt_default,
+ .maxlen = sizeof(unsigned int),
+ .mode = 0644,
+ .proc_handler = sysctl_sched_uclamp_handler,
+ },
+#endif /* CONFIG_UCLAMP_TASK */
+ {}
+};
+static int __init sched_core_sysctl_init(void)
+{
+ register_sysctl_init("kernel", sched_core_sysctls);
+ return 0;
+}
+late_initcall(sched_core_sysctl_init);
+#endif /* CONFIG_SYSCTL */
+
/*
* fork()/clone()-time setup:
*/
@@ -6313,10 +6353,7 @@ static void __sched notrace __schedule(unsigned int sched_mode)
/*
* We must load prev->state once (task_struct::state is volatile), such
- * that:
- *
- * - we form a control dependency vs deactivate_task() below.
- * - ptrace_{,un}freeze_traced() can change ->state underneath us.
+ * that we form a control dependency vs deactivate_task() below.
*/
prev_state = READ_ONCE(prev->__state);
if (!(sched_mode & SM_MASK_PREEMPT) && prev_state) {
diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
index 936817ae142f..b5152961b743 100644
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -16,6 +16,40 @@
* Fabio Checconi <fchecconi@gmail.com>
*/
+/*
+ * Default limits for DL period; on the top end we guard against small util
+ * tasks still getting ridiculously long effective runtimes, on the bottom end we
+ * guard against timer DoS.
+ */
+static unsigned int sysctl_sched_dl_period_max = 1 << 22; /* ~4 seconds */
+static unsigned int sysctl_sched_dl_period_min = 100; /* 100 us */
+#ifdef CONFIG_SYSCTL
+static struct ctl_table sched_dl_sysctls[] = {
+ {
+ .procname = "sched_deadline_period_max_us",
+ .data = &sysctl_sched_dl_period_max,
+ .maxlen = sizeof(unsigned int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+ },
+ {
+ .procname = "sched_deadline_period_min_us",
+ .data = &sysctl_sched_dl_period_min,
+ .maxlen = sizeof(unsigned int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+ },
+ {}
+};
+
+static int __init sched_dl_sysctl_init(void)
+{
+ register_sysctl_init("kernel", sched_dl_sysctls);
+ return 0;
+}
+late_initcall(sched_dl_sysctl_init);
+#endif
+
static inline struct task_struct *dl_task_of(struct sched_dl_entity *dl_se)
{
return container_of(dl_se, struct task_struct, dl);
@@ -2873,14 +2907,6 @@ void __getparam_dl(struct task_struct *p, struct sched_attr *attr)
}
/*
- * Default limits for DL period; on the top end we guard against small util
- * tasks still getting ridiculously long effective runtimes, on the bottom end we
- * guard against timer DoS.
- */
-unsigned int sysctl_sched_dl_period_max = 1 << 22; /* ~4 seconds */
-unsigned int sysctl_sched_dl_period_min = 100; /* 100 us */
-
-/*
* This function validates the new parameters of a -deadline task.
* We ask for the deadline not being zero, and greater or equal
* than the runtime, as well as the period of being zero or
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 906b2c7c48d1..77b2048a9326 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -174,7 +174,37 @@ int __weak arch_asym_cpu_priority(int cpu)
*
* (default: 5 msec, units: microseconds)
*/
-unsigned int sysctl_sched_cfs_bandwidth_slice = 5000UL;
+static unsigned int sysctl_sched_cfs_bandwidth_slice = 5000UL;
+#endif
+
+#ifdef CONFIG_SYSCTL
+static struct ctl_table sched_fair_sysctls[] = {
+ {
+ .procname = "sched_child_runs_first",
+ .data = &sysctl_sched_child_runs_first,
+ .maxlen = sizeof(unsigned int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+ },
+#ifdef CONFIG_CFS_BANDWIDTH
+ {
+ .procname = "sched_cfs_bandwidth_slice_us",
+ .data = &sysctl_sched_cfs_bandwidth_slice,
+ .maxlen = sizeof(unsigned int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = SYSCTL_ONE,
+ },
+#endif
+ {}
+};
+
+static int __init sched_fair_sysctl_init(void)
+{
+ register_sysctl_init("kernel", sched_fair_sysctls);
+ return 0;
+}
+late_initcall(sched_fair_sysctl_init);
#endif
static inline void update_load_add(struct load_weight *lw, unsigned long inc)
@@ -2897,7 +2927,7 @@ static void task_tick_numa(struct rq *rq, struct task_struct *curr)
/*
* We don't care about NUMA placement if we don't have memory.
*/
- if ((curr->flags & (PF_EXITING | PF_KTHREAD)) || work->next != work)
+ if (!curr->mm || (curr->flags & (PF_EXITING | PF_KTHREAD)) || work->next != work)
return;
/*
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
index 7891c0f0e1ff..8c9ed9664840 100644
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -5,7 +5,6 @@
*/
int sched_rr_timeslice = RR_TIMESLICE;
-int sysctl_sched_rr_timeslice = (MSEC_PER_SEC / HZ) * RR_TIMESLICE;
/* More than 4 hours if BW_SHIFT equals 20. */
static const u64 max_rt_runtime = MAX_BW;
@@ -13,6 +12,57 @@ static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun);
struct rt_bandwidth def_rt_bandwidth;
+/*
+ * period over which we measure -rt task CPU usage in us.
+ * default: 1s
+ */
+unsigned int sysctl_sched_rt_period = 1000000;
+
+/*
+ * part of the period that we allow rt tasks to run in us.
+ * default: 0.95s
+ */
+int sysctl_sched_rt_runtime = 950000;
+
+#ifdef CONFIG_SYSCTL
+static int sysctl_sched_rr_timeslice = (MSEC_PER_SEC / HZ) * RR_TIMESLICE;
+static int sched_rt_handler(struct ctl_table *table, int write, void *buffer,
+ size_t *lenp, loff_t *ppos);
+static int sched_rr_handler(struct ctl_table *table, int write, void *buffer,
+ size_t *lenp, loff_t *ppos);
+static struct ctl_table sched_rt_sysctls[] = {
+ {
+ .procname = "sched_rt_period_us",
+ .data = &sysctl_sched_rt_period,
+ .maxlen = sizeof(unsigned int),
+ .mode = 0644,
+ .proc_handler = sched_rt_handler,
+ },
+ {
+ .procname = "sched_rt_runtime_us",
+ .data = &sysctl_sched_rt_runtime,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = sched_rt_handler,
+ },
+ {
+ .procname = "sched_rr_timeslice_ms",
+ .data = &sysctl_sched_rr_timeslice,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = sched_rr_handler,
+ },
+ {}
+};
+
+static int __init sched_rt_sysctl_init(void)
+{
+ register_sysctl_init("kernel", sched_rt_sysctls);
+ return 0;
+}
+late_initcall(sched_rt_sysctl_init);
+#endif
+
static enum hrtimer_restart sched_rt_period_timer(struct hrtimer *timer)
{
struct rt_bandwidth *rt_b =
@@ -2862,6 +2912,7 @@ long sched_group_rt_period(struct task_group *tg)
return rt_period_us;
}
+#ifdef CONFIG_SYSCTL
static int sched_rt_global_constraints(void)
{
int ret = 0;
@@ -2872,6 +2923,7 @@ static int sched_rt_global_constraints(void)
return ret;
}
+#endif /* CONFIG_SYSCTL */
int sched_rt_can_attach(struct task_group *tg, struct task_struct *tsk)
{
@@ -2883,6 +2935,8 @@ int sched_rt_can_attach(struct task_group *tg, struct task_struct *tsk)
}
#else /* !CONFIG_RT_GROUP_SCHED */
+
+#ifdef CONFIG_SYSCTL
static int sched_rt_global_constraints(void)
{
unsigned long flags;
@@ -2900,8 +2954,10 @@ static int sched_rt_global_constraints(void)
return 0;
}
+#endif /* CONFIG_SYSCTL */
#endif /* CONFIG_RT_GROUP_SCHED */
+#ifdef CONFIG_SYSCTL
static int sched_rt_global_validate(void)
{
if (sysctl_sched_rt_period <= 0)
@@ -2926,7 +2982,7 @@ static void sched_rt_do_global(void)
raw_spin_unlock_irqrestore(&def_rt_bandwidth.rt_runtime_lock, flags);
}
-int sched_rt_handler(struct ctl_table *table, int write, void *buffer,
+static int sched_rt_handler(struct ctl_table *table, int write, void *buffer,
size_t *lenp, loff_t *ppos)
{
int old_period, old_runtime;
@@ -2965,7 +3021,7 @@ undo:
return ret;
}
-int sched_rr_handler(struct ctl_table *table, int write, void *buffer,
+static int sched_rr_handler(struct ctl_table *table, int write, void *buffer,
size_t *lenp, loff_t *ppos)
{
int ret;
@@ -2986,6 +3042,7 @@ int sched_rr_handler(struct ctl_table *table, int write, void *buffer,
return ret;
}
+#endif /* CONFIG_SYSCTL */
#ifdef CONFIG_SCHED_DEBUG
void print_rt_stats(struct seq_file *m, int cpu)
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 2ce18584dca3..01259611beb9 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -108,10 +108,17 @@ extern __read_mostly int scheduler_running;
extern unsigned long calc_load_update;
extern atomic_long_t calc_load_tasks;
+extern unsigned int sysctl_sched_child_runs_first;
+
extern void calc_global_load_tick(struct rq *this_rq);
extern long calc_load_fold_active(struct rq *this_rq, long adjust);
extern void call_trace_sched_update_nr_running(struct rq *rq, int count);
+
+extern unsigned int sysctl_sched_rt_period;
+extern int sysctl_sched_rt_runtime;
+extern int sched_rr_timeslice;
+
/*
* Helpers for converting nanosecond timing to jiffy resolution
*/
diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c
index 810750e62118..05b6c2ad90b9 100644
--- a/kernel/sched/topology.c
+++ b/kernel/sched/topology.c
@@ -206,7 +206,7 @@ sd_parent_degenerate(struct sched_domain *sd, struct sched_domain *parent)
#if defined(CONFIG_ENERGY_MODEL) && defined(CONFIG_CPU_FREQ_GOV_SCHEDUTIL)
DEFINE_STATIC_KEY_FALSE(sched_energy_present);
-unsigned int sysctl_sched_energy_aware = 1;
+static unsigned int sysctl_sched_energy_aware = 1;
DEFINE_MUTEX(sched_energy_mutex);
bool sched_energy_update;
@@ -220,7 +220,7 @@ void rebuild_sched_domains_energy(void)
}
#ifdef CONFIG_PROC_SYSCTL
-int sched_energy_aware_handler(struct ctl_table *table, int write,
+static int sched_energy_aware_handler(struct ctl_table *table, int write,
void *buffer, size_t *lenp, loff_t *ppos)
{
int ret, state;
@@ -237,6 +237,27 @@ int sched_energy_aware_handler(struct ctl_table *table, int write,
return ret;
}
+
+static struct ctl_table sched_energy_aware_sysctls[] = {
+ {
+ .procname = "sched_energy_aware",
+ .data = &sysctl_sched_energy_aware,
+ .maxlen = sizeof(unsigned int),
+ .mode = 0644,
+ .proc_handler = sched_energy_aware_handler,
+ .extra1 = SYSCTL_ZERO,
+ .extra2 = SYSCTL_ONE,
+ },
+ {}
+};
+
+static int __init sched_energy_aware_sysctl_init(void)
+{
+ register_sysctl_init("kernel", sched_energy_aware_sysctls);
+ return 0;
+}
+
+late_initcall(sched_energy_aware_sysctl_init);
#endif
static void free_pd(struct perf_domain *pd)
diff --git a/kernel/seccomp.c b/kernel/seccomp.c
index b5ac87f6dbd4..e9852d1b4a5e 100644
--- a/kernel/seccomp.c
+++ b/kernel/seccomp.c
@@ -200,6 +200,8 @@ static inline void seccomp_cache_prepare(struct seccomp_filter *sfilter)
* the filter can be freed.
* @cache: cache of arch/syscall mappings to actions
* @log: true if all actions except for SECCOMP_RET_ALLOW should be logged
+ * @wait_killable_recv: Put notifying process in killable state once the
+ * notification is received by the userspace listener.
* @prev: points to a previously installed, or inherited, filter
* @prog: the BPF program to evaluate
* @notif: the struct that holds all notification related information
@@ -220,6 +222,7 @@ struct seccomp_filter {
refcount_t refs;
refcount_t users;
bool log;
+ bool wait_killable_recv;
struct action_cache cache;
struct seccomp_filter *prev;
struct bpf_prog *prog;
@@ -893,6 +896,10 @@ static long seccomp_attach_filter(unsigned int flags,
if (flags & SECCOMP_FILTER_FLAG_LOG)
filter->log = true;
+ /* Set wait killable flag, if present. */
+ if (flags & SECCOMP_FILTER_FLAG_WAIT_KILLABLE_RECV)
+ filter->wait_killable_recv = true;
+
/*
* If there is an existing filter, make it the prev and don't drop its
* task reference.
@@ -1080,6 +1087,12 @@ static void seccomp_handle_addfd(struct seccomp_kaddfd *addfd, struct seccomp_kn
complete(&addfd->completion);
}
+static bool should_sleep_killable(struct seccomp_filter *match,
+ struct seccomp_knotif *n)
+{
+ return match->wait_killable_recv && n->state == SECCOMP_NOTIFY_SENT;
+}
+
static int seccomp_do_user_notification(int this_syscall,
struct seccomp_filter *match,
const struct seccomp_data *sd)
@@ -1100,7 +1113,7 @@ static int seccomp_do_user_notification(int this_syscall,
n.data = sd;
n.id = seccomp_next_notify_id(match);
init_completion(&n.ready);
- list_add(&n.list, &match->notif->notifications);
+ list_add_tail(&n.list, &match->notif->notifications);
INIT_LIST_HEAD(&n.addfd);
up(&match->notif->request);
@@ -1110,11 +1123,25 @@ static int seccomp_do_user_notification(int this_syscall,
* This is where we wait for a reply from userspace.
*/
do {
+ bool wait_killable = should_sleep_killable(match, &n);
+
mutex_unlock(&match->notify_lock);
- err = wait_for_completion_interruptible(&n.ready);
+ if (wait_killable)
+ err = wait_for_completion_killable(&n.ready);
+ else
+ err = wait_for_completion_interruptible(&n.ready);
mutex_lock(&match->notify_lock);
- if (err != 0)
+
+ if (err != 0) {
+ /*
+ * Check to see if the notifcation got picked up and
+ * whether we should switch to wait killable.
+ */
+ if (!wait_killable && should_sleep_killable(match, &n))
+ continue;
+
goto interrupted;
+ }
addfd = list_first_entry_or_null(&n.addfd,
struct seccomp_kaddfd, list);
@@ -1484,6 +1511,9 @@ out:
mutex_lock(&filter->notify_lock);
knotif = find_notification(filter, unotif.id);
if (knotif) {
+ /* Reset the process to make sure it's not stuck */
+ if (should_sleep_killable(filter, knotif))
+ complete(&knotif->ready);
knotif->state = SECCOMP_NOTIFY_INIT;
up(&filter->notif->request);
}
@@ -1829,6 +1859,14 @@ static long seccomp_set_mode_filter(unsigned int flags,
((flags & SECCOMP_FILTER_FLAG_TSYNC_ESRCH) == 0))
return -EINVAL;
+ /*
+ * The SECCOMP_FILTER_FLAG_WAIT_KILLABLE_SENT flag doesn't make sense
+ * without the SECCOMP_FILTER_FLAG_NEW_LISTENER flag.
+ */
+ if ((flags & SECCOMP_FILTER_FLAG_WAIT_KILLABLE_RECV) &&
+ ((flags & SECCOMP_FILTER_FLAG_NEW_LISTENER) == 0))
+ return -EINVAL;
+
/* Prepare the new filter before holding any locks. */
prepared = seccomp_prepare_user_filter(filter);
if (IS_ERR(prepared))
diff --git a/kernel/signal.c b/kernel/signal.c
index e43bc2a692f5..edb1dc9b00dc 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -762,7 +762,10 @@ still_pending:
*/
void signal_wake_up_state(struct task_struct *t, unsigned int state)
{
+ lockdep_assert_held(&t->sighand->siglock);
+
set_tsk_thread_flag(t, TIF_SIGPENDING);
+
/*
* TASK_WAKEKILL also means wake it up in the stopped/traced/killable
* case. We don't check t->state here because there is a race with it
@@ -884,7 +887,7 @@ static int check_kill_permission(int sig, struct kernel_siginfo *info,
static void ptrace_trap_notify(struct task_struct *t)
{
WARN_ON_ONCE(!(t->ptrace & PT_SEIZED));
- assert_spin_locked(&t->sighand->siglock);
+ lockdep_assert_held(&t->sighand->siglock);
task_set_jobctl_pending(t, JOBCTL_TRAP_NOTIFY);
ptrace_signal_wake_up(t, t->jobctl & JOBCTL_LISTENING);
@@ -930,9 +933,10 @@ static bool prepare_signal(int sig, struct task_struct *p, bool force)
for_each_thread(p, t) {
flush_sigqueue_mask(&flush, &t->pending);
task_clear_jobctl_pending(t, JOBCTL_STOP_PENDING);
- if (likely(!(t->ptrace & PT_SEIZED)))
+ if (likely(!(t->ptrace & PT_SEIZED))) {
+ t->jobctl &= ~JOBCTL_STOPPED;
wake_up_state(t, __TASK_STOPPED);
- else
+ } else
ptrace_trap_notify(t);
}
@@ -1071,15 +1075,15 @@ static inline bool legacy_queue(struct sigpending *signals, int sig)
return (sig < SIGRTMIN) && sigismember(&signals->signal, sig);
}
-static int __send_signal(int sig, struct kernel_siginfo *info, struct task_struct *t,
- enum pid_type type, bool force)
+static int __send_signal_locked(int sig, struct kernel_siginfo *info,
+ struct task_struct *t, enum pid_type type, bool force)
{
struct sigpending *pending;
struct sigqueue *q;
int override_rlimit;
int ret = 0, result;
- assert_spin_locked(&t->sighand->siglock);
+ lockdep_assert_held(&t->sighand->siglock);
result = TRACE_SIGNAL_IGNORED;
if (!prepare_signal(sig, t, force))
@@ -1212,8 +1216,8 @@ static inline bool has_si_pid_and_uid(struct kernel_siginfo *info)
return ret;
}
-static int send_signal(int sig, struct kernel_siginfo *info, struct task_struct *t,
- enum pid_type type)
+int send_signal_locked(int sig, struct kernel_siginfo *info,
+ struct task_struct *t, enum pid_type type)
{
/* Should SIGKILL or SIGSTOP be received by a pid namespace init? */
bool force = false;
@@ -1245,7 +1249,7 @@ static int send_signal(int sig, struct kernel_siginfo *info, struct task_struct
force = true;
}
}
- return __send_signal(sig, info, t, type, force);
+ return __send_signal_locked(sig, info, t, type, force);
}
static void print_fatal_signal(int signr)
@@ -1281,12 +1285,6 @@ static int __init setup_print_fatal_signals(char *str)
__setup("print-fatal-signals=", setup_print_fatal_signals);
-int
-__group_send_sig_info(int sig, struct kernel_siginfo *info, struct task_struct *p)
-{
- return send_signal(sig, info, p, PIDTYPE_TGID);
-}
-
int do_send_sig_info(int sig, struct kernel_siginfo *info, struct task_struct *p,
enum pid_type type)
{
@@ -1294,7 +1292,7 @@ int do_send_sig_info(int sig, struct kernel_siginfo *info, struct task_struct *p
int ret = -ESRCH;
if (lock_task_sighand(p, &flags)) {
- ret = send_signal(sig, info, p, type);
+ ret = send_signal_locked(sig, info, p, type);
unlock_task_sighand(p, &flags);
}
@@ -1347,7 +1345,7 @@ force_sig_info_to_task(struct kernel_siginfo *info, struct task_struct *t,
if (action->sa.sa_handler == SIG_DFL &&
(!t->ptrace || (handler == HANDLER_EXIT)))
t->signal->flags &= ~SIGNAL_UNKILLABLE;
- ret = send_signal(sig, info, t, PIDTYPE_PID);
+ ret = send_signal_locked(sig, info, t, PIDTYPE_PID);
spin_unlock_irqrestore(&t->sighand->siglock, flags);
return ret;
@@ -1567,7 +1565,7 @@ int kill_pid_usb_asyncio(int sig, int errno, sigval_t addr,
if (sig) {
if (lock_task_sighand(p, &flags)) {
- ret = __send_signal(sig, &info, p, PIDTYPE_TGID, false);
+ ret = __send_signal_locked(sig, &info, p, PIDTYPE_TGID, false);
unlock_task_sighand(p, &flags);
} else
ret = -ESRCH;
@@ -2114,7 +2112,7 @@ bool do_notify_parent(struct task_struct *tsk, int sig)
* parent's namespaces.
*/
if (valid_signal(sig) && sig)
- __send_signal(sig, &info, tsk->parent, PIDTYPE_TGID, false);
+ __send_signal_locked(sig, &info, tsk->parent, PIDTYPE_TGID, false);
__wake_up_parent(tsk, tsk->parent);
spin_unlock_irqrestore(&psig->siglock, flags);
@@ -2184,7 +2182,7 @@ static void do_notify_parent_cldstop(struct task_struct *tsk,
spin_lock_irqsave(&sighand->siglock, flags);
if (sighand->action[SIGCHLD-1].sa.sa_handler != SIG_IGN &&
!(sighand->action[SIGCHLD-1].sa.sa_flags & SA_NOCLDSTOP))
- __group_send_sig_info(SIGCHLD, &info, parent);
+ send_signal_locked(SIGCHLD, &info, parent, PIDTYPE_TGID);
/*
* Even if SIGCHLD is not generated, we must wake up wait4 calls.
*/
@@ -2204,13 +2202,12 @@ static void do_notify_parent_cldstop(struct task_struct *tsk,
* with. If the code did not stop because the tracer is gone,
* the stop signal remains unchanged unless clear_code.
*/
-static int ptrace_stop(int exit_code, int why, int clear_code,
- unsigned long message, kernel_siginfo_t *info)
+static int ptrace_stop(int exit_code, int why, unsigned long message,
+ kernel_siginfo_t *info)
__releases(&current->sighand->siglock)
__acquires(&current->sighand->siglock)
{
bool gstop_done = false;
- bool read_code = true;
if (arch_ptrace_stop_needed()) {
/*
@@ -2227,10 +2224,16 @@ static int ptrace_stop(int exit_code, int why, int clear_code,
}
/*
- * schedule() will not sleep if there is a pending signal that
- * can awaken the task.
+ * After this point ptrace_signal_wake_up or signal_wake_up
+ * will clear TASK_TRACED if ptrace_unlink happens or a fatal
+ * signal comes in. Handle previous ptrace_unlinks and fatal
+ * signals here to prevent ptrace_stop sleeping in schedule.
*/
+ if (!current->ptrace || __fatal_signal_pending(current))
+ return exit_code;
+
set_special_state(TASK_TRACED);
+ current->jobctl |= JOBCTL_TRACED;
/*
* We're committing to trapping. TRACED should be visible before
@@ -2276,54 +2279,33 @@ static int ptrace_stop(int exit_code, int why, int clear_code,
spin_unlock_irq(&current->sighand->siglock);
read_lock(&tasklist_lock);
- if (likely(current->ptrace)) {
- /*
- * Notify parents of the stop.
- *
- * While ptraced, there are two parents - the ptracer and
- * the real_parent of the group_leader. The ptracer should
- * know about every stop while the real parent is only
- * interested in the completion of group stop. The states
- * for the two don't interact with each other. Notify
- * separately unless they're gonna be duplicates.
- */
+ /*
+ * Notify parents of the stop.
+ *
+ * While ptraced, there are two parents - the ptracer and
+ * the real_parent of the group_leader. The ptracer should
+ * know about every stop while the real parent is only
+ * interested in the completion of group stop. The states
+ * for the two don't interact with each other. Notify
+ * separately unless they're gonna be duplicates.
+ */
+ if (current->ptrace)
do_notify_parent_cldstop(current, true, why);
- if (gstop_done && ptrace_reparented(current))
- do_notify_parent_cldstop(current, false, why);
+ if (gstop_done && (!current->ptrace || ptrace_reparented(current)))
+ do_notify_parent_cldstop(current, false, why);
- /*
- * Don't want to allow preemption here, because
- * sys_ptrace() needs this task to be inactive.
- *
- * XXX: implement read_unlock_no_resched().
- */
- preempt_disable();
- read_unlock(&tasklist_lock);
- cgroup_enter_frozen();
- preempt_enable_no_resched();
- freezable_schedule();
- cgroup_leave_frozen(true);
- } else {
- /*
- * By the time we got the lock, our tracer went away.
- * Don't drop the lock yet, another tracer may come.
- *
- * If @gstop_done, the ptracer went away between group stop
- * completion and here. During detach, it would have set
- * JOBCTL_STOP_PENDING on us and we'll re-enter
- * TASK_STOPPED in do_signal_stop() on return, so notifying
- * the real parent of the group stop completion is enough.
- */
- if (gstop_done)
- do_notify_parent_cldstop(current, false, why);
-
- /* tasklist protects us from ptrace_freeze_traced() */
- __set_current_state(TASK_RUNNING);
- read_code = false;
- if (clear_code)
- exit_code = 0;
- read_unlock(&tasklist_lock);
- }
+ /*
+ * Don't want to allow preemption here, because
+ * sys_ptrace() needs this task to be inactive.
+ *
+ * XXX: implement read_unlock_no_resched().
+ */
+ preempt_disable();
+ read_unlock(&tasklist_lock);
+ cgroup_enter_frozen();
+ preempt_enable_no_resched();
+ freezable_schedule();
+ cgroup_leave_frozen(true);
/*
* We are back. Now reacquire the siglock before touching
@@ -2331,14 +2313,13 @@ static int ptrace_stop(int exit_code, int why, int clear_code,
* any signal-sending on another CPU that wants to examine it.
*/
spin_lock_irq(&current->sighand->siglock);
- if (read_code)
- exit_code = current->exit_code;
+ exit_code = current->exit_code;
current->last_siginfo = NULL;
current->ptrace_message = 0;
current->exit_code = 0;
/* LISTENING can be set only during STOP traps, clear it */
- current->jobctl &= ~JOBCTL_LISTENING;
+ current->jobctl &= ~(JOBCTL_LISTENING | JOBCTL_PTRACE_FROZEN);
/*
* Queued signals ignored us while we were stopped for tracing.
@@ -2360,7 +2341,7 @@ static int ptrace_do_notify(int signr, int exit_code, int why, unsigned long mes
info.si_uid = from_kuid_munged(current_user_ns(), current_uid());
/* Let the debugger run. */
- return ptrace_stop(exit_code, why, 1, message, &info);
+ return ptrace_stop(exit_code, why, message, &info);
}
int ptrace_notify(int exit_code, unsigned long message)
@@ -2471,6 +2452,7 @@ static bool do_signal_stop(int signr)
if (task_participate_group_stop(current))
notify = CLD_STOPPED;
+ current->jobctl |= JOBCTL_STOPPED;
set_special_state(TASK_STOPPED);
spin_unlock_irq(&current->sighand->siglock);
@@ -2532,7 +2514,7 @@ static void do_jobctl_trap(void)
CLD_STOPPED, 0);
} else {
WARN_ON_ONCE(!signr);
- ptrace_stop(signr, CLD_STOPPED, 0, 0, NULL);
+ ptrace_stop(signr, CLD_STOPPED, 0, NULL);
}
}
@@ -2585,7 +2567,7 @@ static int ptrace_signal(int signr, kernel_siginfo_t *info, enum pid_type type)
* comment in dequeue_signal().
*/
current->jobctl |= JOBCTL_STOP_DEQUEUED;
- signr = ptrace_stop(signr, CLD_TRAPPED, 0, 0, info);
+ signr = ptrace_stop(signr, CLD_TRAPPED, 0, info);
/* We're back. Did the debugger cancel the sig? */
if (signr == 0)
@@ -2612,7 +2594,7 @@ static int ptrace_signal(int signr, kernel_siginfo_t *info, enum pid_type type)
/* If the (new) signal is now blocked, requeue it. */
if (sigismember(&current->blocked, signr) ||
fatal_signal_pending(current)) {
- send_signal(signr, info, current, type);
+ send_signal_locked(signr, info, current, type);
signr = 0;
}
@@ -4807,7 +4789,7 @@ void kdb_send_sig(struct task_struct *t, int sig)
"the deadlock.\n");
return;
}
- ret = send_signal(sig, SEND_SIG_PRIV, t, PIDTYPE_PID);
+ ret = send_signal_locked(sig, SEND_SIG_PRIV, t, PIDTYPE_PID);
spin_unlock(&t->sighand->siglock);
if (ret)
kdb_printf("Fail to deliver Signal %d to process %d.\n",
diff --git a/kernel/stackleak.c b/kernel/stackleak.c
index ddb5a7f48d69..c2c33d2202e9 100644
--- a/kernel/stackleak.c
+++ b/kernel/stackleak.c
@@ -70,59 +70,81 @@ late_initcall(stackleak_sysctls_init);
#define skip_erasing() false
#endif /* CONFIG_STACKLEAK_RUNTIME_DISABLE */
-asmlinkage void noinstr stackleak_erase(void)
+static __always_inline void __stackleak_erase(bool on_task_stack)
{
- /* It would be nice not to have 'kstack_ptr' and 'boundary' on stack */
- unsigned long kstack_ptr = current->lowest_stack;
- unsigned long boundary = (unsigned long)end_of_stack(current);
- unsigned int poison_count = 0;
- const unsigned int depth = STACKLEAK_SEARCH_DEPTH / sizeof(unsigned long);
-
- if (skip_erasing())
- return;
-
- /* Check that 'lowest_stack' value is sane */
- if (unlikely(kstack_ptr - boundary >= THREAD_SIZE))
- kstack_ptr = boundary;
+ const unsigned long task_stack_low = stackleak_task_low_bound(current);
+ const unsigned long task_stack_high = stackleak_task_high_bound(current);
+ unsigned long erase_low, erase_high;
- /* Search for the poison value in the kernel stack */
- while (kstack_ptr > boundary && poison_count <= depth) {
- if (*(unsigned long *)kstack_ptr == STACKLEAK_POISON)
- poison_count++;
- else
- poison_count = 0;
-
- kstack_ptr -= sizeof(unsigned long);
- }
-
- /*
- * One 'long int' at the bottom of the thread stack is reserved and
- * should not be poisoned (see CONFIG_SCHED_STACK_END_CHECK=y).
- */
- if (kstack_ptr == boundary)
- kstack_ptr += sizeof(unsigned long);
+ erase_low = stackleak_find_top_of_poison(task_stack_low,
+ current->lowest_stack);
#ifdef CONFIG_STACKLEAK_METRICS
- current->prev_lowest_stack = kstack_ptr;
+ current->prev_lowest_stack = erase_low;
#endif
/*
- * Now write the poison value to the kernel stack. Start from
- * 'kstack_ptr' and move up till the new 'boundary'. We assume that
- * the stack pointer doesn't change when we write poison.
+ * Write poison to the task's stack between 'erase_low' and
+ * 'erase_high'.
+ *
+ * If we're running on a different stack (e.g. an entry trampoline
+ * stack) we can erase everything below the pt_regs at the top of the
+ * task stack.
+ *
+ * If we're running on the task stack itself, we must not clobber any
+ * stack used by this function and its caller. We assume that this
+ * function has a fixed-size stack frame, and the current stack pointer
+ * doesn't change while we write poison.
*/
- if (on_thread_stack())
- boundary = current_stack_pointer;
+ if (on_task_stack)
+ erase_high = current_stack_pointer;
else
- boundary = current_top_of_stack();
+ erase_high = task_stack_high;
- while (kstack_ptr < boundary) {
- *(unsigned long *)kstack_ptr = STACKLEAK_POISON;
- kstack_ptr += sizeof(unsigned long);
+ while (erase_low < erase_high) {
+ *(unsigned long *)erase_low = STACKLEAK_POISON;
+ erase_low += sizeof(unsigned long);
}
/* Reset the 'lowest_stack' value for the next syscall */
- current->lowest_stack = current_top_of_stack() - THREAD_SIZE/64;
+ current->lowest_stack = task_stack_high;
+}
+
+/*
+ * Erase and poison the portion of the task stack used since the last erase.
+ * Can be called from the task stack or an entry stack when the task stack is
+ * no longer in use.
+ */
+asmlinkage void noinstr stackleak_erase(void)
+{
+ if (skip_erasing())
+ return;
+
+ __stackleak_erase(on_thread_stack());
+}
+
+/*
+ * Erase and poison the portion of the task stack used since the last erase.
+ * Can only be called from the task stack.
+ */
+asmlinkage void noinstr stackleak_erase_on_task_stack(void)
+{
+ if (skip_erasing())
+ return;
+
+ __stackleak_erase(true);
+}
+
+/*
+ * Erase and poison the portion of the task stack used since the last erase.
+ * Can only be called from a stack other than the task stack.
+ */
+asmlinkage void noinstr stackleak_erase_off_task_stack(void)
+{
+ if (skip_erasing())
+ return;
+
+ __stackleak_erase(false);
}
void __used __no_caller_saved_registers noinstr stackleak_track_stack(void)
@@ -139,8 +161,7 @@ void __used __no_caller_saved_registers noinstr stackleak_track_stack(void)
/* 'lowest_stack' should be aligned on the register width boundary */
sp = ALIGN(sp, sizeof(unsigned long));
if (sp < current->lowest_stack &&
- sp >= (unsigned long)task_stack_page(current) +
- sizeof(unsigned long)) {
+ sp >= stackleak_task_low_bound(current)) {
current->lowest_stack = sp;
}
}
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 5b7b1a82ae6a..e52b6e372c60 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -61,13 +61,9 @@
#include <linux/capability.h>
#include <linux/binfmts.h>
#include <linux/sched/sysctl.h>
-#include <linux/kexec.h>
-#include <linux/bpf.h>
#include <linux/mount.h>
#include <linux/userfaultfd_k.h>
-#include <linux/latencytop.h>
#include <linux/pid.h>
-#include <linux/delayacct.h>
#include "../lib/kstrtox.h"
@@ -82,15 +78,9 @@
#ifdef CONFIG_SPARC
#include <asm/setup.h>
#endif
-#ifdef CONFIG_BSD_PROCESS_ACCT
-#include <linux/acct.h>
-#endif
#ifdef CONFIG_RT_MUTEXES
#include <linux/rtmutex.h>
#endif
-#if defined(CONFIG_PROVE_LOCKING) || defined(CONFIG_LOCK_STAT)
-#include <linux/lockdep.h>
-#endif
#if defined(CONFIG_SYSCTL)
@@ -100,8 +90,6 @@
static const int six_hundred_forty_kb = 640 * 1024;
#endif
-/* this is needed for the proc_doulongvec_minmax of vm_dirty_bytes */
-static const unsigned long dirty_bytes_min = 2 * PAGE_SIZE;
static const int ngroups_max = NGROUPS_MAX;
static const int cap_last_cap = CAP_LAST_CAP;
@@ -148,66 +136,6 @@ static const int max_extfrag_threshold = 1000;
#endif /* CONFIG_SYSCTL */
-#if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_SYSCTL)
-static int bpf_stats_handler(struct ctl_table *table, int write,
- void *buffer, size_t *lenp, loff_t *ppos)
-{
- struct static_key *key = (struct static_key *)table->data;
- static int saved_val;
- int val, ret;
- struct ctl_table tmp = {
- .data = &val,
- .maxlen = sizeof(val),
- .mode = table->mode,
- .extra1 = SYSCTL_ZERO,
- .extra2 = SYSCTL_ONE,
- };
-
- if (write && !capable(CAP_SYS_ADMIN))
- return -EPERM;
-
- mutex_lock(&bpf_stats_enabled_mutex);
- val = saved_val;
- ret = proc_dointvec_minmax(&tmp, write, buffer, lenp, ppos);
- if (write && !ret && val != saved_val) {
- if (val)
- static_key_slow_inc(key);
- else
- static_key_slow_dec(key);
- saved_val = val;
- }
- mutex_unlock(&bpf_stats_enabled_mutex);
- return ret;
-}
-
-void __weak unpriv_ebpf_notify(int new_state)
-{
-}
-
-static int bpf_unpriv_handler(struct ctl_table *table, int write,
- void *buffer, size_t *lenp, loff_t *ppos)
-{
- int ret, unpriv_enable = *(int *)table->data;
- bool locked_state = unpriv_enable == 1;
- struct ctl_table tmp = *table;
-
- if (write && !capable(CAP_SYS_ADMIN))
- return -EPERM;
-
- tmp.data = &unpriv_enable;
- ret = proc_dointvec_minmax(&tmp, write, buffer, lenp, ppos);
- if (write && !ret) {
- if (locked_state && unpriv_enable != 1)
- return -EPERM;
- *(int *)table->data = unpriv_enable;
- }
-
- unpriv_ebpf_notify(unpriv_enable);
-
- return ret;
-}
-#endif /* CONFIG_BPF_SYSCALL && CONFIG_SYSCTL */
-
/*
* /proc/sys support
*/
@@ -1659,35 +1587,6 @@ int proc_do_static_key(struct ctl_table *table, int write,
}
static struct ctl_table kern_table[] = {
- {
- .procname = "sched_child_runs_first",
- .data = &sysctl_sched_child_runs_first,
- .maxlen = sizeof(unsigned int),
- .mode = 0644,
- .proc_handler = proc_dointvec,
- },
-#ifdef CONFIG_SCHEDSTATS
- {
- .procname = "sched_schedstats",
- .data = NULL,
- .maxlen = sizeof(unsigned int),
- .mode = 0644,
- .proc_handler = sysctl_schedstats,
- .extra1 = SYSCTL_ZERO,
- .extra2 = SYSCTL_ONE,
- },
-#endif /* CONFIG_SCHEDSTATS */
-#ifdef CONFIG_TASK_DELAY_ACCT
- {
- .procname = "task_delayacct",
- .data = NULL,
- .maxlen = sizeof(unsigned int),
- .mode = 0644,
- .proc_handler = sysctl_delayacct,
- .extra1 = SYSCTL_ZERO,
- .extra2 = SYSCTL_ONE,
- },
-#endif /* CONFIG_TASK_DELAY_ACCT */
#ifdef CONFIG_NUMA_BALANCING
{
.procname = "numa_balancing",
@@ -1700,103 +1599,6 @@ static struct ctl_table kern_table[] = {
},
#endif /* CONFIG_NUMA_BALANCING */
{
- .procname = "sched_rt_period_us",
- .data = &sysctl_sched_rt_period,
- .maxlen = sizeof(unsigned int),
- .mode = 0644,
- .proc_handler = sched_rt_handler,
- },
- {
- .procname = "sched_rt_runtime_us",
- .data = &sysctl_sched_rt_runtime,
- .maxlen = sizeof(int),
- .mode = 0644,
- .proc_handler = sched_rt_handler,
- },
- {
- .procname = "sched_deadline_period_max_us",
- .data = &sysctl_sched_dl_period_max,
- .maxlen = sizeof(unsigned int),
- .mode = 0644,
- .proc_handler = proc_dointvec,
- },
- {
- .procname = "sched_deadline_period_min_us",
- .data = &sysctl_sched_dl_period_min,
- .maxlen = sizeof(unsigned int),
- .mode = 0644,
- .proc_handler = proc_dointvec,
- },
- {
- .procname = "sched_rr_timeslice_ms",
- .data = &sysctl_sched_rr_timeslice,
- .maxlen = sizeof(int),
- .mode = 0644,
- .proc_handler = sched_rr_handler,
- },
-#ifdef CONFIG_UCLAMP_TASK
- {
- .procname = "sched_util_clamp_min",
- .data = &sysctl_sched_uclamp_util_min,
- .maxlen = sizeof(unsigned int),
- .mode = 0644,
- .proc_handler = sysctl_sched_uclamp_handler,
- },
- {
- .procname = "sched_util_clamp_max",
- .data = &sysctl_sched_uclamp_util_max,
- .maxlen = sizeof(unsigned int),
- .mode = 0644,
- .proc_handler = sysctl_sched_uclamp_handler,
- },
- {
- .procname = "sched_util_clamp_min_rt_default",
- .data = &sysctl_sched_uclamp_util_min_rt_default,
- .maxlen = sizeof(unsigned int),
- .mode = 0644,
- .proc_handler = sysctl_sched_uclamp_handler,
- },
-#endif
-#ifdef CONFIG_CFS_BANDWIDTH
- {
- .procname = "sched_cfs_bandwidth_slice_us",
- .data = &sysctl_sched_cfs_bandwidth_slice,
- .maxlen = sizeof(unsigned int),
- .mode = 0644,
- .proc_handler = proc_dointvec_minmax,
- .extra1 = SYSCTL_ONE,
- },
-#endif
-#if defined(CONFIG_ENERGY_MODEL) && defined(CONFIG_CPU_FREQ_GOV_SCHEDUTIL)
- {
- .procname = "sched_energy_aware",
- .data = &sysctl_sched_energy_aware,
- .maxlen = sizeof(unsigned int),
- .mode = 0644,
- .proc_handler = sched_energy_aware_handler,
- .extra1 = SYSCTL_ZERO,
- .extra2 = SYSCTL_ONE,
- },
-#endif
-#ifdef CONFIG_PROVE_LOCKING
- {
- .procname = "prove_locking",
- .data = &prove_locking,
- .maxlen = sizeof(int),
- .mode = 0644,
- .proc_handler = proc_dointvec,
- },
-#endif
-#ifdef CONFIG_LOCK_STAT
- {
- .procname = "lock_stat",
- .data = &lock_stat,
- .maxlen = sizeof(int),
- .mode = 0644,
- .proc_handler = proc_dointvec,
- },
-#endif
- {
.procname = "panic",
.data = &panic_timeout,
.maxlen = sizeof(int),
@@ -1820,24 +1622,6 @@ static struct ctl_table kern_table[] = {
.extra2 = SYSCTL_ONE,
},
#endif
-#ifdef CONFIG_LATENCYTOP
- {
- .procname = "latencytop",
- .data = &latencytop_enabled,
- .maxlen = sizeof(int),
- .mode = 0644,
- .proc_handler = sysctl_latencytop,
- },
-#endif
-#ifdef CONFIG_BLK_DEV_INITRD
- {
- .procname = "real-root-dev",
- .data = &real_root_dev,
- .maxlen = sizeof(int),
- .mode = 0644,
- .proc_handler = proc_dointvec,
- },
-#endif
{
.procname = "print-fatal-signals",
.data = &print_fatal_signals,
@@ -1895,22 +1679,6 @@ static struct ctl_table kern_table[] = {
.proc_handler = proc_dointvec,
},
#endif
- {
- .procname = "ctrl-alt-del",
- .data = &C_A_D,
- .maxlen = sizeof(int),
- .mode = 0644,
- .proc_handler = proc_dointvec,
- },
-#ifdef CONFIG_FUNCTION_TRACER
- {
- .procname = "ftrace_enabled",
- .data = &ftrace_enabled,
- .maxlen = sizeof(int),
- .mode = 0644,
- .proc_handler = ftrace_enable_sysctl,
- },
-#endif
#ifdef CONFIG_STACK_TRACER
{
.procname = "stack_tracer_enabled",
@@ -1943,18 +1711,6 @@ static struct ctl_table kern_table[] = {
.proc_handler = tracepoint_printk_sysctl,
},
#endif
-#ifdef CONFIG_KEXEC_CORE
- {
- .procname = "kexec_load_disabled",
- .data = &kexec_load_disabled,
- .maxlen = sizeof(int),
- .mode = 0644,
- /* only handle a transition from default "0" to "1" */
- .proc_handler = proc_dointvec_minmax,
- .extra1 = SYSCTL_ONE,
- .extra2 = SYSCTL_ONE,
- },
-#endif
#ifdef CONFIG_MODULES
{
.procname = "modprobe",
@@ -1983,15 +1739,6 @@ static struct ctl_table kern_table[] = {
.proc_handler = proc_dostring,
},
#endif
-#ifdef CONFIG_BSD_PROCESS_ACCT
- {
- .procname = "acct",
- .data = &acct_parm,
- .maxlen = 3*sizeof(int),
- .mode = 0644,
- .proc_handler = proc_dointvec,
- },
-#endif
#ifdef CONFIG_MAGIC_SYSRQ
{
.procname = "sysrq",
@@ -2049,17 +1796,6 @@ static struct ctl_table kern_table[] = {
.proc_handler = proc_dointvec,
},
#endif
-#ifdef CONFIG_SMP
- {
- .procname = "oops_all_cpu_backtrace",
- .data = &sysctl_oops_all_cpu_backtrace,
- .maxlen = sizeof(int),
- .mode = 0644,
- .proc_handler = proc_dointvec_minmax,
- .extra1 = SYSCTL_ZERO,
- .extra2 = SYSCTL_ONE,
- },
-#endif /* CONFIG_SMP */
{
.procname = "pid_max",
.data = &pid_max,
@@ -2208,13 +1944,6 @@ static struct ctl_table kern_table[] = {
.proc_handler = proc_dointvec,
},
#endif
- {
- .procname = "poweroff_cmd",
- .data = &poweroff_cmd,
- .maxlen = POWEROFF_CMD_PATH_LEN,
- .mode = 0644,
- .proc_handler = proc_dostring,
- },
#ifdef CONFIG_KEYS
{
.procname = "keys",
@@ -2288,24 +2017,6 @@ static struct ctl_table kern_table[] = {
.extra1 = SYSCTL_ZERO,
.extra2 = SYSCTL_ONE,
},
-#ifdef CONFIG_BPF_SYSCALL
- {
- .procname = "unprivileged_bpf_disabled",
- .data = &sysctl_unprivileged_bpf_disabled,
- .maxlen = sizeof(sysctl_unprivileged_bpf_disabled),
- .mode = 0644,
- .proc_handler = bpf_unpriv_handler,
- .extra1 = SYSCTL_ZERO,
- .extra2 = SYSCTL_TWO,
- },
- {
- .procname = "bpf_stats_enabled",
- .data = &bpf_stats_enabled_key.key,
- .maxlen = sizeof(bpf_stats_enabled_key),
- .mode = 0644,
- .proc_handler = bpf_stats_handler,
- },
-#endif
#if defined(CONFIG_TREE_RCU)
{
.procname = "panic_on_rcu_stall",
@@ -2342,29 +2053,6 @@ static struct ctl_table vm_table[] = {
.extra2 = SYSCTL_TWO,
},
{
- .procname = "panic_on_oom",
- .data = &sysctl_panic_on_oom,
- .maxlen = sizeof(sysctl_panic_on_oom),
- .mode = 0644,
- .proc_handler = proc_dointvec_minmax,
- .extra1 = SYSCTL_ZERO,
- .extra2 = SYSCTL_TWO,
- },
- {
- .procname = "oom_kill_allocating_task",
- .data = &sysctl_oom_kill_allocating_task,
- .maxlen = sizeof(sysctl_oom_kill_allocating_task),
- .mode = 0644,
- .proc_handler = proc_dointvec,
- },
- {
- .procname = "oom_dump_tasks",
- .data = &sysctl_oom_dump_tasks,
- .maxlen = sizeof(sysctl_oom_dump_tasks),
- .mode = 0644,
- .proc_handler = proc_dointvec,
- },
- {
.procname = "overcommit_ratio",
.data = &sysctl_overcommit_ratio,
.maxlen = sizeof(sysctl_overcommit_ratio),
@@ -2387,55 +2075,6 @@ static struct ctl_table vm_table[] = {
.extra1 = SYSCTL_ZERO,
},
{
- .procname = "dirty_background_ratio",
- .data = &dirty_background_ratio,
- .maxlen = sizeof(dirty_background_ratio),
- .mode = 0644,
- .proc_handler = dirty_background_ratio_handler,
- .extra1 = SYSCTL_ZERO,
- .extra2 = SYSCTL_ONE_HUNDRED,
- },
- {
- .procname = "dirty_background_bytes",
- .data = &dirty_background_bytes,
- .maxlen = sizeof(dirty_background_bytes),
- .mode = 0644,
- .proc_handler = dirty_background_bytes_handler,
- .extra1 = SYSCTL_LONG_ONE,
- },
- {
- .procname = "dirty_ratio",
- .data = &vm_dirty_ratio,
- .maxlen = sizeof(vm_dirty_ratio),
- .mode = 0644,
- .proc_handler = dirty_ratio_handler,
- .extra1 = SYSCTL_ZERO,
- .extra2 = SYSCTL_ONE_HUNDRED,
- },
- {
- .procname = "dirty_bytes",
- .data = &vm_dirty_bytes,
- .maxlen = sizeof(vm_dirty_bytes),
- .mode = 0644,
- .proc_handler = dirty_bytes_handler,
- .extra1 = (void *)&dirty_bytes_min,
- },
- {
- .procname = "dirty_writeback_centisecs",
- .data = &dirty_writeback_interval,
- .maxlen = sizeof(dirty_writeback_interval),
- .mode = 0644,
- .proc_handler = dirty_writeback_centisecs_handler,
- },
- {
- .procname = "dirty_expire_centisecs",
- .data = &dirty_expire_interval,
- .maxlen = sizeof(dirty_expire_interval),
- .mode = 0644,
- .proc_handler = proc_dointvec_minmax,
- .extra1 = SYSCTL_ZERO,
- },
- {
.procname = "dirtytime_expire_seconds",
.data = &dirtytime_expire_interval,
.maxlen = sizeof(dirtytime_expire_interval),
@@ -2607,13 +2246,6 @@ static struct ctl_table vm_table[] = {
},
#endif
{
- .procname = "laptop_mode",
- .data = &laptop_mode,
- .maxlen = sizeof(laptop_mode),
- .mode = 0644,
- .proc_handler = proc_dointvec_jiffies,
- },
- {
.procname = "vfs_cache_pressure",
.data = &sysctl_vfs_cache_pressure,
.maxlen = sizeof(sysctl_vfs_cache_pressure),
@@ -2710,17 +2342,6 @@ static struct ctl_table vm_table[] = {
.extra1 = SYSCTL_ZERO,
},
#endif
-#ifdef CONFIG_HIGHMEM
- {
- .procname = "highmem_is_dirtyable",
- .data = &vm_highmem_is_dirtyable,
- .maxlen = sizeof(vm_highmem_is_dirtyable),
- .mode = 0644,
- .proc_handler = proc_dointvec_minmax,
- .extra1 = SYSCTL_ZERO,
- .extra2 = SYSCTL_ONE,
- },
-#endif
#ifdef CONFIG_MEMORY_FAILURE
{
.procname = "memory_failure_early_kill",
diff --git a/kernel/taskstats.c b/kernel/taskstats.c
index bcac5a9043aa..f7e246336218 100644
--- a/kernel/taskstats.c
+++ b/kernel/taskstats.c
@@ -9,6 +9,7 @@
#include <linux/kernel.h>
#include <linux/taskstats_kern.h>
#include <linux/tsacct_kern.h>
+#include <linux/acct.h>
#include <linux/delayacct.h>
#include <linux/cpumask.h>
#include <linux/percpu.h>
@@ -153,6 +154,23 @@ static void send_cpu_listeners(struct sk_buff *skb,
up_write(&listeners->sem);
}
+static void exe_add_tsk(struct taskstats *stats, struct task_struct *tsk)
+{
+ /* No idea if I'm allowed to access that here, now. */
+ struct file *exe_file = get_task_exe_file(tsk);
+
+ if (exe_file) {
+ /* Following cp_new_stat64() in stat.c . */
+ stats->ac_exe_dev =
+ huge_encode_dev(exe_file->f_inode->i_sb->s_dev);
+ stats->ac_exe_inode = exe_file->f_inode->i_ino;
+ fput(exe_file);
+ } else {
+ stats->ac_exe_dev = 0;
+ stats->ac_exe_inode = 0;
+ }
+}
+
static void fill_stats(struct user_namespace *user_ns,
struct pid_namespace *pid_ns,
struct task_struct *tsk, struct taskstats *stats)
@@ -175,6 +193,9 @@ static void fill_stats(struct user_namespace *user_ns,
/* fill in extended acct fields */
xacct_add_tsk(stats, tsk);
+
+ /* add executable info */
+ exe_add_tsk(stats, tsk);
}
static int fill_stats_for_pid(pid_t pid, struct taskstats *stats)
@@ -620,6 +641,8 @@ void taskstats_exit(struct task_struct *tsk, int group_dead)
goto err;
fill_stats(&init_user_ns, &init_pid_ns, tsk, stats);
+ if (group_dead)
+ stats->ac_flag |= AGROUP;
/*
* Doesn't matter if tsk is the leader or the last group member leaving
@@ -665,6 +688,7 @@ static struct genl_family family __ro_after_init = {
.module = THIS_MODULE,
.ops = taskstats_ops,
.n_ops = ARRAY_SIZE(taskstats_ops),
+ .netnsok = true,
};
/* Needed early in initialization */
diff --git a/kernel/time/posix-cpu-timers.c b/kernel/time/posix-cpu-timers.c
index 0a97193984db..cb925e8ef9a8 100644
--- a/kernel/time/posix-cpu-timers.c
+++ b/kernel/time/posix-cpu-timers.c
@@ -870,7 +870,7 @@ static inline void check_dl_overrun(struct task_struct *tsk)
{
if (tsk->dl.dl_overrun) {
tsk->dl.dl_overrun = 0;
- __group_send_sig_info(SIGXCPU, SEND_SIG_PRIV, tsk);
+ send_signal_locked(SIGXCPU, SEND_SIG_PRIV, tsk, PIDTYPE_TGID);
}
}
@@ -884,7 +884,7 @@ static bool check_rlimit(u64 time, u64 limit, int signo, bool rt, bool hard)
rt ? "RT" : "CPU", hard ? "hard" : "soft",
current->comm, task_pid_nr(current));
}
- __group_send_sig_info(signo, SEND_SIG_PRIV, current);
+ send_signal_locked(signo, SEND_SIG_PRIV, current, PIDTYPE_TGID);
return true;
}
@@ -958,7 +958,7 @@ static void check_cpu_itimer(struct task_struct *tsk, struct cpu_itimer *it,
trace_itimer_expire(signo == SIGPROF ?
ITIMER_PROF : ITIMER_VIRTUAL,
task_tgid(tsk), cur_time);
- __group_send_sig_info(signo, SEND_SIG_PRIV, tsk);
+ send_signal_locked(signo, SEND_SIG_PRIV, tsk, PIDTYPE_TGID);
}
if (it->expires && it->expires < *expires)
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
index 4ab9949772d5..8e4b3c32fcf9 100644
--- a/kernel/time/timekeeping.c
+++ b/kernel/time/timekeeping.c
@@ -17,6 +17,7 @@
#include <linux/clocksource.h>
#include <linux/jiffies.h>
#include <linux/time.h>
+#include <linux/timex.h>
#include <linux/tick.h>
#include <linux/stop_machine.h>
#include <linux/pvclock_gtod.h>
@@ -2397,6 +2398,20 @@ static int timekeeping_validate_timex(const struct __kernel_timex *txc)
return 0;
}
+/**
+ * random_get_entropy_fallback - Returns the raw clock source value,
+ * used by random.c for platforms with no valid random_get_entropy().
+ */
+unsigned long random_get_entropy_fallback(void)
+{
+ struct tk_read_base *tkr = &tk_core.timekeeper.tkr_mono;
+ struct clocksource *clock = READ_ONCE(tkr->clock);
+
+ if (unlikely(timekeeping_suspended || !clock))
+ return 0;
+ return clock->read(clock);
+}
+EXPORT_SYMBOL_GPL(random_get_entropy_fallback);
/**
* do_adjtimex() - Accessor function to NTP __do_adjtimex function
diff --git a/kernel/time/timer.c b/kernel/time/timer.c
index a0666d948147..717fcb9fb14a 100644
--- a/kernel/time/timer.c
+++ b/kernel/time/timer.c
@@ -1833,8 +1833,6 @@ void update_process_times(int user_tick)
{
struct task_struct *p = current;
- PRANDOM_ADD_NOISE(jiffies, user_tick, p, 0);
-
/* Note: this timer irq context must be accounted for as well. */
account_process_tick(p, user_tick);
run_local_timers();
diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile
index d77cd8032213..0d261774d6f3 100644
--- a/kernel/trace/Makefile
+++ b/kernel/trace/Makefile
@@ -31,6 +31,10 @@ ifdef CONFIG_GCOV_PROFILE_FTRACE
GCOV_PROFILE := y
endif
+# Functions in this file could be invoked from early interrupt
+# code and produce random code coverage.
+KCOV_INSTRUMENT_trace_preemptirq.o := n
+
CFLAGS_bpf_trace.o := -I$(src)
CFLAGS_trace_benchmark.o := -I$(src)
diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
index d8553f46caa2..10b157a6d73e 100644
--- a/kernel/trace/bpf_trace.c
+++ b/kernel/trace/bpf_trace.c
@@ -129,7 +129,10 @@ unsigned int trace_call_bpf(struct trace_event_call *call, void *ctx)
* out of events when it was updated in between this and the
* rcu_dereference() which is accepted risk.
*/
- ret = BPF_PROG_RUN_ARRAY(call->prog_array, ctx, bpf_prog_run);
+ rcu_read_lock();
+ ret = bpf_prog_run_array(rcu_dereference(call->prog_array),
+ ctx, bpf_prog_run);
+ rcu_read_unlock();
out:
__this_cpu_dec(bpf_prog_active);
@@ -1088,6 +1091,21 @@ static const struct bpf_func_proto bpf_get_attach_cookie_proto_pe = {
.arg1_type = ARG_PTR_TO_CTX,
};
+BPF_CALL_1(bpf_get_attach_cookie_tracing, void *, ctx)
+{
+ struct bpf_trace_run_ctx *run_ctx;
+
+ run_ctx = container_of(current->bpf_ctx, struct bpf_trace_run_ctx, run_ctx);
+ return run_ctx->bpf_cookie;
+}
+
+static const struct bpf_func_proto bpf_get_attach_cookie_proto_tracing = {
+ .func = bpf_get_attach_cookie_tracing,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_CTX,
+};
+
BPF_CALL_3(bpf_get_branch_snapshot, void *, buf, u32, size, u64, flags)
{
#ifndef CONFIG_X86
@@ -1179,6 +1197,8 @@ bpf_tracing_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
return &bpf_map_pop_elem_proto;
case BPF_FUNC_map_peek_elem:
return &bpf_map_peek_elem_proto;
+ case BPF_FUNC_map_lookup_percpu_elem:
+ return &bpf_map_lookup_percpu_elem_proto;
case BPF_FUNC_ktime_get_ns:
return &bpf_ktime_get_ns_proto;
case BPF_FUNC_ktime_get_boot_ns:
@@ -1685,6 +1705,8 @@ tracing_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
return &bpf_skc_to_udp6_sock_proto;
case BPF_FUNC_skc_to_unix_sock:
return &bpf_skc_to_unix_sock_proto;
+ case BPF_FUNC_skc_to_mptcp_sock:
+ return &bpf_skc_to_mptcp_sock_proto;
case BPF_FUNC_sk_storage_get:
return &bpf_sk_storage_get_tracing_proto;
case BPF_FUNC_sk_storage_delete:
@@ -1716,6 +1738,8 @@ tracing_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
return bpf_prog_has_trampoline(prog) ? &bpf_get_func_ret_proto : NULL;
case BPF_FUNC_get_func_arg_cnt:
return bpf_prog_has_trampoline(prog) ? &bpf_get_func_arg_cnt_proto : NULL;
+ case BPF_FUNC_get_attach_cookie:
+ return bpf_prog_has_trampoline(prog) ? &bpf_get_attach_cookie_proto_tracing : NULL;
default:
fn = raw_tp_prog_func_proto(func_id, prog);
if (!fn && prog->expected_attach_type == BPF_TRACE_ITER)
@@ -2226,6 +2250,59 @@ struct bpf_kprobe_multi_run_ctx {
unsigned long entry_ip;
};
+struct user_syms {
+ const char **syms;
+ char *buf;
+};
+
+static int copy_user_syms(struct user_syms *us, unsigned long __user *usyms, u32 cnt)
+{
+ unsigned long __user usymbol;
+ const char **syms = NULL;
+ char *buf = NULL, *p;
+ int err = -ENOMEM;
+ unsigned int i;
+
+ syms = kvmalloc(cnt * sizeof(*syms), GFP_KERNEL);
+ if (!syms)
+ goto error;
+
+ buf = kvmalloc(cnt * KSYM_NAME_LEN, GFP_KERNEL);
+ if (!buf)
+ goto error;
+
+ for (p = buf, i = 0; i < cnt; i++) {
+ if (__get_user(usymbol, usyms + i)) {
+ err = -EFAULT;
+ goto error;
+ }
+ err = strncpy_from_user(p, (const char __user *) usymbol, KSYM_NAME_LEN);
+ if (err == KSYM_NAME_LEN)
+ err = -E2BIG;
+ if (err < 0)
+ goto error;
+ syms[i] = p;
+ p += err + 1;
+ }
+
+ us->syms = syms;
+ us->buf = buf;
+ return 0;
+
+error:
+ if (err) {
+ kvfree(syms);
+ kvfree(buf);
+ }
+ return err;
+}
+
+static void free_user_syms(struct user_syms *us)
+{
+ kvfree(us->syms);
+ kvfree(us->buf);
+}
+
static void bpf_kprobe_multi_link_release(struct bpf_link *link)
{
struct bpf_kprobe_multi_link *kmulti_link;
@@ -2254,15 +2331,13 @@ static void bpf_kprobe_multi_cookie_swap(void *a, void *b, int size, const void
const struct bpf_kprobe_multi_link *link = priv;
unsigned long *addr_a = a, *addr_b = b;
u64 *cookie_a, *cookie_b;
- unsigned long tmp1;
- u64 tmp2;
cookie_a = link->cookies + (addr_a - link->addrs);
cookie_b = link->cookies + (addr_b - link->addrs);
/* swap addr_a/addr_b and cookie_a/cookie_b values */
- tmp1 = *addr_a; *addr_a = *addr_b; *addr_b = tmp1;
- tmp2 = *cookie_a; *cookie_a = *cookie_b; *cookie_b = tmp2;
+ swap(*addr_a, *addr_b);
+ swap(*cookie_a, *cookie_b);
}
static int __bpf_kprobe_multi_cookie_cmp(const void *a, const void *b)
@@ -2348,53 +2423,12 @@ kprobe_multi_link_handler(struct fprobe *fp, unsigned long entry_ip,
kprobe_multi_link_prog_run(link, entry_ip, regs);
}
-static int
-kprobe_multi_resolve_syms(const void __user *usyms, u32 cnt,
- unsigned long *addrs)
+static int symbols_cmp(const void *a, const void *b)
{
- unsigned long addr, size;
- const char __user **syms;
- int err = -ENOMEM;
- unsigned int i;
- char *func;
-
- size = cnt * sizeof(*syms);
- syms = kvzalloc(size, GFP_KERNEL);
- if (!syms)
- return -ENOMEM;
-
- func = kmalloc(KSYM_NAME_LEN, GFP_KERNEL);
- if (!func)
- goto error;
-
- if (copy_from_user(syms, usyms, size)) {
- err = -EFAULT;
- goto error;
- }
-
- for (i = 0; i < cnt; i++) {
- err = strncpy_from_user(func, syms[i], KSYM_NAME_LEN);
- if (err == KSYM_NAME_LEN)
- err = -E2BIG;
- if (err < 0)
- goto error;
- err = -EINVAL;
- addr = kallsyms_lookup_name(func);
- if (!addr)
- goto error;
- if (!kallsyms_lookup_size_offset(addr, &size, NULL))
- goto error;
- addr = ftrace_location_range(addr, addr + size - 1);
- if (!addr)
- goto error;
- addrs[i] = addr;
- }
+ const char **str_a = (const char **) a;
+ const char **str_b = (const char **) b;
- err = 0;
-error:
- kvfree(syms);
- kfree(func);
- return err;
+ return strcmp(*str_a, *str_b);
}
int bpf_kprobe_multi_link_attach(const union bpf_attr *attr, struct bpf_prog *prog)
@@ -2440,7 +2474,15 @@ int bpf_kprobe_multi_link_attach(const union bpf_attr *attr, struct bpf_prog *pr
goto error;
}
} else {
- err = kprobe_multi_resolve_syms(usyms, cnt, addrs);
+ struct user_syms us;
+
+ err = copy_user_syms(&us, usyms, cnt);
+ if (err)
+ goto error;
+
+ sort(us.syms, cnt, sizeof(*us.syms), symbols_cmp, NULL);
+ err = ftrace_lookup_symbols(us.syms, cnt, addrs);
+ free_user_syms(&us);
if (err)
goto error;
}
diff --git a/kernel/trace/fgraph.c b/kernel/trace/fgraph.c
index 3fd5284f6487..218cd95bf8e4 100644
--- a/kernel/trace/fgraph.c
+++ b/kernel/trace/fgraph.c
@@ -30,6 +30,7 @@ int ftrace_graph_active;
/* Both enabled by default (can be cleared by function_graph tracer flags */
static bool fgraph_sleep_time = true;
+#ifdef CONFIG_DYNAMIC_FTRACE
/*
* archs can override this function if they must do something
* to enable hook for graph tracer.
@@ -47,6 +48,7 @@ int __weak ftrace_disable_ftrace_graph_caller(void)
{
return 0;
}
+#endif
/**
* ftrace_graph_stop - set to permanently disable function graph tracing
diff --git a/kernel/trace/fprobe.c b/kernel/trace/fprobe.c
index 89d9f994ebb0..aac63ca9c3d1 100644
--- a/kernel/trace/fprobe.c
+++ b/kernel/trace/fprobe.c
@@ -85,39 +85,31 @@ static void fprobe_exit_handler(struct rethook_node *rh, void *data,
}
NOKPROBE_SYMBOL(fprobe_exit_handler);
+static int symbols_cmp(const void *a, const void *b)
+{
+ const char **str_a = (const char **) a;
+ const char **str_b = (const char **) b;
+
+ return strcmp(*str_a, *str_b);
+}
+
/* Convert ftrace location address from symbols */
static unsigned long *get_ftrace_locations(const char **syms, int num)
{
- unsigned long addr, size;
unsigned long *addrs;
- int i;
/* Convert symbols to symbol address */
addrs = kcalloc(num, sizeof(*addrs), GFP_KERNEL);
if (!addrs)
return ERR_PTR(-ENOMEM);
- for (i = 0; i < num; i++) {
- addr = kallsyms_lookup_name(syms[i]);
- if (!addr) /* Maybe wrong symbol */
- goto error;
-
- /* Convert symbol address to ftrace location. */
- if (!kallsyms_lookup_size_offset(addr, &size, NULL) || !size)
- goto error;
+ /* ftrace_lookup_symbols expects sorted symbols */
+ sort(syms, num, sizeof(*syms), symbols_cmp, NULL);
- addr = ftrace_location_range(addr, addr + size - 1);
- if (!addr) /* No dynamic ftrace there. */
- goto error;
+ if (!ftrace_lookup_symbols(syms, num, addrs))
+ return addrs;
- addrs[i] = addr;
- }
-
- return addrs;
-
-error:
kfree(addrs);
-
return ERR_PTR(-ENOENT);
}
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index af899b058c8d..e750fe141a60 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -45,6 +45,8 @@
#include "trace_output.h"
#include "trace_stat.h"
+#define FTRACE_INVALID_FUNCTION "__ftrace_invalid_address__"
+
#define FTRACE_WARN_ON(cond) \
({ \
int ___r = cond; \
@@ -86,7 +88,7 @@ struct ftrace_ops ftrace_list_end __read_mostly = {
/* ftrace_enabled is a method to turn ftrace on or off */
int ftrace_enabled __read_mostly;
-static int last_ftrace_enabled;
+static int __maybe_unused last_ftrace_enabled;
/* Current function tracing op */
struct ftrace_ops *function_trace_op __read_mostly = &ftrace_list_end;
@@ -119,7 +121,7 @@ struct ftrace_ops __rcu *ftrace_ops_list __read_mostly = &ftrace_list_end;
ftrace_func_t ftrace_trace_function __read_mostly = ftrace_stub;
struct ftrace_ops global_ops;
-/* Defined by vmlinux.lds.h see the commment above arch_ftrace_ops_list_func for details */
+/* Defined by vmlinux.lds.h see the comment above arch_ftrace_ops_list_func for details */
void ftrace_ops_list_func(unsigned long ip, unsigned long parent_ip,
struct ftrace_ops *op, struct ftrace_regs *fregs);
@@ -952,7 +954,6 @@ static struct tracer_stat function_stats __initdata = {
static __init void ftrace_profile_tracefs(struct dentry *d_tracer)
{
struct ftrace_profile_stat *stat;
- struct dentry *entry;
char *name;
int ret;
int cpu;
@@ -983,11 +984,9 @@ static __init void ftrace_profile_tracefs(struct dentry *d_tracer)
}
}
- entry = tracefs_create_file("function_profile_enabled",
- TRACE_MODE_WRITE, d_tracer, NULL,
- &ftrace_profile_fops);
- if (!entry)
- pr_warn("Could not create tracefs 'function_profile_enabled' entry\n");
+ trace_create_file("function_profile_enabled",
+ TRACE_MODE_WRITE, d_tracer, NULL,
+ &ftrace_profile_fops);
}
#else /* CONFIG_FUNCTION_PROFILER */
@@ -2707,18 +2706,16 @@ ftrace_nop_initialize(struct module *mod, struct dyn_ftrace *rec)
* archs can override this function if they must do something
* before the modifying code is performed.
*/
-int __weak ftrace_arch_code_modify_prepare(void)
+void __weak ftrace_arch_code_modify_prepare(void)
{
- return 0;
}
/*
* archs can override this function if they must do something
* after the modifying code is performed.
*/
-int __weak ftrace_arch_code_modify_post_process(void)
+void __weak ftrace_arch_code_modify_post_process(void)
{
- return 0;
}
void ftrace_modify_all_code(int command)
@@ -2804,12 +2801,7 @@ void __weak arch_ftrace_update_code(int command)
static void ftrace_run_update_code(int command)
{
- int ret;
-
- ret = ftrace_arch_code_modify_prepare();
- FTRACE_WARN_ON(ret);
- if (ret)
- return;
+ ftrace_arch_code_modify_prepare();
/*
* By default we use stop_machine() to modify the code.
@@ -2819,8 +2811,7 @@ static void ftrace_run_update_code(int command)
*/
arch_ftrace_update_code(command);
- ret = ftrace_arch_code_modify_post_process();
- FTRACE_WARN_ON(ret);
+ ftrace_arch_code_modify_post_process();
}
static void ftrace_run_modify_code(struct ftrace_ops *ops, int command,
@@ -3065,40 +3056,6 @@ int ftrace_shutdown(struct ftrace_ops *ops, int command)
return 0;
}
-static void ftrace_startup_sysctl(void)
-{
- int command;
-
- if (unlikely(ftrace_disabled))
- return;
-
- /* Force update next time */
- saved_ftrace_func = NULL;
- /* ftrace_start_up is true if we want ftrace running */
- if (ftrace_start_up) {
- command = FTRACE_UPDATE_CALLS;
- if (ftrace_graph_active)
- command |= FTRACE_START_FUNC_RET;
- ftrace_startup_enable(command);
- }
-}
-
-static void ftrace_shutdown_sysctl(void)
-{
- int command;
-
- if (unlikely(ftrace_disabled))
- return;
-
- /* ftrace_start_up is true if ftrace is running */
- if (ftrace_start_up) {
- command = FTRACE_DISABLE_CALLS;
- if (ftrace_graph_active)
- command |= FTRACE_STOP_FUNC_RET;
- ftrace_run_update_code(command);
- }
-}
-
static u64 ftrace_update_time;
unsigned long ftrace_update_tot_cnt;
unsigned long ftrace_number_of_pages;
@@ -3665,6 +3622,105 @@ static void add_trampoline_func(struct seq_file *m, struct ftrace_ops *ops,
seq_printf(m, " ->%pS", ptr);
}
+#ifdef FTRACE_MCOUNT_MAX_OFFSET
+/*
+ * Weak functions can still have an mcount/fentry that is saved in
+ * the __mcount_loc section. These can be detected by having a
+ * symbol offset of greater than FTRACE_MCOUNT_MAX_OFFSET, as the
+ * symbol found by kallsyms is not the function that the mcount/fentry
+ * is part of. The offset is much greater in these cases.
+ *
+ * Test the record to make sure that the ip points to a valid kallsyms
+ * and if not, mark it disabled.
+ */
+static int test_for_valid_rec(struct dyn_ftrace *rec)
+{
+ char str[KSYM_SYMBOL_LEN];
+ unsigned long offset;
+ const char *ret;
+
+ ret = kallsyms_lookup(rec->ip, NULL, &offset, NULL, str);
+
+ /* Weak functions can cause invalid addresses */
+ if (!ret || offset > FTRACE_MCOUNT_MAX_OFFSET) {
+ rec->flags |= FTRACE_FL_DISABLED;
+ return 0;
+ }
+ return 1;
+}
+
+static struct workqueue_struct *ftrace_check_wq __initdata;
+static struct work_struct ftrace_check_work __initdata;
+
+/*
+ * Scan all the mcount/fentry entries to make sure they are valid.
+ */
+static __init void ftrace_check_work_func(struct work_struct *work)
+{
+ struct ftrace_page *pg;
+ struct dyn_ftrace *rec;
+
+ mutex_lock(&ftrace_lock);
+ do_for_each_ftrace_rec(pg, rec) {
+ test_for_valid_rec(rec);
+ } while_for_each_ftrace_rec();
+ mutex_unlock(&ftrace_lock);
+}
+
+static int __init ftrace_check_for_weak_functions(void)
+{
+ INIT_WORK(&ftrace_check_work, ftrace_check_work_func);
+
+ ftrace_check_wq = alloc_workqueue("ftrace_check_wq", WQ_UNBOUND, 0);
+
+ queue_work(ftrace_check_wq, &ftrace_check_work);
+ return 0;
+}
+
+static int __init ftrace_check_sync(void)
+{
+ /* Make sure the ftrace_check updates are finished */
+ if (ftrace_check_wq)
+ destroy_workqueue(ftrace_check_wq);
+ return 0;
+}
+
+late_initcall_sync(ftrace_check_sync);
+subsys_initcall(ftrace_check_for_weak_functions);
+
+static int print_rec(struct seq_file *m, unsigned long ip)
+{
+ unsigned long offset;
+ char str[KSYM_SYMBOL_LEN];
+ char *modname;
+ const char *ret;
+
+ ret = kallsyms_lookup(ip, NULL, &offset, &modname, str);
+ /* Weak functions can cause invalid addresses */
+ if (!ret || offset > FTRACE_MCOUNT_MAX_OFFSET) {
+ snprintf(str, KSYM_SYMBOL_LEN, "%s_%ld",
+ FTRACE_INVALID_FUNCTION, offset);
+ ret = NULL;
+ }
+
+ seq_puts(m, str);
+ if (modname)
+ seq_printf(m, " [%s]", modname);
+ return ret == NULL ? -1 : 0;
+}
+#else
+static inline int test_for_valid_rec(struct dyn_ftrace *rec)
+{
+ return 1;
+}
+
+static inline int print_rec(struct seq_file *m, unsigned long ip)
+{
+ seq_printf(m, "%ps", (void *)ip);
+ return 0;
+}
+#endif
+
static int t_show(struct seq_file *m, void *v)
{
struct ftrace_iterator *iter = m->private;
@@ -3689,7 +3745,13 @@ static int t_show(struct seq_file *m, void *v)
if (!rec)
return 0;
- seq_printf(m, "%ps", (void *)rec->ip);
+ if (print_rec(m, rec->ip)) {
+ /* This should only happen when a rec is disabled */
+ WARN_ON_ONCE(!(rec->flags & FTRACE_FL_DISABLED));
+ seq_putc(m, '\n');
+ return 0;
+ }
+
if (iter->flags & FTRACE_ITER_ENABLED) {
struct ftrace_ops *ops;
@@ -4007,6 +4069,24 @@ add_rec_by_index(struct ftrace_hash *hash, struct ftrace_glob *func_g,
return 0;
}
+#ifdef FTRACE_MCOUNT_MAX_OFFSET
+static int lookup_ip(unsigned long ip, char **modname, char *str)
+{
+ unsigned long offset;
+
+ kallsyms_lookup(ip, NULL, &offset, modname, str);
+ if (offset > FTRACE_MCOUNT_MAX_OFFSET)
+ return -1;
+ return 0;
+}
+#else
+static int lookup_ip(unsigned long ip, char **modname, char *str)
+{
+ kallsyms_lookup(ip, NULL, NULL, modname, str);
+ return 0;
+}
+#endif
+
static int
ftrace_match_record(struct dyn_ftrace *rec, struct ftrace_glob *func_g,
struct ftrace_glob *mod_g, int exclude_mod)
@@ -4014,7 +4094,12 @@ ftrace_match_record(struct dyn_ftrace *rec, struct ftrace_glob *func_g,
char str[KSYM_SYMBOL_LEN];
char *modname;
- kallsyms_lookup(rec->ip, NULL, NULL, &modname, str);
+ if (lookup_ip(rec->ip, &modname, str)) {
+ /* This should only happen when a rec is disabled */
+ WARN_ON_ONCE(system_state == SYSTEM_RUNNING &&
+ !(rec->flags & FTRACE_FL_DISABLED));
+ return 0;
+ }
if (mod_g) {
int mod_matches = (modname) ? ftrace_match(modname, mod_g) : 0;
@@ -4465,7 +4550,7 @@ int ftrace_func_mapper_add_ip(struct ftrace_func_mapper *mapper,
* @ip: The instruction pointer address to remove the data from
*
* Returns the data if it is found, otherwise NULL.
- * Note, if the data pointer is used as the data itself, (see
+ * Note, if the data pointer is used as the data itself, (see
* ftrace_func_mapper_find_ip(), then the return value may be meaningless,
* if the data pointer was set to zero.
*/
@@ -4560,8 +4645,8 @@ register_ftrace_function_probe(char *glob, struct trace_array *tr,
struct ftrace_probe_ops *probe_ops,
void *data)
{
+ struct ftrace_func_probe *probe = NULL, *iter;
struct ftrace_func_entry *entry;
- struct ftrace_func_probe *probe;
struct ftrace_hash **orig_hash;
struct ftrace_hash *old_hash;
struct ftrace_hash *hash;
@@ -4580,11 +4665,13 @@ register_ftrace_function_probe(char *glob, struct trace_array *tr,
mutex_lock(&ftrace_lock);
/* Check if the probe_ops is already registered */
- list_for_each_entry(probe, &tr->func_probes, list) {
- if (probe->probe_ops == probe_ops)
+ list_for_each_entry(iter, &tr->func_probes, list) {
+ if (iter->probe_ops == probe_ops) {
+ probe = iter;
break;
+ }
}
- if (&probe->list == &tr->func_probes) {
+ if (!probe) {
probe = kzalloc(sizeof(*probe), GFP_KERNEL);
if (!probe) {
mutex_unlock(&ftrace_lock);
@@ -4702,9 +4789,9 @@ int
unregister_ftrace_function_probe_func(char *glob, struct trace_array *tr,
struct ftrace_probe_ops *probe_ops)
{
+ struct ftrace_func_probe *probe = NULL, *iter;
struct ftrace_ops_hash old_hash_ops;
struct ftrace_func_entry *entry;
- struct ftrace_func_probe *probe;
struct ftrace_glob func_g;
struct ftrace_hash **orig_hash;
struct ftrace_hash *old_hash;
@@ -4732,11 +4819,13 @@ unregister_ftrace_function_probe_func(char *glob, struct trace_array *tr,
mutex_lock(&ftrace_lock);
/* Check if the probe_ops is already registered */
- list_for_each_entry(probe, &tr->func_probes, list) {
- if (probe->probe_ops == probe_ops)
+ list_for_each_entry(iter, &tr->func_probes, list) {
+ if (iter->probe_ops == probe_ops) {
+ probe = iter;
break;
+ }
}
- if (&probe->list == &tr->func_probes)
+ if (!probe)
goto err_unlock_ftrace;
ret = -EINVAL;
@@ -5195,8 +5284,6 @@ int register_ftrace_direct(unsigned long ip, unsigned long addr)
goto out_unlock;
ret = ftrace_set_filter_ip(&direct_ops, ip, 0, 0);
- if (ret)
- remove_hash_entry(direct_functions, entry);
if (!ret && !(direct_ops.flags & FTRACE_OPS_FL_ENABLED)) {
ret = register_ftrace_function(&direct_ops);
@@ -5205,6 +5292,7 @@ int register_ftrace_direct(unsigned long ip, unsigned long addr)
}
if (ret) {
+ remove_hash_entry(direct_functions, entry);
kfree(entry);
if (!direct->count) {
list_del_rcu(&direct->next);
@@ -6827,6 +6915,13 @@ void ftrace_module_enable(struct module *mod)
!within_module_init(rec->ip, mod))
break;
+ /* Weak functions should still be ignored */
+ if (!test_for_valid_rec(rec)) {
+ /* Clear all other flags. Should not be enabled anyway */
+ rec->flags = FTRACE_FL_DISABLED;
+ continue;
+ }
+
cnt = 0;
/*
@@ -6863,11 +6958,16 @@ void ftrace_module_enable(struct module *mod)
void ftrace_module_init(struct module *mod)
{
+ int ret;
+
if (ftrace_disabled || !mod->num_ftrace_callsites)
return;
- ftrace_process_locs(mod, mod->ftrace_callsites,
- mod->ftrace_callsites + mod->num_ftrace_callsites);
+ ret = ftrace_process_locs(mod, mod->ftrace_callsites,
+ mod->ftrace_callsites + mod->num_ftrace_callsites);
+ if (ret)
+ pr_warn("ftrace: failed to allocate entries for module '%s' functions\n",
+ mod->name);
}
static void save_ftrace_mod_rec(struct ftrace_mod_map *mod_map,
@@ -7200,15 +7300,19 @@ void __init ftrace_init(void)
pr_info("ftrace: allocating %ld entries in %ld pages\n",
count, count / ENTRIES_PER_PAGE + 1);
- last_ftrace_enabled = ftrace_enabled = 1;
-
ret = ftrace_process_locs(NULL,
__start_mcount_loc,
__stop_mcount_loc);
+ if (ret) {
+ pr_warn("ftrace: failed to allocate entries for functions\n");
+ goto failed;
+ }
pr_info("ftrace: allocated %ld pages with %ld groups\n",
ftrace_number_of_pages, ftrace_number_of_groups);
+ last_ftrace_enabled = ftrace_enabled = 1;
+
set_ftrace_early_filters();
return;
@@ -7267,9 +7371,6 @@ core_initcall(ftrace_nodyn_init);
static inline int ftrace_init_dyn_tracefs(struct dentry *d_tracer) { return 0; }
static inline void ftrace_startup_all(int command) { }
-# define ftrace_startup_sysctl() do { } while (0)
-# define ftrace_shutdown_sysctl() do { } while (0)
-
static void ftrace_update_trampoline(struct ftrace_ops *ops)
{
}
@@ -7909,6 +8010,109 @@ int unregister_ftrace_function(struct ftrace_ops *ops)
}
EXPORT_SYMBOL_GPL(unregister_ftrace_function);
+static int symbols_cmp(const void *a, const void *b)
+{
+ const char **str_a = (const char **) a;
+ const char **str_b = (const char **) b;
+
+ return strcmp(*str_a, *str_b);
+}
+
+struct kallsyms_data {
+ unsigned long *addrs;
+ const char **syms;
+ size_t cnt;
+ size_t found;
+};
+
+static int kallsyms_callback(void *data, const char *name,
+ struct module *mod, unsigned long addr)
+{
+ struct kallsyms_data *args = data;
+
+ if (!bsearch(&name, args->syms, args->cnt, sizeof(*args->syms), symbols_cmp))
+ return 0;
+
+ addr = ftrace_location(addr);
+ if (!addr)
+ return 0;
+
+ args->addrs[args->found++] = addr;
+ return args->found == args->cnt ? 1 : 0;
+}
+
+/**
+ * ftrace_lookup_symbols - Lookup addresses for array of symbols
+ *
+ * @sorted_syms: array of symbols pointers symbols to resolve,
+ * must be alphabetically sorted
+ * @cnt: number of symbols/addresses in @syms/@addrs arrays
+ * @addrs: array for storing resulting addresses
+ *
+ * This function looks up addresses for array of symbols provided in
+ * @syms array (must be alphabetically sorted) and stores them in
+ * @addrs array, which needs to be big enough to store at least @cnt
+ * addresses.
+ *
+ * This function returns 0 if all provided symbols are found,
+ * -ESRCH otherwise.
+ */
+int ftrace_lookup_symbols(const char **sorted_syms, size_t cnt, unsigned long *addrs)
+{
+ struct kallsyms_data args;
+ int err;
+
+ args.addrs = addrs;
+ args.syms = sorted_syms;
+ args.cnt = cnt;
+ args.found = 0;
+ err = kallsyms_on_each_symbol(kallsyms_callback, &args);
+ if (err < 0)
+ return err;
+ return args.found == args.cnt ? 0 : -ESRCH;
+}
+
+#ifdef CONFIG_SYSCTL
+
+#ifdef CONFIG_DYNAMIC_FTRACE
+static void ftrace_startup_sysctl(void)
+{
+ int command;
+
+ if (unlikely(ftrace_disabled))
+ return;
+
+ /* Force update next time */
+ saved_ftrace_func = NULL;
+ /* ftrace_start_up is true if we want ftrace running */
+ if (ftrace_start_up) {
+ command = FTRACE_UPDATE_CALLS;
+ if (ftrace_graph_active)
+ command |= FTRACE_START_FUNC_RET;
+ ftrace_startup_enable(command);
+ }
+}
+
+static void ftrace_shutdown_sysctl(void)
+{
+ int command;
+
+ if (unlikely(ftrace_disabled))
+ return;
+
+ /* ftrace_start_up is true if ftrace is running */
+ if (ftrace_start_up) {
+ command = FTRACE_DISABLE_CALLS;
+ if (ftrace_graph_active)
+ command |= FTRACE_STOP_FUNC_RET;
+ ftrace_run_update_code(command);
+ }
+}
+#else
+# define ftrace_startup_sysctl() do { } while (0)
+# define ftrace_shutdown_sysctl() do { } while (0)
+#endif /* CONFIG_DYNAMIC_FTRACE */
+
static bool is_permanent_ops_registered(void)
{
struct ftrace_ops *op;
@@ -7921,7 +8125,7 @@ static bool is_permanent_ops_registered(void)
return false;
}
-int
+static int
ftrace_enable_sysctl(struct ctl_table *table, int write,
void *buffer, size_t *lenp, loff_t *ppos)
{
@@ -7964,3 +8168,22 @@ ftrace_enable_sysctl(struct ctl_table *table, int write,
mutex_unlock(&ftrace_lock);
return ret;
}
+
+static struct ctl_table ftrace_sysctls[] = {
+ {
+ .procname = "ftrace_enabled",
+ .data = &ftrace_enabled,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = ftrace_enable_sysctl,
+ },
+ {}
+};
+
+static int __init ftrace_sysctl_init(void)
+{
+ register_sysctl_init("kernel", ftrace_sysctls);
+ return 0;
+}
+late_initcall(ftrace_sysctl_init);
+#endif
diff --git a/kernel/trace/pid_list.c b/kernel/trace/pid_list.c
index a2ef1d18126a..95106d02b32d 100644
--- a/kernel/trace/pid_list.c
+++ b/kernel/trace/pid_list.c
@@ -118,9 +118,9 @@ static inline unsigned int pid_join(unsigned int upper1,
/**
* trace_pid_list_is_set - test if the pid is set in the list
* @pid_list: The pid list to test
- * @pid: The pid to to see if set in the list.
+ * @pid: The pid to see if set in the list.
*
- * Tests if @pid is is set in the @pid_list. This is usually called
+ * Tests if @pid is set in the @pid_list. This is usually called
* from the scheduler when a task is scheduled. Its pid is checked
* if it should be traced or not.
*
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index 05dfc7a12d3d..d59b6a328b7f 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -29,6 +29,14 @@
#include <asm/local.h>
+/*
+ * The "absolute" timestamp in the buffer is only 59 bits.
+ * If a clock has the 5 MSBs set, it needs to be saved and
+ * reinserted.
+ */
+#define TS_MSB (0xf8ULL << 56)
+#define ABS_TS_MASK (~TS_MSB)
+
static void update_pages_handler(struct work_struct *work);
/*
@@ -468,6 +476,7 @@ struct rb_time_struct {
local_t cnt;
local_t top;
local_t bottom;
+ local_t msb;
};
#else
#include <asm/local64.h>
@@ -569,7 +578,6 @@ struct ring_buffer_iter {
* For the ring buffer, 64 bit required operations for the time is
* the following:
*
- * - Only need 59 bits (uses 60 to make it even).
* - Reads may fail if it interrupted a modification of the time stamp.
* It will succeed if it did not interrupt another write even if
* the read itself is interrupted by a write.
@@ -594,6 +602,7 @@ struct ring_buffer_iter {
*/
#define RB_TIME_SHIFT 30
#define RB_TIME_VAL_MASK ((1 << RB_TIME_SHIFT) - 1)
+#define RB_TIME_MSB_SHIFT 60
static inline int rb_time_cnt(unsigned long val)
{
@@ -613,7 +622,7 @@ static inline u64 rb_time_val(unsigned long top, unsigned long bottom)
static inline bool __rb_time_read(rb_time_t *t, u64 *ret, unsigned long *cnt)
{
- unsigned long top, bottom;
+ unsigned long top, bottom, msb;
unsigned long c;
/*
@@ -625,6 +634,7 @@ static inline bool __rb_time_read(rb_time_t *t, u64 *ret, unsigned long *cnt)
c = local_read(&t->cnt);
top = local_read(&t->top);
bottom = local_read(&t->bottom);
+ msb = local_read(&t->msb);
} while (c != local_read(&t->cnt));
*cnt = rb_time_cnt(top);
@@ -633,7 +643,8 @@ static inline bool __rb_time_read(rb_time_t *t, u64 *ret, unsigned long *cnt)
if (*cnt != rb_time_cnt(bottom))
return false;
- *ret = rb_time_val(top, bottom);
+ /* The shift to msb will lose its cnt bits */
+ *ret = rb_time_val(top, bottom) | ((u64)msb << RB_TIME_MSB_SHIFT);
return true;
}
@@ -649,10 +660,12 @@ static inline unsigned long rb_time_val_cnt(unsigned long val, unsigned long cnt
return (val & RB_TIME_VAL_MASK) | ((cnt & 3) << RB_TIME_SHIFT);
}
-static inline void rb_time_split(u64 val, unsigned long *top, unsigned long *bottom)
+static inline void rb_time_split(u64 val, unsigned long *top, unsigned long *bottom,
+ unsigned long *msb)
{
*top = (unsigned long)((val >> RB_TIME_SHIFT) & RB_TIME_VAL_MASK);
*bottom = (unsigned long)(val & RB_TIME_VAL_MASK);
+ *msb = (unsigned long)(val >> RB_TIME_MSB_SHIFT);
}
static inline void rb_time_val_set(local_t *t, unsigned long val, unsigned long cnt)
@@ -663,15 +676,16 @@ static inline void rb_time_val_set(local_t *t, unsigned long val, unsigned long
static void rb_time_set(rb_time_t *t, u64 val)
{
- unsigned long cnt, top, bottom;
+ unsigned long cnt, top, bottom, msb;
- rb_time_split(val, &top, &bottom);
+ rb_time_split(val, &top, &bottom, &msb);
/* Writes always succeed with a valid number even if it gets interrupted. */
do {
cnt = local_inc_return(&t->cnt);
rb_time_val_set(&t->top, top, cnt);
rb_time_val_set(&t->bottom, bottom, cnt);
+ rb_time_val_set(&t->msb, val >> RB_TIME_MSB_SHIFT, cnt);
} while (cnt != local_read(&t->cnt));
}
@@ -686,8 +700,8 @@ rb_time_read_cmpxchg(local_t *l, unsigned long expect, unsigned long set)
static int rb_time_cmpxchg(rb_time_t *t, u64 expect, u64 set)
{
- unsigned long cnt, top, bottom;
- unsigned long cnt2, top2, bottom2;
+ unsigned long cnt, top, bottom, msb;
+ unsigned long cnt2, top2, bottom2, msb2;
u64 val;
/* The cmpxchg always fails if it interrupted an update */
@@ -703,16 +717,18 @@ static int rb_time_cmpxchg(rb_time_t *t, u64 expect, u64 set)
cnt2 = cnt + 1;
- rb_time_split(val, &top, &bottom);
+ rb_time_split(val, &top, &bottom, &msb);
top = rb_time_val_cnt(top, cnt);
bottom = rb_time_val_cnt(bottom, cnt);
- rb_time_split(set, &top2, &bottom2);
+ rb_time_split(set, &top2, &bottom2, &msb2);
top2 = rb_time_val_cnt(top2, cnt2);
bottom2 = rb_time_val_cnt(bottom2, cnt2);
if (!rb_time_read_cmpxchg(&t->cnt, cnt, cnt2))
return false;
+ if (!rb_time_read_cmpxchg(&t->msb, msb, msb2))
+ return false;
if (!rb_time_read_cmpxchg(&t->top, top, top2))
return false;
if (!rb_time_read_cmpxchg(&t->bottom, bottom, bottom2))
@@ -783,6 +799,24 @@ static inline void verify_event(struct ring_buffer_per_cpu *cpu_buffer,
}
#endif
+/*
+ * The absolute time stamp drops the 5 MSBs and some clocks may
+ * require them. The rb_fix_abs_ts() will take a previous full
+ * time stamp, and add the 5 MSB of that time stamp on to the
+ * saved absolute time stamp. Then they are compared in case of
+ * the unlikely event that the latest time stamp incremented
+ * the 5 MSB.
+ */
+static inline u64 rb_fix_abs_ts(u64 abs, u64 save_ts)
+{
+ if (save_ts & TS_MSB) {
+ abs |= save_ts & TS_MSB;
+ /* Check for overflow */
+ if (unlikely(abs < save_ts))
+ abs += 1ULL << 59;
+ }
+ return abs;
+}
static inline u64 rb_time_stamp(struct trace_buffer *buffer);
@@ -811,8 +845,10 @@ u64 ring_buffer_event_time_stamp(struct trace_buffer *buffer,
u64 ts;
/* If the event includes an absolute time, then just use that */
- if (event->type_len == RINGBUF_TYPE_TIME_STAMP)
- return rb_event_time_stamp(event);
+ if (event->type_len == RINGBUF_TYPE_TIME_STAMP) {
+ ts = rb_event_time_stamp(event);
+ return rb_fix_abs_ts(ts, cpu_buffer->tail_page->page->time_stamp);
+ }
nest = local_read(&cpu_buffer->committing);
verify_event(cpu_buffer, event);
@@ -2754,8 +2790,15 @@ static void rb_add_timestamp(struct ring_buffer_per_cpu *cpu_buffer,
(RB_ADD_STAMP_FORCE | RB_ADD_STAMP_ABSOLUTE);
if (unlikely(info->delta > (1ULL << 59))) {
+ /*
+ * Some timers can use more than 59 bits, and when a timestamp
+ * is added to the buffer, it will lose those bits.
+ */
+ if (abs && (info->ts & TS_MSB)) {
+ info->delta &= ABS_TS_MASK;
+
/* did the clock go backwards */
- if (info->before == info->after && info->before > info->ts) {
+ } else if (info->before == info->after && info->before > info->ts) {
/* not interrupted */
static int once;
@@ -3304,7 +3347,7 @@ static void dump_buffer_page(struct buffer_data_page *bpage,
case RINGBUF_TYPE_TIME_STAMP:
delta = rb_event_time_stamp(event);
- ts = delta;
+ ts = rb_fix_abs_ts(delta, ts);
pr_warn(" [%lld] absolute:%lld TIME STAMP\n", ts, delta);
break;
@@ -3380,7 +3423,7 @@ static void check_buffer(struct ring_buffer_per_cpu *cpu_buffer,
case RINGBUF_TYPE_TIME_STAMP:
delta = rb_event_time_stamp(event);
- ts = delta;
+ ts = rb_fix_abs_ts(delta, ts);
break;
case RINGBUF_TYPE_PADDING:
@@ -4367,6 +4410,7 @@ rb_update_read_stamp(struct ring_buffer_per_cpu *cpu_buffer,
case RINGBUF_TYPE_TIME_STAMP:
delta = rb_event_time_stamp(event);
+ delta = rb_fix_abs_ts(delta, cpu_buffer->read_stamp);
cpu_buffer->read_stamp = delta;
return;
@@ -4397,6 +4441,7 @@ rb_update_iter_read_stamp(struct ring_buffer_iter *iter,
case RINGBUF_TYPE_TIME_STAMP:
delta = rb_event_time_stamp(event);
+ delta = rb_fix_abs_ts(delta, iter->read_stamp);
iter->read_stamp = delta;
return;
@@ -4650,6 +4695,7 @@ rb_buffer_peek(struct ring_buffer_per_cpu *cpu_buffer, u64 *ts,
case RINGBUF_TYPE_TIME_STAMP:
if (ts) {
*ts = rb_event_time_stamp(event);
+ *ts = rb_fix_abs_ts(*ts, reader->page->time_stamp);
ring_buffer_normalize_time_stamp(cpu_buffer->buffer,
cpu_buffer->cpu, ts);
}
@@ -4741,6 +4787,7 @@ rb_iter_peek(struct ring_buffer_iter *iter, u64 *ts)
case RINGBUF_TYPE_TIME_STAMP:
if (ts) {
*ts = rb_event_time_stamp(event);
+ *ts = rb_fix_abs_ts(*ts, iter->head_page->page->time_stamp);
ring_buffer_normalize_time_stamp(cpu_buffer->buffer,
cpu_buffer->cpu, ts);
}
@@ -6011,10 +6058,10 @@ static __init int test_ringbuffer(void)
pr_info(" total events: %ld\n", total_lost + total_read);
pr_info(" recorded len bytes: %ld\n", total_len);
pr_info(" recorded size bytes: %ld\n", total_size);
- if (total_lost)
+ if (total_lost) {
pr_info(" With dropped events, record len and size may not match\n"
" alloced and written from above\n");
- if (!total_lost) {
+ } else {
if (RB_WARN_ON(buffer, total_len != total_alloc ||
total_size != total_written))
break;
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 124f1897fd56..2c95992e2c71 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -721,13 +721,16 @@ int trace_pid_write(struct trace_pid_list *filtered_pids,
pos = 0;
ret = trace_get_user(&parser, ubuf, cnt, &pos);
- if (ret < 0 || !trace_parser_loaded(&parser))
+ if (ret < 0)
break;
read += ret;
ubuf += ret;
cnt -= ret;
+ if (!trace_parser_loaded(&parser))
+ break;
+
ret = -EINVAL;
if (kstrtoul(parser.buffer, 0, &val))
break;
@@ -753,7 +756,6 @@ int trace_pid_write(struct trace_pid_list *filtered_pids,
if (!nr_pids) {
/* Cleared the list of pids */
trace_pid_list_free(pid_list);
- read = ret;
pid_list = NULL;
}
@@ -1174,7 +1176,7 @@ void tracing_snapshot_cond(struct trace_array *tr, void *cond_data)
EXPORT_SYMBOL_GPL(tracing_snapshot_cond);
/**
- * tracing_snapshot_cond_data - get the user data associated with a snapshot
+ * tracing_cond_snapshot_data - get the user data associated with a snapshot
* @tr: The tracing instance
*
* When the user enables a conditional snapshot using
@@ -1542,6 +1544,7 @@ static struct {
{ ktime_get_mono_fast_ns, "mono", 1 },
{ ktime_get_raw_fast_ns, "mono_raw", 1 },
{ ktime_get_boot_fast_ns, "boot", 1 },
+ { ktime_get_tai_fast_ns, "tai", 1 },
ARCH_TRACE_CLOCKS
};
@@ -2835,7 +2838,7 @@ trace_event_buffer_lock_reserve(struct trace_buffer **current_rb,
}
EXPORT_SYMBOL_GPL(trace_event_buffer_lock_reserve);
-static DEFINE_SPINLOCK(tracepoint_iter_lock);
+static DEFINE_RAW_SPINLOCK(tracepoint_iter_lock);
static DEFINE_MUTEX(tracepoint_printk_mutex);
static void output_printk(struct trace_event_buffer *fbuffer)
@@ -2863,14 +2866,14 @@ static void output_printk(struct trace_event_buffer *fbuffer)
event = &fbuffer->trace_file->event_call->event;
- spin_lock_irqsave(&tracepoint_iter_lock, flags);
+ raw_spin_lock_irqsave(&tracepoint_iter_lock, flags);
trace_seq_init(&iter->seq);
iter->ent = fbuffer->entry;
event_call->event.funcs->trace(iter, 0, event);
trace_seq_putc(&iter->seq, 0);
printk("%s", iter->seq.buffer);
- spin_unlock_irqrestore(&tracepoint_iter_lock, flags);
+ raw_spin_unlock_irqrestore(&tracepoint_iter_lock, flags);
}
int tracepoint_printk_sysctl(struct ctl_table *table, int write,
@@ -4249,7 +4252,7 @@ static void print_func_help_header_irq(struct array_buffer *buf, struct seq_file
unsigned int flags)
{
bool tgid = flags & TRACE_ITER_RECORD_TGID;
- const char *space = " ";
+ static const char space[] = " ";
int prec = tgid ? 12 : 2;
print_event_info(buf, m);
@@ -4273,9 +4276,7 @@ print_trace_header(struct seq_file *m, struct trace_iterator *iter)
struct tracer *type = iter->trace;
unsigned long entries;
unsigned long total;
- const char *name = "preemption";
-
- name = type->name;
+ const char *name = type->name;
get_total_entries(buf, &total, &entries);
@@ -5469,7 +5470,7 @@ static const char readme_msg[] =
" error_log\t- error log for failed commands (that support it)\n"
" buffer_size_kb\t- view and modify size of per cpu buffer\n"
" buffer_total_size_kb - view total size of all cpu buffers\n\n"
- " trace_clock\t\t-change the clock used to order events\n"
+ " trace_clock\t\t- change the clock used to order events\n"
" local: Per cpu clock but may not be synced across CPUs\n"
" global: Synced across CPUs but slows tracing down.\n"
" counter: Not a clock, but just an increment\n"
@@ -5478,7 +5479,7 @@ static const char readme_msg[] =
#ifdef CONFIG_X86_64
" x86-tsc: TSC cycle counter\n"
#endif
- "\n timestamp_mode\t-view the mode used to timestamp events\n"
+ "\n timestamp_mode\t- view the mode used to timestamp events\n"
" delta: Delta difference against a buffer-wide timestamp\n"
" absolute: Absolute (standalone) timestamp\n"
"\n trace_marker\t\t- Writes into this file writes into the kernel buffer\n"
@@ -6326,12 +6327,18 @@ static void tracing_set_nop(struct trace_array *tr)
tr->current_trace = &nop_trace;
}
+static bool tracer_options_updated;
+
static void add_tracer_options(struct trace_array *tr, struct tracer *t)
{
/* Only enable if the directory has been created already. */
if (!tr->dir)
return;
+ /* Only create trace option files after update_tracer_options finish */
+ if (!tracer_options_updated)
+ return;
+
create_trace_option_files(tr, t);
}
@@ -6448,7 +6455,7 @@ tracing_set_trace_write(struct file *filp, const char __user *ubuf,
{
struct trace_array *tr = filp->private_data;
char buf[MAX_TRACER_SIZE+1];
- int i;
+ char *name;
size_t ret;
int err;
@@ -6462,11 +6469,9 @@ tracing_set_trace_write(struct file *filp, const char __user *ubuf,
buf[cnt] = 0;
- /* strip ending whitespace. */
- for (i = cnt - 1; i > 0 && isspace(buf[i]); i--)
- buf[i] = 0;
+ name = strim(buf);
- err = tracing_set_tracer(tr, buf);
+ err = tracing_set_tracer(tr, name);
if (err)
return err;
@@ -9170,6 +9175,7 @@ static void __update_tracer_options(struct trace_array *tr)
static void update_tracer_options(struct trace_array *tr)
{
mutex_lock(&trace_types_lock);
+ tracer_options_updated = true;
__update_tracer_options(tr);
mutex_unlock(&trace_types_lock);
}
@@ -9602,6 +9608,7 @@ extern struct trace_eval_map *__stop_ftrace_eval_maps[];
static struct workqueue_struct *eval_map_wq __initdata;
static struct work_struct eval_map_work __initdata;
+static struct work_struct tracerfs_init_work __initdata;
static void __init eval_map_work_func(struct work_struct *work)
{
@@ -9627,6 +9634,8 @@ static int __init trace_eval_init(void)
return 0;
}
+subsys_initcall(trace_eval_init);
+
static int __init trace_eval_sync(void)
{
/* Make sure the eval map updates are finished */
@@ -9709,15 +9718,8 @@ static struct notifier_block trace_module_nb = {
};
#endif /* CONFIG_MODULES */
-static __init int tracer_init_tracefs(void)
+static __init void tracer_init_tracefs_work_func(struct work_struct *work)
{
- int ret;
-
- trace_access_lock_init();
-
- ret = tracing_init_dentry();
- if (ret)
- return 0;
event_trace_init();
@@ -9739,8 +9741,6 @@ static __init int tracer_init_tracefs(void)
trace_create_file("saved_tgids", TRACE_MODE_READ, NULL,
NULL, &tracing_saved_tgids_fops);
- trace_eval_init();
-
trace_create_eval_file(NULL);
#ifdef CONFIG_MODULES
@@ -9755,6 +9755,24 @@ static __init int tracer_init_tracefs(void)
create_trace_instances(NULL);
update_tracer_options(&global_trace);
+}
+
+static __init int tracer_init_tracefs(void)
+{
+ int ret;
+
+ trace_access_lock_init();
+
+ ret = tracing_init_dentry();
+ if (ret)
+ return 0;
+
+ if (eval_map_wq) {
+ INIT_WORK(&tracerfs_init_work, tracer_init_tracefs_work_func);
+ queue_work(eval_map_wq, &tracerfs_init_work);
+ } else {
+ tracer_init_tracefs_work_func(NULL);
+ }
return 0;
}
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index 07d990270e2a..ff816fb41e48 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -1573,13 +1573,12 @@ struct enable_trigger_data {
};
extern int event_enable_trigger_print(struct seq_file *m,
- struct event_trigger_ops *ops,
- struct event_trigger_data *data);
-extern void event_enable_trigger_free(struct event_trigger_ops *ops,
struct event_trigger_data *data);
+extern void event_enable_trigger_free(struct event_trigger_data *data);
extern int event_enable_trigger_parse(struct event_command *cmd_ops,
struct trace_event_file *file,
- char *glob, char *cmd, char *param);
+ char *glob, char *cmd,
+ char *param_and_filter);
extern int event_enable_register_trigger(char *glob,
struct event_trigger_data *data,
struct trace_event_file *file);
@@ -1587,8 +1586,7 @@ extern void event_enable_unregister_trigger(char *glob,
struct event_trigger_data *test,
struct trace_event_file *file);
extern void trigger_data_free(struct event_trigger_data *data);
-extern int event_trigger_init(struct event_trigger_ops *ops,
- struct event_trigger_data *data);
+extern int event_trigger_init(struct event_trigger_data *data);
extern int trace_event_trigger_enable_disable(struct trace_event_file *file,
int trigger_enable);
extern void update_cond_flag(struct trace_event_file *file);
@@ -1629,10 +1627,11 @@ extern void event_trigger_reset_filter(struct event_command *cmd_ops,
extern int event_trigger_register(struct event_command *cmd_ops,
struct trace_event_file *file,
char *glob,
- char *cmd,
- char *trigger,
- struct event_trigger_data *trigger_data,
- int *n_registered);
+ struct event_trigger_data *trigger_data);
+extern void event_trigger_unregister(struct event_command *cmd_ops,
+ struct trace_event_file *file,
+ char *glob,
+ struct event_trigger_data *trigger_data);
/**
* struct event_trigger_ops - callbacks for trace event triggers
@@ -1686,12 +1685,9 @@ struct event_trigger_ops {
struct trace_buffer *buffer,
void *rec,
struct ring_buffer_event *rbe);
- int (*init)(struct event_trigger_ops *ops,
- struct event_trigger_data *data);
- void (*free)(struct event_trigger_ops *ops,
- struct event_trigger_data *data);
+ int (*init)(struct event_trigger_data *data);
+ void (*free)(struct event_trigger_data *data);
int (*print)(struct seq_file *m,
- struct event_trigger_ops *ops,
struct event_trigger_data *data);
};
diff --git a/kernel/trace/trace_boot.c b/kernel/trace/trace_boot.c
index 0580287d7a0d..778200dd8ede 100644
--- a/kernel/trace/trace_boot.c
+++ b/kernel/trace/trace_boot.c
@@ -300,7 +300,7 @@ trace_boot_hist_add_handlers(struct xbc_node *hnode, char **bufp,
{
struct xbc_node *node;
const char *p, *handler;
- int ret;
+ int ret = 0;
handler = xbc_node_get_data(hnode);
diff --git a/kernel/trace/trace_dynevent.c b/kernel/trace/trace_dynevent.c
index e34e8182ee4b..076b447a1b88 100644
--- a/kernel/trace/trace_dynevent.c
+++ b/kernel/trace/trace_dynevent.c
@@ -255,19 +255,14 @@ static const struct file_operations dynamic_events_ops = {
/* Make a tracefs interface for controlling dynamic events */
static __init int init_dynamic_event(void)
{
- struct dentry *entry;
int ret;
ret = tracing_init_dentry();
if (ret)
return 0;
- entry = tracefs_create_file("dynamic_events", TRACE_MODE_WRITE, NULL,
- NULL, &dynamic_events_ops);
-
- /* Event list interface */
- if (!entry)
- pr_warn("Could not create tracefs 'dynamic_events' entry\n");
+ trace_create_file("dynamic_events", TRACE_MODE_WRITE, NULL,
+ NULL, &dynamic_events_ops);
return 0;
}
diff --git a/kernel/trace/trace_eprobe.c b/kernel/trace/trace_eprobe.c
index 541aa13581b9..7d4478525c66 100644
--- a/kernel/trace/trace_eprobe.c
+++ b/kernel/trace/trace_eprobe.c
@@ -511,20 +511,17 @@ __eprobe_trace_func(struct eprobe_data *edata, void *rec)
* functions are just stubs to fulfill what is needed to use the trigger
* infrastructure.
*/
-static int eprobe_trigger_init(struct event_trigger_ops *ops,
- struct event_trigger_data *data)
+static int eprobe_trigger_init(struct event_trigger_data *data)
{
return 0;
}
-static void eprobe_trigger_free(struct event_trigger_ops *ops,
- struct event_trigger_data *data)
+static void eprobe_trigger_free(struct event_trigger_data *data)
{
}
static int eprobe_trigger_print(struct seq_file *m,
- struct event_trigger_ops *ops,
struct event_trigger_data *data)
{
/* Do not print eprobe event triggers */
@@ -549,7 +546,8 @@ static struct event_trigger_ops eprobe_trigger_ops = {
static int eprobe_trigger_cmd_parse(struct event_command *cmd_ops,
struct trace_event_file *file,
- char *glob, char *cmd, char *param)
+ char *glob, char *cmd,
+ char *param_and_filter)
{
return -1;
}
@@ -650,7 +648,7 @@ static struct trace_event_functions eprobe_funcs = {
static int disable_eprobe(struct trace_eprobe *ep,
struct trace_array *tr)
{
- struct event_trigger_data *trigger;
+ struct event_trigger_data *trigger = NULL, *iter;
struct trace_event_file *file;
struct eprobe_data *edata;
@@ -658,14 +656,16 @@ static int disable_eprobe(struct trace_eprobe *ep,
if (!file)
return -ENOENT;
- list_for_each_entry(trigger, &file->triggers, list) {
- if (!(trigger->flags & EVENT_TRIGGER_FL_PROBE))
+ list_for_each_entry(iter, &file->triggers, list) {
+ if (!(iter->flags & EVENT_TRIGGER_FL_PROBE))
continue;
- edata = trigger->private_data;
- if (edata->ep == ep)
+ edata = iter->private_data;
+ if (edata->ep == ep) {
+ trigger = iter;
break;
+ }
}
- if (list_entry_is_head(trigger, &file->triggers, list))
+ if (!trigger)
return -ENODEV;
list_del_rcu(&trigger->list);
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index f97de82d1342..181f08186d32 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -392,12 +392,6 @@ static void test_event_printk(struct trace_event_call *call)
if (!(dereference_flags & (1ULL << arg)))
goto next_arg;
- /* Check for __get_sockaddr */;
- if (str_has_prefix(fmt + i, "__get_sockaddr(")) {
- dereference_flags &= ~(1ULL << arg);
- goto next_arg;
- }
-
/* Find the REC-> in the argument */
c = strchr(fmt + i, ',');
r = strstr(fmt + i, "REC->");
@@ -413,7 +407,14 @@ static void test_event_printk(struct trace_event_call *call)
a = strchr(fmt + i, '&');
if ((a && (a < r)) || test_field(r, call))
dereference_flags &= ~(1ULL << arg);
+ } else if ((r = strstr(fmt + i, "__get_dynamic_array(")) &&
+ (!c || r < c)) {
+ dereference_flags &= ~(1ULL << arg);
+ } else if ((r = strstr(fmt + i, "__get_sockaddr(")) &&
+ (!c || r < c)) {
+ dereference_flags &= ~(1ULL << arg);
}
+
next_arg:
i--;
arg++;
@@ -1723,9 +1724,9 @@ static LIST_HEAD(event_subsystems);
static int subsystem_open(struct inode *inode, struct file *filp)
{
+ struct trace_subsystem_dir *dir = NULL, *iter_dir;
+ struct trace_array *tr = NULL, *iter_tr;
struct event_subsystem *system = NULL;
- struct trace_subsystem_dir *dir = NULL; /* Initialize for gcc */
- struct trace_array *tr;
int ret;
if (tracing_is_disabled())
@@ -1734,10 +1735,12 @@ static int subsystem_open(struct inode *inode, struct file *filp)
/* Make sure the system still exists */
mutex_lock(&event_mutex);
mutex_lock(&trace_types_lock);
- list_for_each_entry(tr, &ftrace_trace_arrays, list) {
- list_for_each_entry(dir, &tr->systems, list) {
- if (dir == inode->i_private) {
+ list_for_each_entry(iter_tr, &ftrace_trace_arrays, list) {
+ list_for_each_entry(iter_dir, &iter_tr->systems, list) {
+ if (iter_dir == inode->i_private) {
/* Don't open systems with no events */
+ tr = iter_tr;
+ dir = iter_dir;
if (dir->nr_events) {
__get_system_dir(dir);
system = dir->subsystem;
@@ -1753,9 +1756,6 @@ static int subsystem_open(struct inode *inode, struct file *filp)
if (!system)
return -ENODEV;
- /* Some versions of gcc think dir can be uninitialized here */
- WARN_ON(!dir);
-
/* Still need to increment the ref count of the system */
if (trace_array_get(tr) < 0) {
put_system(dir);
@@ -2280,8 +2280,8 @@ static struct dentry *
event_subsystem_dir(struct trace_array *tr, const char *name,
struct trace_event_file *file, struct dentry *parent)
{
+ struct event_subsystem *system, *iter;
struct trace_subsystem_dir *dir;
- struct event_subsystem *system;
struct dentry *entry;
/* First see if we did not already create this dir */
@@ -2295,13 +2295,13 @@ event_subsystem_dir(struct trace_array *tr, const char *name,
}
/* Now see if the system itself exists. */
- list_for_each_entry(system, &event_subsystems, list) {
- if (strcmp(system->name, name) == 0)
+ system = NULL;
+ list_for_each_entry(iter, &event_subsystems, list) {
+ if (strcmp(iter->name, name) == 0) {
+ system = iter;
break;
+ }
}
- /* Reset system variable when not found */
- if (&system->list == &event_subsystems)
- system = NULL;
dir = kmalloc(sizeof(*dir), GFP_KERNEL);
if (!dir)
@@ -3546,12 +3546,10 @@ create_event_toplevel_files(struct dentry *parent, struct trace_array *tr)
struct dentry *d_events;
struct dentry *entry;
- entry = tracefs_create_file("set_event", TRACE_MODE_WRITE, parent,
- tr, &ftrace_set_event_fops);
- if (!entry) {
- pr_warn("Could not create tracefs 'set_event' entry\n");
+ entry = trace_create_file("set_event", TRACE_MODE_WRITE, parent,
+ tr, &ftrace_set_event_fops);
+ if (!entry)
return -ENOMEM;
- }
d_events = tracefs_create_dir("events", parent);
if (!d_events) {
@@ -3566,16 +3564,12 @@ create_event_toplevel_files(struct dentry *parent, struct trace_array *tr)
/* There are not as crucial, just warn if they are not created */
- entry = tracefs_create_file("set_event_pid", TRACE_MODE_WRITE, parent,
- tr, &ftrace_set_event_pid_fops);
- if (!entry)
- pr_warn("Could not create tracefs 'set_event_pid' entry\n");
+ trace_create_file("set_event_pid", TRACE_MODE_WRITE, parent,
+ tr, &ftrace_set_event_pid_fops);
- entry = tracefs_create_file("set_event_notrace_pid",
- TRACE_MODE_WRITE, parent, tr,
- &ftrace_set_event_notrace_pid_fops);
- if (!entry)
- pr_warn("Could not create tracefs 'set_event_notrace_pid' entry\n");
+ trace_create_file("set_event_notrace_pid",
+ TRACE_MODE_WRITE, parent, tr,
+ &ftrace_set_event_notrace_pid_fops);
/* ring buffer internal formats */
trace_create_file("header_page", TRACE_MODE_READ, d_events,
@@ -3790,17 +3784,14 @@ static __init int event_trace_init_fields(void)
__init int event_trace_init(void)
{
struct trace_array *tr;
- struct dentry *entry;
int ret;
tr = top_trace_array();
if (!tr)
return -ENODEV;
- entry = tracefs_create_file("available_events", TRACE_MODE_READ,
- NULL, tr, &ftrace_avail_fops);
- if (!entry)
- pr_warn("Could not create tracefs 'available_events' entry\n");
+ trace_create_file("available_events", TRACE_MODE_READ,
+ NULL, tr, &ftrace_avail_fops);
ret = early_event_add_tracer(NULL, tr);
if (ret)
diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c
index b458a9afa2c0..4b1057ab9d96 100644
--- a/kernel/trace/trace_events_filter.c
+++ b/kernel/trace/trace_events_filter.c
@@ -1816,7 +1816,7 @@ static void create_filter_finish(struct filter_parse_error *pe)
* create_filter - create a filter for a trace_event_call
* @tr: the trace array associated with these events
* @call: trace_event_call to create a filter for
- * @filter_str: filter string
+ * @filter_string: filter string
* @set_str: remember @filter_str and enable detailed error in filter
* @filterp: out param for created filter (always updated on return)
* Must be a pointer that references a NULL pointer.
diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c
index 44db5ba9cabb..48e82e141d54 100644
--- a/kernel/trace/trace_events_hist.c
+++ b/kernel/trace/trace_events_hist.c
@@ -2093,8 +2093,11 @@ static int init_var_ref(struct hist_field *ref_field,
return err;
free:
kfree(ref_field->system);
+ ref_field->system = NULL;
kfree(ref_field->event_name);
+ ref_field->event_name = NULL;
kfree(ref_field->name);
+ ref_field->name = NULL;
goto out;
}
@@ -2785,7 +2788,8 @@ static char *find_trigger_filter(struct hist_trigger_data *hist_data,
static struct event_command trigger_hist_cmd;
static int event_hist_trigger_parse(struct event_command *cmd_ops,
struct trace_event_file *file,
- char *glob, char *cmd, char *param);
+ char *glob, char *cmd,
+ char *param_and_filter);
static bool compatible_keys(struct hist_trigger_data *target_hist_data,
struct hist_trigger_data *hist_data,
@@ -4161,7 +4165,7 @@ static int create_val_field(struct hist_trigger_data *hist_data,
return __create_val_field(hist_data, val_idx, file, NULL, field_str, 0);
}
-static const char *no_comm = "(no comm)";
+static const char no_comm[] = "(no comm)";
static u64 hist_field_execname(struct hist_field *hist_field,
struct tracing_map_elt *elt,
@@ -5252,7 +5256,7 @@ static void hist_trigger_show(struct seq_file *m,
seq_puts(m, "\n\n");
seq_puts(m, "# event histogram\n#\n# trigger info: ");
- data->ops->print(m, data->ops, data);
+ data->ops->print(m, data);
seq_puts(m, "#\n\n");
hist_data = data->private_data;
@@ -5484,7 +5488,7 @@ static void hist_trigger_debug_show(struct seq_file *m,
seq_puts(m, "\n\n");
seq_puts(m, "# event histogram\n#\n# trigger info: ");
- data->ops->print(m, data->ops, data);
+ data->ops->print(m, data);
seq_puts(m, "#\n\n");
hist_data = data->private_data;
@@ -5621,7 +5625,6 @@ static void hist_field_print(struct seq_file *m, struct hist_field *hist_field)
}
static int event_hist_trigger_print(struct seq_file *m,
- struct event_trigger_ops *ops,
struct event_trigger_data *data)
{
struct hist_trigger_data *hist_data = data->private_data;
@@ -5729,8 +5732,7 @@ static int event_hist_trigger_print(struct seq_file *m,
return 0;
}
-static int event_hist_trigger_init(struct event_trigger_ops *ops,
- struct event_trigger_data *data)
+static int event_hist_trigger_init(struct event_trigger_data *data)
{
struct hist_trigger_data *hist_data = data->private_data;
@@ -5758,8 +5760,7 @@ static void unregister_field_var_hists(struct hist_trigger_data *hist_data)
}
}
-static void event_hist_trigger_free(struct event_trigger_ops *ops,
- struct event_trigger_data *data)
+static void event_hist_trigger_free(struct event_trigger_data *data)
{
struct hist_trigger_data *hist_data = data->private_data;
@@ -5788,25 +5789,23 @@ static struct event_trigger_ops event_hist_trigger_ops = {
.free = event_hist_trigger_free,
};
-static int event_hist_trigger_named_init(struct event_trigger_ops *ops,
- struct event_trigger_data *data)
+static int event_hist_trigger_named_init(struct event_trigger_data *data)
{
data->ref++;
save_named_trigger(data->named_data->name, data);
- event_hist_trigger_init(ops, data->named_data);
+ event_hist_trigger_init(data->named_data);
return 0;
}
-static void event_hist_trigger_named_free(struct event_trigger_ops *ops,
- struct event_trigger_data *data)
+static void event_hist_trigger_named_free(struct event_trigger_data *data)
{
if (WARN_ON_ONCE(data->ref <= 0))
return;
- event_hist_trigger_free(ops, data->named_data);
+ event_hist_trigger_free(data->named_data);
data->ref--;
if (!data->ref) {
@@ -5933,6 +5932,48 @@ static bool hist_trigger_match(struct event_trigger_data *data,
return true;
}
+static bool existing_hist_update_only(char *glob,
+ struct event_trigger_data *data,
+ struct trace_event_file *file)
+{
+ struct hist_trigger_data *hist_data = data->private_data;
+ struct event_trigger_data *test, *named_data = NULL;
+ bool updated = false;
+
+ if (!hist_data->attrs->pause && !hist_data->attrs->cont &&
+ !hist_data->attrs->clear)
+ goto out;
+
+ if (hist_data->attrs->name) {
+ named_data = find_named_trigger(hist_data->attrs->name);
+ if (named_data) {
+ if (!hist_trigger_match(data, named_data, named_data,
+ true))
+ goto out;
+ }
+ }
+
+ if (hist_data->attrs->name && !named_data)
+ goto out;
+
+ list_for_each_entry(test, &file->triggers, list) {
+ if (test->cmd_ops->trigger_type == ETT_EVENT_HIST) {
+ if (!hist_trigger_match(data, test, named_data, false))
+ continue;
+ if (hist_data->attrs->pause)
+ test->paused = true;
+ else if (hist_data->attrs->cont)
+ test->paused = false;
+ else if (hist_data->attrs->clear)
+ hist_clear(test);
+ updated = true;
+ goto out;
+ }
+ }
+ out:
+ return updated;
+}
+
static int hist_register_trigger(char *glob,
struct event_trigger_data *data,
struct trace_event_file *file)
@@ -5961,19 +6002,11 @@ static int hist_register_trigger(char *glob,
list_for_each_entry(test, &file->triggers, list) {
if (test->cmd_ops->trigger_type == ETT_EVENT_HIST) {
- if (!hist_trigger_match(data, test, named_data, false))
- continue;
- if (hist_data->attrs->pause)
- test->paused = true;
- else if (hist_data->attrs->cont)
- test->paused = false;
- else if (hist_data->attrs->clear)
- hist_clear(test);
- else {
+ if (hist_trigger_match(data, test, named_data, false)) {
hist_err(tr, HIST_ERR_TRIGGER_EEXIST, 0);
ret = -EEXIST;
+ goto out;
}
- goto out;
}
}
new:
@@ -5993,7 +6026,7 @@ static int hist_register_trigger(char *glob,
}
if (data->ops->init) {
- ret = data->ops->init(data->ops, data);
+ ret = data->ops->init(data);
if (ret < 0)
goto out;
}
@@ -6012,8 +6045,6 @@ static int hist_register_trigger(char *glob,
if (named_data)
destroy_hist_data(hist_data);
-
- ret++;
out:
return ret;
}
@@ -6089,20 +6120,19 @@ static void hist_unregister_trigger(char *glob,
struct event_trigger_data *data,
struct trace_event_file *file)
{
+ struct event_trigger_data *test = NULL, *iter, *named_data = NULL;
struct hist_trigger_data *hist_data = data->private_data;
- struct event_trigger_data *test, *named_data = NULL;
- bool unregistered = false;
lockdep_assert_held(&event_mutex);
if (hist_data->attrs->name)
named_data = find_named_trigger(hist_data->attrs->name);
- list_for_each_entry(test, &file->triggers, list) {
- if (test->cmd_ops->trigger_type == ETT_EVENT_HIST) {
- if (!hist_trigger_match(data, test, named_data, false))
+ list_for_each_entry(iter, &file->triggers, list) {
+ if (iter->cmd_ops->trigger_type == ETT_EVENT_HIST) {
+ if (!hist_trigger_match(data, iter, named_data, false))
continue;
- unregistered = true;
+ test = iter;
list_del_rcu(&test->list);
trace_event_trigger_enable_disable(file, 0);
update_cond_flag(file);
@@ -6110,11 +6140,11 @@ static void hist_unregister_trigger(char *glob,
}
}
- if (unregistered && test->ops->free)
- test->ops->free(test->ops, test);
+ if (test && test->ops->free)
+ test->ops->free(test);
if (hist_data->enable_timestamps) {
- if (!hist_data->remove || unregistered)
+ if (!hist_data->remove || test)
tracing_set_filter_buffering(file->tr, false);
}
}
@@ -6164,57 +6194,57 @@ static void hist_unreg_all(struct trace_event_file *file)
if (hist_data->enable_timestamps)
tracing_set_filter_buffering(file->tr, false);
if (test->ops->free)
- test->ops->free(test->ops, test);
+ test->ops->free(test);
}
}
}
static int event_hist_trigger_parse(struct event_command *cmd_ops,
struct trace_event_file *file,
- char *glob, char *cmd, char *param)
+ char *glob, char *cmd,
+ char *param_and_filter)
{
unsigned int hist_trigger_bits = TRACING_MAP_BITS_DEFAULT;
struct event_trigger_data *trigger_data;
struct hist_trigger_attrs *attrs;
- struct event_trigger_ops *trigger_ops;
struct hist_trigger_data *hist_data;
+ char *param, *filter, *p, *start;
struct synth_event *se;
const char *se_name;
- bool remove = false;
- char *trigger, *p, *start;
+ bool remove;
int ret = 0;
lockdep_assert_held(&event_mutex);
- WARN_ON(!glob);
+ if (WARN_ON(!glob))
+ return -EINVAL;
- if (strlen(glob)) {
+ if (glob[0]) {
hist_err_clear();
- last_cmd_set(file, param);
+ last_cmd_set(file, param_and_filter);
}
- if (!param)
- return -EINVAL;
+ remove = event_trigger_check_remove(glob);
- if (glob[0] == '!')
- remove = true;
+ if (event_trigger_empty_param(param_and_filter))
+ return -EINVAL;
/*
* separate the trigger from the filter (k:v [if filter])
* allowing for whitespace in the trigger
*/
- p = trigger = param;
+ p = param = param_and_filter;
do {
p = strstr(p, "if");
if (!p)
break;
- if (p == param)
+ if (p == param_and_filter)
return -EINVAL;
if (*(p - 1) != ' ' && *(p - 1) != '\t') {
p++;
continue;
}
- if (p >= param + strlen(param) - (sizeof("if") - 1) - 1)
+ if (p >= param_and_filter + strlen(param_and_filter) - (sizeof("if") - 1) - 1)
return -EINVAL;
if (*(p + sizeof("if") - 1) != ' ' && *(p + sizeof("if") - 1) != '\t') {
p++;
@@ -6224,24 +6254,24 @@ static int event_hist_trigger_parse(struct event_command *cmd_ops,
} while (1);
if (!p)
- param = NULL;
+ filter = NULL;
else {
*(p - 1) = '\0';
- param = strstrip(p);
- trigger = strstrip(trigger);
+ filter = strstrip(p);
+ param = strstrip(param);
}
/*
* To simplify arithmetic expression parsing, replace occurrences of
* '.sym-offset' modifier with '.symXoffset'
*/
- start = strstr(trigger, ".sym-offset");
+ start = strstr(param, ".sym-offset");
while (start) {
*(start + 4) = 'X';
start = strstr(start + 11, ".sym-offset");
}
- attrs = parse_hist_trigger_attrs(file->tr, trigger);
+ attrs = parse_hist_trigger_attrs(file->tr, param);
if (IS_ERR(attrs))
return PTR_ERR(attrs);
@@ -6254,29 +6284,15 @@ static int event_hist_trigger_parse(struct event_command *cmd_ops,
return PTR_ERR(hist_data);
}
- trigger_ops = cmd_ops->get_trigger_ops(cmd, trigger);
-
- trigger_data = kzalloc(sizeof(*trigger_data), GFP_KERNEL);
+ trigger_data = event_trigger_alloc(cmd_ops, cmd, param, hist_data);
if (!trigger_data) {
ret = -ENOMEM;
goto out_free;
}
- trigger_data->count = -1;
- trigger_data->ops = trigger_ops;
- trigger_data->cmd_ops = cmd_ops;
-
- INIT_LIST_HEAD(&trigger_data->list);
- RCU_INIT_POINTER(trigger_data->filter, NULL);
-
- trigger_data->private_data = hist_data;
-
- /* if param is non-empty, it's supposed to be a filter */
- if (param && cmd_ops->set_filter) {
- ret = cmd_ops->set_filter(param, trigger_data, file);
- if (ret < 0)
- goto out_free;
- }
+ ret = event_trigger_set_filter(cmd_ops, file, filter, trigger_data);
+ if (ret < 0)
+ goto out_free;
if (remove) {
if (!have_hist_trigger_match(trigger_data, file))
@@ -6287,7 +6303,7 @@ static int event_hist_trigger_parse(struct event_command *cmd_ops,
goto out_free;
}
- cmd_ops->unreg(glob+1, trigger_data, file);
+ event_trigger_unregister(cmd_ops, file, glob+1, trigger_data);
se_name = trace_event_name(file->event_call);
se = find_synth_event(se_name);
if (se)
@@ -6296,17 +6312,11 @@ static int event_hist_trigger_parse(struct event_command *cmd_ops,
goto out_free;
}
- ret = cmd_ops->reg(glob, trigger_data, file);
- /*
- * The above returns on success the # of triggers registered,
- * but if it didn't register any it returns zero. Consider no
- * triggers registered a failure too.
- */
- if (!ret) {
- if (!(attrs->pause || attrs->cont || attrs->clear))
- ret = -ENOENT;
+ if (existing_hist_update_only(glob, trigger_data, file))
goto out_free;
- } else if (ret < 0)
+
+ ret = event_trigger_register(cmd_ops, file, glob, trigger_data);
+ if (ret < 0)
goto out_free;
if (get_named_trigger_data(trigger_data))
@@ -6331,18 +6341,15 @@ enable:
se = find_synth_event(se_name);
if (se)
se->ref++;
- /* Just return zero, not the number of registered triggers */
- ret = 0;
out:
if (ret == 0)
hist_err_clear();
return ret;
out_unreg:
- cmd_ops->unreg(glob+1, trigger_data, file);
+ event_trigger_unregister(cmd_ops, file, glob+1, trigger_data);
out_free:
- if (cmd_ops->set_filter)
- cmd_ops->set_filter(NULL, trigger_data, NULL);
+ event_trigger_reset_filter(cmd_ops, trigger_data);
remove_hist_vars(hist_data);
@@ -6463,7 +6470,7 @@ static void hist_enable_unreg_all(struct trace_event_file *file)
update_cond_flag(file);
trace_event_trigger_enable_disable(file, 0);
if (test->ops->free)
- test->ops->free(test->ops, test);
+ test->ops->free(test);
}
}
}
diff --git a/kernel/trace/trace_events_trigger.c b/kernel/trace/trace_events_trigger.c
index 7eb9d04f1c2e..cb866c3141af 100644
--- a/kernel/trace/trace_events_trigger.c
+++ b/kernel/trace/trace_events_trigger.c
@@ -188,7 +188,7 @@ static int trigger_show(struct seq_file *m, void *v)
}
data = list_entry(v, struct event_trigger_data, list);
- data->ops->print(m, data->ops, data);
+ data->ops->print(m, data);
return 0;
}
@@ -432,7 +432,6 @@ event_trigger_print(const char *name, struct seq_file *m,
/**
* event_trigger_init - Generic event_trigger_ops @init implementation
- * @ops: The trigger ops associated with the trigger
* @data: Trigger-specific data
*
* Common implementation of event trigger initialization.
@@ -442,8 +441,7 @@ event_trigger_print(const char *name, struct seq_file *m,
*
* Return: 0 on success, errno otherwise
*/
-int event_trigger_init(struct event_trigger_ops *ops,
- struct event_trigger_data *data)
+int event_trigger_init(struct event_trigger_data *data)
{
data->ref++;
return 0;
@@ -451,7 +449,6 @@ int event_trigger_init(struct event_trigger_ops *ops,
/**
* event_trigger_free - Generic event_trigger_ops @free implementation
- * @ops: The trigger ops associated with the trigger
* @data: Trigger-specific data
*
* Common implementation of event trigger de-initialization.
@@ -460,8 +457,7 @@ int event_trigger_init(struct event_trigger_ops *ops,
* implementations.
*/
static void
-event_trigger_free(struct event_trigger_ops *ops,
- struct event_trigger_data *data)
+event_trigger_free(struct event_trigger_data *data)
{
if (WARN_ON_ONCE(data->ref <= 0))
return;
@@ -515,7 +511,7 @@ clear_event_triggers(struct trace_array *tr)
trace_event_trigger_enable_disable(file, 0);
list_del_rcu(&data->list);
if (data->ops->free)
- data->ops->free(data->ops, data);
+ data->ops->free(data);
}
}
}
@@ -581,19 +577,18 @@ static int register_trigger(char *glob,
}
if (data->ops->init) {
- ret = data->ops->init(data->ops, data);
+ ret = data->ops->init(data);
if (ret < 0)
goto out;
}
list_add_rcu(&data->list, &file->triggers);
- ret++;
update_cond_flag(file);
- if (trace_event_trigger_enable_disable(file, 1) < 0) {
+ ret = trace_event_trigger_enable_disable(file, 1);
+ if (ret < 0) {
list_del_rcu(&data->list);
update_cond_flag(file);
- ret--;
}
out:
return ret;
@@ -614,14 +609,13 @@ static void unregister_trigger(char *glob,
struct event_trigger_data *test,
struct trace_event_file *file)
{
- struct event_trigger_data *data;
- bool unregistered = false;
+ struct event_trigger_data *data = NULL, *iter;
lockdep_assert_held(&event_mutex);
- list_for_each_entry(data, &file->triggers, list) {
- if (data->cmd_ops->trigger_type == test->cmd_ops->trigger_type) {
- unregistered = true;
+ list_for_each_entry(iter, &file->triggers, list) {
+ if (iter->cmd_ops->trigger_type == test->cmd_ops->trigger_type) {
+ data = iter;
list_del_rcu(&data->list);
trace_event_trigger_enable_disable(file, 0);
update_cond_flag(file);
@@ -629,8 +623,8 @@ static void unregister_trigger(char *glob,
}
}
- if (unregistered && data->ops->free)
- data->ops->free(data->ops, data);
+ if (data && data->ops->free)
+ data->ops->free(data);
}
/*
@@ -744,15 +738,15 @@ bool event_trigger_empty_param(const char *param)
/**
* event_trigger_separate_filter - separate an event trigger from a filter
- * @param: The param string containing trigger and possibly filter
- * @trigger: outparam, will be filled with a pointer to the trigger
+ * @param_and_filter: String containing trigger and possibly filter
+ * @param: outparam, will be filled with a pointer to the trigger
* @filter: outparam, will be filled with a pointer to the filter
* @param_required: Specifies whether or not the param string is required
*
* Given a param string of the form '[trigger] [if filter]', this
* function separates the filter from the trigger and returns the
- * trigger in *trigger and the filter in *filter. Either the *trigger
- * or the *filter may be set to NULL by this function - if not set to
+ * trigger in @param and the filter in @filter. Either the @param
+ * or the @filter may be set to NULL by this function - if not set to
* NULL, they will contain strings corresponding to the trigger and
* filter.
*
@@ -927,48 +921,37 @@ void event_trigger_reset_filter(struct event_command *cmd_ops,
* @cmd_ops: The event_command operations for the trigger
* @file: The event file for the trigger's event
* @glob: The trigger command string, with optional remove(!) operator
- * @cmd: The cmd string
- * @param: The param string
* @trigger_data: The trigger_data for the trigger
- * @n_registered: optional outparam, the number of triggers registered
*
* Register an event trigger. The @cmd_ops are used to call the
- * cmd_ops->reg() function which actually does the registration. The
- * cmd_ops->reg() function returns the number of triggers registered,
- * which is assigned to n_registered, if n_registered is non-NULL.
+ * cmd_ops->reg() function which actually does the registration.
*
* Return: 0 on success, errno otherwise
*/
int event_trigger_register(struct event_command *cmd_ops,
struct trace_event_file *file,
char *glob,
- char *cmd,
- char *param,
- struct event_trigger_data *trigger_data,
- int *n_registered)
+ struct event_trigger_data *trigger_data)
{
- int ret;
-
- if (n_registered)
- *n_registered = 0;
-
- ret = cmd_ops->reg(glob, trigger_data, file);
- /*
- * The above returns on success the # of functions enabled,
- * but if it didn't find any functions it returns zero.
- * Consider no functions a failure too.
- */
- if (!ret) {
- cmd_ops->unreg(glob, trigger_data, file);
- ret = -ENOENT;
- } else if (ret > 0) {
- if (n_registered)
- *n_registered = ret;
- /* Just return zero, not the number of enabled functions */
- ret = 0;
- }
+ return cmd_ops->reg(glob, trigger_data, file);
+}
- return ret;
+/**
+ * event_trigger_unregister - unregister an event trigger
+ * @cmd_ops: The event_command operations for the trigger
+ * @file: The event file for the trigger's event
+ * @glob: The trigger command string, with optional remove(!) operator
+ * @trigger_data: The trigger_data for the trigger
+ *
+ * Unregister an event trigger. The @cmd_ops are used to call the
+ * cmd_ops->unreg() function which actually does the unregistration.
+ */
+void event_trigger_unregister(struct event_command *cmd_ops,
+ struct trace_event_file *file,
+ char *glob,
+ struct event_trigger_data *trigger_data)
+{
+ cmd_ops->unreg(glob, trigger_data, file);
}
/*
@@ -981,7 +964,7 @@ int event_trigger_register(struct event_command *cmd_ops,
* @file: The trace_event_file associated with the event
* @glob: The raw string used to register the trigger
* @cmd: The cmd portion of the string used to register the trigger
- * @param: The params portion of the string used to register the trigger
+ * @param_and_filter: The param and filter portion of the string used to register the trigger
*
* Common implementation for event command parsing and trigger
* instantiation.
@@ -994,94 +977,53 @@ int event_trigger_register(struct event_command *cmd_ops,
static int
event_trigger_parse(struct event_command *cmd_ops,
struct trace_event_file *file,
- char *glob, char *cmd, char *param)
+ char *glob, char *cmd, char *param_and_filter)
{
struct event_trigger_data *trigger_data;
- struct event_trigger_ops *trigger_ops;
- char *trigger = NULL;
- char *number;
+ char *param, *filter;
+ bool remove;
int ret;
- /* separate the trigger from the filter (t:n [if filter]) */
- if (param && isdigit(param[0])) {
- trigger = strsep(&param, " \t");
- if (param) {
- param = skip_spaces(param);
- if (!*param)
- param = NULL;
- }
- }
+ remove = event_trigger_check_remove(glob);
- trigger_ops = cmd_ops->get_trigger_ops(cmd, trigger);
+ ret = event_trigger_separate_filter(param_and_filter, &param, &filter, false);
+ if (ret)
+ return ret;
ret = -ENOMEM;
- trigger_data = kzalloc(sizeof(*trigger_data), GFP_KERNEL);
+ trigger_data = event_trigger_alloc(cmd_ops, cmd, param, file);
if (!trigger_data)
goto out;
- trigger_data->count = -1;
- trigger_data->ops = trigger_ops;
- trigger_data->cmd_ops = cmd_ops;
- trigger_data->private_data = file;
- INIT_LIST_HEAD(&trigger_data->list);
- INIT_LIST_HEAD(&trigger_data->named_list);
-
- if (glob[0] == '!') {
- cmd_ops->unreg(glob+1, trigger_data, file);
+ if (remove) {
+ event_trigger_unregister(cmd_ops, file, glob+1, trigger_data);
kfree(trigger_data);
ret = 0;
goto out;
}
- if (trigger) {
- number = strsep(&trigger, ":");
-
- ret = -EINVAL;
- if (!strlen(number))
- goto out_free;
-
- /*
- * We use the callback data field (which is a pointer)
- * as our counter.
- */
- ret = kstrtoul(number, 0, &trigger_data->count);
- if (ret)
- goto out_free;
- }
-
- if (!param) /* if param is non-empty, it's supposed to be a filter */
- goto out_reg;
-
- if (!cmd_ops->set_filter)
- goto out_reg;
+ ret = event_trigger_parse_num(param, trigger_data);
+ if (ret)
+ goto out_free;
- ret = cmd_ops->set_filter(param, trigger_data, file);
+ ret = event_trigger_set_filter(cmd_ops, file, filter, trigger_data);
if (ret < 0)
goto out_free;
- out_reg:
/* Up the trigger_data count to make sure reg doesn't free it on failure */
- event_trigger_init(trigger_ops, trigger_data);
- ret = cmd_ops->reg(glob, trigger_data, file);
- /*
- * The above returns on success the # of functions enabled,
- * but if it didn't find any functions it returns zero.
- * Consider no functions a failure too.
- */
- if (!ret) {
- cmd_ops->unreg(glob, trigger_data, file);
- ret = -ENOENT;
- } else if (ret > 0)
- ret = 0;
+ event_trigger_init(trigger_data);
+
+ ret = event_trigger_register(cmd_ops, file, glob, trigger_data);
+ if (ret)
+ goto out_free;
/* Down the counter of trigger_data or free it if not used anymore */
- event_trigger_free(trigger_ops, trigger_data);
+ event_trigger_free(trigger_data);
out:
return ret;
out_free:
- if (cmd_ops->set_filter)
- cmd_ops->set_filter(NULL, trigger_data, NULL);
+ event_trigger_reset_filter(cmd_ops, trigger_data);
kfree(trigger_data);
goto out;
}
@@ -1401,16 +1343,14 @@ traceoff_count_trigger(struct event_trigger_data *data,
}
static int
-traceon_trigger_print(struct seq_file *m, struct event_trigger_ops *ops,
- struct event_trigger_data *data)
+traceon_trigger_print(struct seq_file *m, struct event_trigger_data *data)
{
return event_trigger_print("traceon", m, (void *)data->count,
data->filter_str);
}
static int
-traceoff_trigger_print(struct seq_file *m, struct event_trigger_ops *ops,
- struct event_trigger_data *data)
+traceoff_trigger_print(struct seq_file *m, struct event_trigger_data *data)
{
return event_trigger_print("traceoff", m, (void *)data->count,
data->filter_str);
@@ -1521,8 +1461,7 @@ register_snapshot_trigger(char *glob,
}
static int
-snapshot_trigger_print(struct seq_file *m, struct event_trigger_ops *ops,
- struct event_trigger_data *data)
+snapshot_trigger_print(struct seq_file *m, struct event_trigger_data *data)
{
return event_trigger_print("snapshot", m, (void *)data->count,
data->filter_str);
@@ -1617,8 +1556,7 @@ stacktrace_count_trigger(struct event_trigger_data *data,
}
static int
-stacktrace_trigger_print(struct seq_file *m, struct event_trigger_ops *ops,
- struct event_trigger_data *data)
+stacktrace_trigger_print(struct seq_file *m, struct event_trigger_data *data)
{
return event_trigger_print("stacktrace", m, (void *)data->count,
data->filter_str);
@@ -1708,7 +1646,6 @@ event_enable_count_trigger(struct event_trigger_data *data,
}
int event_enable_trigger_print(struct seq_file *m,
- struct event_trigger_ops *ops,
struct event_trigger_data *data)
{
struct enable_trigger_data *enable_data = data->private_data;
@@ -1733,8 +1670,7 @@ int event_enable_trigger_print(struct seq_file *m,
return 0;
}
-void event_enable_trigger_free(struct event_trigger_ops *ops,
- struct event_trigger_data *data)
+void event_enable_trigger_free(struct event_trigger_data *data)
{
struct enable_trigger_data *enable_data = data->private_data;
@@ -1781,39 +1717,33 @@ static struct event_trigger_ops event_disable_count_trigger_ops = {
int event_enable_trigger_parse(struct event_command *cmd_ops,
struct trace_event_file *file,
- char *glob, char *cmd, char *param)
+ char *glob, char *cmd, char *param_and_filter)
{
struct trace_event_file *event_enable_file;
struct enable_trigger_data *enable_data;
struct event_trigger_data *trigger_data;
- struct event_trigger_ops *trigger_ops;
struct trace_array *tr = file->tr;
+ char *param, *filter;
+ bool enable, remove;
const char *system;
const char *event;
bool hist = false;
- char *trigger;
- char *number;
- bool enable;
int ret;
- if (!param)
- return -EINVAL;
+ remove = event_trigger_check_remove(glob);
- /* separate the trigger from the filter (s:e:n [if filter]) */
- trigger = strsep(&param, " \t");
- if (!trigger)
+ if (event_trigger_empty_param(param_and_filter))
return -EINVAL;
- if (param) {
- param = skip_spaces(param);
- if (!*param)
- param = NULL;
- }
- system = strsep(&trigger, ":");
- if (!trigger)
+ ret = event_trigger_separate_filter(param_and_filter, &param, &filter, true);
+ if (ret)
+ return ret;
+
+ system = strsep(&param, ":");
+ if (!param)
return -EINVAL;
- event = strsep(&trigger, ":");
+ event = strsep(&param, ":");
ret = -EINVAL;
event_enable_file = find_event_file(tr, system, event);
@@ -1829,32 +1759,24 @@ int event_enable_trigger_parse(struct event_command *cmd_ops,
#else
enable = strcmp(cmd, ENABLE_EVENT_STR) == 0;
#endif
- trigger_ops = cmd_ops->get_trigger_ops(cmd, trigger);
-
ret = -ENOMEM;
- trigger_data = kzalloc(sizeof(*trigger_data), GFP_KERNEL);
- if (!trigger_data)
- goto out;
enable_data = kzalloc(sizeof(*enable_data), GFP_KERNEL);
- if (!enable_data) {
- kfree(trigger_data);
+ if (!enable_data)
goto out;
- }
-
- trigger_data->count = -1;
- trigger_data->ops = trigger_ops;
- trigger_data->cmd_ops = cmd_ops;
- INIT_LIST_HEAD(&trigger_data->list);
- RCU_INIT_POINTER(trigger_data->filter, NULL);
enable_data->hist = hist;
enable_data->enable = enable;
enable_data->file = event_enable_file;
- trigger_data->private_data = enable_data;
- if (glob[0] == '!') {
- cmd_ops->unreg(glob+1, trigger_data, file);
+ trigger_data = event_trigger_alloc(cmd_ops, cmd, param, enable_data);
+ if (!trigger_data) {
+ kfree(enable_data);
+ goto out;
+ }
+
+ if (remove) {
+ event_trigger_unregister(cmd_ops, file, glob+1, trigger_data);
kfree(trigger_data);
kfree(enable_data);
ret = 0;
@@ -1862,35 +1784,16 @@ int event_enable_trigger_parse(struct event_command *cmd_ops,
}
/* Up the trigger_data count to make sure nothing frees it on failure */
- event_trigger_init(trigger_ops, trigger_data);
-
- if (trigger) {
- number = strsep(&trigger, ":");
-
- ret = -EINVAL;
- if (!strlen(number))
- goto out_free;
-
- /*
- * We use the callback data field (which is a pointer)
- * as our counter.
- */
- ret = kstrtoul(number, 0, &trigger_data->count);
- if (ret)
- goto out_free;
- }
+ event_trigger_init(trigger_data);
- if (!param) /* if param is non-empty, it's supposed to be a filter */
- goto out_reg;
-
- if (!cmd_ops->set_filter)
- goto out_reg;
+ ret = event_trigger_parse_num(param, trigger_data);
+ if (ret)
+ goto out_free;
- ret = cmd_ops->set_filter(param, trigger_data, file);
+ ret = event_trigger_set_filter(cmd_ops, file, filter, trigger_data);
if (ret < 0)
goto out_free;
- out_reg:
/* Don't let event modules unload while probe registered */
ret = trace_event_try_get_ref(event_enable_file->event_call);
if (!ret) {
@@ -1901,32 +1804,23 @@ int event_enable_trigger_parse(struct event_command *cmd_ops,
ret = trace_event_enable_disable(event_enable_file, 1, 1);
if (ret < 0)
goto out_put;
- ret = cmd_ops->reg(glob, trigger_data, file);
- /*
- * The above returns on success the # of functions enabled,
- * but if it didn't find any functions it returns zero.
- * Consider no functions a failure too.
- */
- if (!ret) {
- ret = -ENOENT;
- goto out_disable;
- } else if (ret < 0)
+
+ ret = event_trigger_register(cmd_ops, file, glob, trigger_data);
+ if (ret)
goto out_disable;
- /* Just return zero, not the number of enabled functions */
- ret = 0;
- event_trigger_free(trigger_ops, trigger_data);
+
+ event_trigger_free(trigger_data);
out:
return ret;
-
out_disable:
trace_event_enable_disable(event_enable_file, 0, 1);
out_put:
trace_event_put_ref(event_enable_file->event_call);
out_free:
- if (cmd_ops->set_filter)
- cmd_ops->set_filter(NULL, trigger_data, NULL);
- event_trigger_free(trigger_ops, trigger_data);
+ event_trigger_reset_filter(cmd_ops, trigger_data);
+ event_trigger_free(trigger_data);
kfree(enable_data);
+
goto out;
}
@@ -1953,19 +1847,18 @@ int event_enable_register_trigger(char *glob,
}
if (data->ops->init) {
- ret = data->ops->init(data->ops, data);
+ ret = data->ops->init(data);
if (ret < 0)
goto out;
}
list_add_rcu(&data->list, &file->triggers);
- ret++;
update_cond_flag(file);
- if (trace_event_trigger_enable_disable(file, 1) < 0) {
+ ret = trace_event_trigger_enable_disable(file, 1);
+ if (ret < 0) {
list_del_rcu(&data->list);
update_cond_flag(file);
- ret--;
}
out:
return ret;
@@ -1976,19 +1869,18 @@ void event_enable_unregister_trigger(char *glob,
struct trace_event_file *file)
{
struct enable_trigger_data *test_enable_data = test->private_data;
+ struct event_trigger_data *data = NULL, *iter;
struct enable_trigger_data *enable_data;
- struct event_trigger_data *data;
- bool unregistered = false;
lockdep_assert_held(&event_mutex);
- list_for_each_entry(data, &file->triggers, list) {
- enable_data = data->private_data;
+ list_for_each_entry(iter, &file->triggers, list) {
+ enable_data = iter->private_data;
if (enable_data &&
- (data->cmd_ops->trigger_type ==
+ (iter->cmd_ops->trigger_type ==
test->cmd_ops->trigger_type) &&
(enable_data->file == test_enable_data->file)) {
- unregistered = true;
+ data = iter;
list_del_rcu(&data->list);
trace_event_trigger_enable_disable(file, 0);
update_cond_flag(file);
@@ -1996,8 +1888,8 @@ void event_enable_unregister_trigger(char *glob,
}
}
- if (unregistered && data->ops->free)
- data->ops->free(data->ops, data);
+ if (data && data->ops->free)
+ data->ops->free(data);
}
static struct event_trigger_ops *
diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c
index 47cebef78532..93507330462c 100644
--- a/kernel/trace/trace_kprobe.c
+++ b/kernel/trace/trace_kprobe.c
@@ -1907,25 +1907,18 @@ core_initcall(init_kprobe_trace_early);
static __init int init_kprobe_trace(void)
{
int ret;
- struct dentry *entry;
ret = tracing_init_dentry();
if (ret)
return 0;
- entry = tracefs_create_file("kprobe_events", TRACE_MODE_WRITE,
- NULL, NULL, &kprobe_events_ops);
-
/* Event list interface */
- if (!entry)
- pr_warn("Could not create tracefs 'kprobe_events' entry\n");
+ trace_create_file("kprobe_events", TRACE_MODE_WRITE,
+ NULL, NULL, &kprobe_events_ops);
/* Profile interface */
- entry = tracefs_create_file("kprobe_profile", TRACE_MODE_READ,
- NULL, NULL, &kprobe_profile_ops);
-
- if (!entry)
- pr_warn("Could not create tracefs 'kprobe_profile' entry\n");
+ trace_create_file("kprobe_profile", TRACE_MODE_READ,
+ NULL, NULL, &kprobe_profile_ops);
setup_boot_kprobe_events();
diff --git a/kernel/trace/trace_osnoise.c b/kernel/trace/trace_osnoise.c
index afb92e2f0aea..313439920a8c 100644
--- a/kernel/trace/trace_osnoise.c
+++ b/kernel/trace/trace_osnoise.c
@@ -1578,11 +1578,27 @@ static enum hrtimer_restart timerlat_irq(struct hrtimer *timer)
trace_timerlat_sample(&s);
- notify_new_max_latency(diff);
+ if (osnoise_data.stop_tracing) {
+ if (time_to_us(diff) >= osnoise_data.stop_tracing) {
+
+ /*
+ * At this point, if stop_tracing is set and <= print_stack,
+ * print_stack is set and would be printed in the thread handler.
+ *
+ * Thus, print the stack trace as it is helpful to define the
+ * root cause of an IRQ latency.
+ */
+ if (osnoise_data.stop_tracing <= osnoise_data.print_stack) {
+ timerlat_save_stack(0);
+ timerlat_dump_stack(time_to_us(diff));
+ }
- if (osnoise_data.stop_tracing)
- if (time_to_us(diff) >= osnoise_data.stop_tracing)
osnoise_stop_tracing();
+ notify_new_max_latency(diff);
+
+ return HRTIMER_NORESTART;
+ }
+ }
wake_up_process(tlat->kthread);
diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c
index 8aa493d25c73..67f47ea27921 100644
--- a/kernel/trace/trace_output.c
+++ b/kernel/trace/trace_output.c
@@ -692,7 +692,7 @@ static LIST_HEAD(ftrace_event_list);
static int trace_search_list(struct list_head **list)
{
- struct trace_event *e;
+ struct trace_event *e = NULL, *iter;
int next = __TRACE_LAST_TYPE;
if (list_empty(&ftrace_event_list)) {
@@ -704,9 +704,11 @@ static int trace_search_list(struct list_head **list)
* We used up all possible max events,
* lets see if somebody freed one.
*/
- list_for_each_entry(e, &ftrace_event_list, list) {
- if (e->type != next)
+ list_for_each_entry(iter, &ftrace_event_list, list) {
+ if (iter->type != next) {
+ e = iter;
break;
+ }
next++;
}
@@ -714,7 +716,10 @@ static int trace_search_list(struct list_head **list)
if (next > TRACE_EVENT_TYPE_MAX)
return 0;
- *list = &e->list;
+ if (e)
+ *list = &e->list;
+ else
+ *list = &ftrace_event_list;
return next;
}
@@ -778,9 +783,8 @@ int register_trace_event(struct trace_event *event)
list_add_tail(&event->list, list);
- } else if (event->type > __TRACE_LAST_TYPE) {
- printk(KERN_WARNING "Need to add type to trace.h\n");
- WARN_ON(1);
+ } else if (WARN(event->type > __TRACE_LAST_TYPE,
+ "Need to add type to trace.h")) {
goto out;
} else {
/* Is this event already used */
@@ -1571,13 +1575,8 @@ __init static int init_events(void)
for (i = 0; events[i]; i++) {
event = events[i];
-
ret = register_trace_event(event);
- if (!ret) {
- printk(KERN_WARNING "event %d failed to register\n",
- event->type);
- WARN_ON_ONCE(1);
- }
+ WARN_ONCE(!ret, "event %d failed to register", event->type);
}
return 0;
diff --git a/kernel/trace/trace_recursion_record.c b/kernel/trace/trace_recursion_record.c
index 4d4b78c8ca25..a520b11afb0d 100644
--- a/kernel/trace/trace_recursion_record.c
+++ b/kernel/trace/trace_recursion_record.c
@@ -224,12 +224,9 @@ static const struct file_operations recursed_functions_fops = {
__init static int create_recursed_functions(void)
{
- struct dentry *dentry;
- dentry = trace_create_file("recursed_functions", TRACE_MODE_WRITE,
- NULL, NULL, &recursed_functions_fops);
- if (!dentry)
- pr_warn("WARNING: Failed to create recursed_functions\n");
+ trace_create_file("recursed_functions", TRACE_MODE_WRITE,
+ NULL, NULL, &recursed_functions_fops);
return 0;
}
diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c
index abcadbe933bb..a2d301f58ced 100644
--- a/kernel/trace/trace_selftest.c
+++ b/kernel/trace/trace_selftest.c
@@ -895,6 +895,9 @@ trace_selftest_startup_function_graph(struct tracer *trace,
ret = -1;
goto out;
}
+
+ /* Enable tracing on all functions again */
+ ftrace_set_global_filter(NULL, 0, 1);
#endif
/* Don't test dynamic tracing, the function tracer already did */
diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c
index f755bde42fd0..b69e207012c9 100644
--- a/kernel/trace/trace_syscalls.c
+++ b/kernel/trace/trace_syscalls.c
@@ -154,7 +154,7 @@ print_syscall_enter(struct trace_iterator *iter, int flags,
goto end;
/* parameter types */
- if (tr->trace_flags & TRACE_ITER_VERBOSE)
+ if (tr && tr->trace_flags & TRACE_ITER_VERBOSE)
trace_seq_printf(s, "%s ", entry->types[i]);
/* parameter values */
@@ -296,9 +296,7 @@ static void ftrace_syscall_enter(void *data, struct pt_regs *regs, long id)
struct trace_event_file *trace_file;
struct syscall_trace_enter *entry;
struct syscall_metadata *sys_data;
- struct ring_buffer_event *event;
- struct trace_buffer *buffer;
- unsigned int trace_ctx;
+ struct trace_event_buffer fbuffer;
unsigned long args[6];
int syscall_nr;
int size;
@@ -321,20 +319,16 @@ static void ftrace_syscall_enter(void *data, struct pt_regs *regs, long id)
size = sizeof(*entry) + sizeof(unsigned long) * sys_data->nb_args;
- trace_ctx = tracing_gen_ctx();
-
- event = trace_event_buffer_lock_reserve(&buffer, trace_file,
- sys_data->enter_event->event.type, size, trace_ctx);
- if (!event)
+ entry = trace_event_buffer_reserve(&fbuffer, trace_file, size);
+ if (!entry)
return;
- entry = ring_buffer_event_data(event);
+ entry = ring_buffer_event_data(fbuffer.event);
entry->nr = syscall_nr;
syscall_get_arguments(current, regs, args);
memcpy(entry->args, args, sizeof(unsigned long) * sys_data->nb_args);
- event_trigger_unlock_commit(trace_file, buffer, event, entry,
- trace_ctx);
+ trace_event_buffer_commit(&fbuffer);
}
static void ftrace_syscall_exit(void *data, struct pt_regs *regs, long ret)
@@ -343,9 +337,7 @@ static void ftrace_syscall_exit(void *data, struct pt_regs *regs, long ret)
struct trace_event_file *trace_file;
struct syscall_trace_exit *entry;
struct syscall_metadata *sys_data;
- struct ring_buffer_event *event;
- struct trace_buffer *buffer;
- unsigned int trace_ctx;
+ struct trace_event_buffer fbuffer;
int syscall_nr;
syscall_nr = trace_get_syscall_nr(current, regs);
@@ -364,20 +356,15 @@ static void ftrace_syscall_exit(void *data, struct pt_regs *regs, long ret)
if (!sys_data)
return;
- trace_ctx = tracing_gen_ctx();
-
- event = trace_event_buffer_lock_reserve(&buffer, trace_file,
- sys_data->exit_event->event.type, sizeof(*entry),
- trace_ctx);
- if (!event)
+ entry = trace_event_buffer_reserve(&fbuffer, trace_file, sizeof(*entry));
+ if (!entry)
return;
- entry = ring_buffer_event_data(event);
+ entry = ring_buffer_event_data(fbuffer.event);
entry->nr = syscall_nr;
entry->ret = syscall_get_return_value(current, regs);
- event_trigger_unlock_commit(trace_file, buffer, event, entry,
- trace_ctx);
+ trace_event_buffer_commit(&fbuffer);
}
static int reg_event_syscall_enter(struct trace_event_file *file,
diff --git a/kernel/trace/tracing_map.c b/kernel/trace/tracing_map.c
index 9628b5571846..9901708ce6b8 100644
--- a/kernel/trace/tracing_map.c
+++ b/kernel/trace/tracing_map.c
@@ -1045,7 +1045,8 @@ static void sort_secondary(struct tracing_map *map,
/**
* tracing_map_sort_entries - Sort the current set of tracing_map_elts in a map
* @map: The tracing_map
- * @sort_key: The sort key to use for sorting
+ * @sort_keys: The sort key to use for sorting
+ * @n_sort_keys: hitcount, always have at least one
* @sort_entries: outval: pointer to allocated and sorted array of entries
*
* tracing_map_sort_entries() sorts the current set of entries in the
diff --git a/kernel/tsacct.c b/kernel/tsacct.c
index 1d261fbe367b..4252f0645b9e 100644
--- a/kernel/tsacct.c
+++ b/kernel/tsacct.c
@@ -23,15 +23,20 @@ void bacct_add_tsk(struct user_namespace *user_ns,
{
const struct cred *tcred;
u64 utime, stime, utimescaled, stimescaled;
- u64 delta;
+ u64 now_ns, delta;
time64_t btime;
BUILD_BUG_ON(TS_COMM_LEN < TASK_COMM_LEN);
/* calculate task elapsed time in nsec */
- delta = ktime_get_ns() - tsk->start_time;
+ now_ns = ktime_get_ns();
+ /* store whole group time first */
+ delta = now_ns - tsk->group_leader->start_time;
/* Convert to micro seconds */
do_div(delta, NSEC_PER_USEC);
+ stats->ac_tgetime = delta;
+ delta = now_ns - tsk->start_time;
+ do_div(delta, NSEC_PER_USEC);
stats->ac_etime = delta;
/* Convert to seconds for btime (note y2106 limit) */
btime = ktime_get_real_seconds() - div_u64(delta, USEC_PER_SEC);
@@ -51,6 +56,7 @@ void bacct_add_tsk(struct user_namespace *user_ns,
stats->ac_nice = task_nice(tsk);
stats->ac_sched = tsk->policy;
stats->ac_pid = task_pid_nr_ns(tsk, pid_ns);
+ stats->ac_tgid = task_tgid_nr_ns(tsk, pid_ns);
rcu_read_lock();
tcred = __task_cred(tsk);
stats->ac_uid = from_kuid_munged(user_ns, tcred->uid);
diff --git a/kernel/umh.c b/kernel/umh.c
index 36c123360ab8..b989736e8707 100644
--- a/kernel/umh.c
+++ b/kernel/umh.c
@@ -132,7 +132,7 @@ static void call_usermodehelper_exec_sync(struct subprocess_info *sub_info)
/* If SIGCLD is ignored do_wait won't populate the status. */
kernel_sigaction(SIGCHLD, SIG_DFL);
- pid = kernel_thread(call_usermodehelper_exec_async, sub_info, SIGCHLD);
+ pid = user_mode_thread(call_usermodehelper_exec_async, sub_info, SIGCHLD);
if (pid < 0)
sub_info->retval = pid;
else
@@ -171,8 +171,8 @@ static void call_usermodehelper_exec_work(struct work_struct *work)
* want to pollute current->children, and we need a parent
* that always ignores SIGCHLD to ensure auto-reaping.
*/
- pid = kernel_thread(call_usermodehelper_exec_async, sub_info,
- CLONE_PARENT | SIGCHLD);
+ pid = user_mode_thread(call_usermodehelper_exec_async, sub_info,
+ CLONE_PARENT | SIGCHLD);
if (pid < 0) {
sub_info->retval = pid;
umh_complete(sub_info);
diff --git a/kernel/usermode_driver.c b/kernel/usermode_driver.c
index 9dae1f648713..8303f4c7ca71 100644
--- a/kernel/usermode_driver.c
+++ b/kernel/usermode_driver.c
@@ -28,7 +28,7 @@ static struct vfsmount *blob_to_mnt(const void *data, size_t len, const char *na
file = file_open_root_mnt(mnt, name, O_CREAT | O_WRONLY, 0700);
if (IS_ERR(file)) {
- mntput(mnt);
+ kern_unmount(mnt);
return ERR_CAST(file);
}
@@ -38,7 +38,7 @@ static struct vfsmount *blob_to_mnt(const void *data, size_t len, const char *na
if (err >= 0)
err = -ENOMEM;
filp_close(file, NULL);
- mntput(mnt);
+ kern_unmount(mnt);
return ERR_PTR(err);
}
diff --git a/kernel/watchdog.c b/kernel/watchdog.c
index 9166220457bc..20a7a55e62b6 100644
--- a/kernel/watchdog.c
+++ b/kernel/watchdog.c
@@ -57,7 +57,7 @@ int __read_mostly sysctl_hardlockup_all_cpu_backtrace;
* Should we panic when a soft-lockup or hard-lockup occurs:
*/
unsigned int __read_mostly hardlockup_panic =
- CONFIG_BOOTPARAM_HARDLOCKUP_PANIC_VALUE;
+ IS_ENABLED(CONFIG_BOOTPARAM_HARDLOCKUP_PANIC);
/*
* We may not want to enable hard lockup detection by default in all cases,
* for example when running the kernel as a guest on a hypervisor. In these
@@ -168,7 +168,7 @@ static struct cpumask watchdog_allowed_mask __read_mostly;
/* Global variables, exported for sysctl */
unsigned int __read_mostly softlockup_panic =
- CONFIG_BOOTPARAM_SOFTLOCKUP_PANIC_VALUE;
+ IS_ENABLED(CONFIG_BOOTPARAM_SOFTLOCKUP_PANIC);
static bool softlockup_initialized __read_mostly;
static u64 __read_mostly sample_period;
@@ -424,6 +424,8 @@ static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer)
/* Start period for the next softlockup warning. */
update_report_ts();
+ printk_prefer_direct_enter();
+
pr_emerg("BUG: soft lockup - CPU#%d stuck for %us! [%s:%d]\n",
smp_processor_id(), duration,
current->comm, task_pid_nr(current));
@@ -442,6 +444,8 @@ static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer)
add_taint(TAINT_SOFTLOCKUP, LOCKDEP_STILL_OK);
if (softlockup_panic)
panic("softlockup: hung tasks");
+
+ printk_prefer_direct_exit();
}
return HRTIMER_RESTART;
diff --git a/kernel/watchdog_hld.c b/kernel/watchdog_hld.c
index 247bf0b1582c..701f35f0e2d4 100644
--- a/kernel/watchdog_hld.c
+++ b/kernel/watchdog_hld.c
@@ -135,6 +135,8 @@ static void watchdog_overflow_callback(struct perf_event *event,
if (__this_cpu_read(hard_watchdog_warn) == true)
return;
+ printk_prefer_direct_enter();
+
pr_emerg("Watchdog detected hard LOCKUP on cpu %d\n",
this_cpu);
print_modules();
@@ -155,6 +157,8 @@ static void watchdog_overflow_callback(struct perf_event *event,
if (hardlockup_panic)
nmi_panic(regs, "Hard LOCKUP");
+ printk_prefer_direct_exit();
+
__this_cpu_write(hard_watchdog_warn, true);
return;
}
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 0d2514b4ff0d..4056f2a3f9d5 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -5001,7 +5001,7 @@ static void unbind_workers(int cpu)
for_each_pool_worker(worker, pool) {
kthread_set_per_cpu(worker->task, -1);
- WARN_ON_ONCE(set_cpus_allowed_ptr(worker->task, cpu_possible_mask) < 0);
+ WARN_ON_ONCE(set_cpus_allowed_ptr(worker->task, wq_unbound_cpumask) < 0);
}
mutex_unlock(&wq_pool_attach_mutex);