Diffstat (limited to 'kernel')
-rw-r--r--  kernel/Makefile | 9
-rw-r--r--  kernel/async.c | 4
-rw-r--r--  kernel/audit.c | 41
-rw-r--r--  kernel/audit_fsnotify.c | 22
-rw-r--r--  kernel/audit_tree.c | 14
-rw-r--r--  kernel/audit_watch.c | 19
-rw-r--r--  kernel/auditsc.c | 45
-rw-r--r--  kernel/backtracetest.c | 2
-rw-r--r--  kernel/bpf/Makefile | 2
-rw-r--r--  kernel/bpf/arraymap.c | 165
-rw-r--r--  kernel/bpf/bpf_iter.c | 79
-rw-r--r--  kernel/bpf/bpf_struct_ops.c | 3
-rw-r--r--  kernel/bpf/btf.c | 177
-rw-r--r--  kernel/bpf/cgroup.c | 82
-rw-r--r--  kernel/bpf/core.c | 67
-rw-r--r--  kernel/bpf/cpumap.c | 170
-rw-r--r--  kernel/bpf/devmap.c | 6
-rw-r--r--  kernel/bpf/hashtab.c | 217
-rw-r--r--  kernel/bpf/local_storage.c | 219
-rw-r--r--  kernel/bpf/lpm_trie.c | 8
-rw-r--r--  kernel/bpf/map_iter.c | 104
-rw-r--r--  kernel/bpf/net_namespace.c | 139
-rw-r--r--  kernel/bpf/prog_iter.c | 107
-rw-r--r--  kernel/bpf/queue_stack_maps.c | 13
-rw-r--r--  kernel/bpf/reuseport_array.c | 5
-rw-r--r--  kernel/bpf/ringbuf.c | 10
-rw-r--r--  kernel/bpf/stackmap.c | 267
-rw-r--r--  kernel/bpf/syscall.c | 66
-rw-r--r--  kernel/bpf/task_iter.c | 32
-rw-r--r--  kernel/bpf/verifier.c | 245
-rw-r--r--  kernel/crash_core.c | 50
-rw-r--r--  kernel/debug/debug_core.c | 2
-rw-r--r--  kernel/debug/kdb/kdb_io.c | 2
-rw-r--r--  kernel/dma/Kconfig | 20
-rw-r--r--  kernel/dma/Makefile | 3
-rw-r--r--  kernel/dma/contiguous.c | 31
-rw-r--r--  kernel/dma/debug.c | 67
-rw-r--r--  kernel/dma/direct.c | 74
-rw-r--r--  kernel/dma/mapping.c | 214
-rw-r--r--  kernel/entry/Makefile | 13
-rw-r--r--  kernel/entry/common.c | 374
-rw-r--r--  kernel/entry/kvm.c | 51
-rw-r--r--  kernel/events/callchain.c | 18
-rw-r--r--  kernel/events/core.c | 29
-rw-r--r--  kernel/events/uprobes.c | 10
-rw-r--r--  kernel/exit.c | 46
-rw-r--r--  kernel/fork.c | 141
-rw-r--r--  kernel/futex.c | 20
-rw-r--r--  kernel/irq/Kconfig | 4
-rw-r--r--  kernel/irq/chip.c | 16
-rw-r--r--  kernel/irq/irqdomain.c | 3
-rw-r--r--  kernel/irq/manage.c | 13
-rw-r--r--  kernel/irq/pm.c | 8
-rw-r--r--  kernel/irq/resend.c | 2
-rw-r--r--  kernel/kcov.c | 6
-rw-r--r--  kernel/kexec_file.c | 59
-rw-r--r--  kernel/kmod.c | 5
-rw-r--r--  kernel/kprobes.c | 24
-rw-r--r--  kernel/kthread.c | 13
-rw-r--r--  kernel/locking/lockdep.c | 16
-rw-r--r--  kernel/locking/lockdep_proc.c | 2
-rw-r--r--  kernel/locking/locktorture.c | 10
-rw-r--r--  kernel/locking/qspinlock.c | 7
-rw-r--r--  kernel/module.c | 84
-rw-r--r--  kernel/nsproxy.c | 21
-rw-r--r--  kernel/panic.c | 4
-rw-r--r--  kernel/pid.c | 16
-rw-r--r--  kernel/pid_namespace.c | 2
-rw-r--r--  kernel/power/energy_model.c | 290
-rw-r--r--  kernel/power/hibernate.c | 103
-rw-r--r--  kernel/power/power.h | 2
-rw-r--r--  kernel/power/snapshot.c | 6
-rw-r--r--  kernel/printk/printk.c | 16
-rw-r--r--  kernel/rcu/rcuperf.c | 8
-rw-r--r--  kernel/rcu/rcutorture.c | 7
-rw-r--r--  kernel/rcu/tree.c | 2
-rw-r--r--  kernel/regset.c | 76
-rw-r--r--  kernel/sched/core.c | 65
-rw-r--r--  kernel/sched/cpufreq_schedutil.c | 6
-rw-r--r--  kernel/sched/fair.c | 2
-rw-r--r--  kernel/sched/psi.c | 5
-rw-r--r--  kernel/sched/sched.h | 2
-rw-r--r--  kernel/sched/topology.c | 20
-rw-r--r--  kernel/scs.c | 2
-rw-r--r--  kernel/seccomp.c | 376
-rw-r--r--  kernel/softirq.c | 18
-rw-r--r--  kernel/stackleak.c | 16
-rw-r--r--  kernel/stacktrace.c | 5
-rw-r--r--  kernel/sys.c | 13
-rw-r--r--  kernel/sys_ni.c | 1
-rw-r--r--  kernel/sysctl.c | 11
-rw-r--r--  kernel/sysctl_binary.c | 171
-rw-r--r--  kernel/time/Kconfig | 9
-rw-r--r--  kernel/time/alarmtimer.c | 2
-rw-r--r--  kernel/time/hrtimer.c | 13
-rw-r--r--  kernel/time/namespace.c | 22
-rw-r--r--  kernel/time/posix-cpu-timers.c | 216
-rw-r--r--  kernel/time/sched_clock.c | 2
-rw-r--r--  kernel/time/timekeeping.c | 21
-rw-r--r--  kernel/time/timekeeping_internal.h | 11
-rw-r--r--  kernel/time/timer.c | 254
-rw-r--r--  kernel/time/vsyscall.c | 41
-rw-r--r--  kernel/trace/Makefile | 6
-rw-r--r--  kernel/trace/bpf_trace.c | 82
-rw-r--r--  kernel/trace/bpf_trace.h | 34
-rw-r--r--  kernel/trace/ftrace.c | 34
-rw-r--r--  kernel/trace/ring_buffer.c | 696
-rw-r--r--  kernel/trace/ring_buffer_benchmark.c | 48
-rw-r--r--  kernel/trace/trace.c | 87
-rw-r--r--  kernel/trace/trace.h | 9
-rw-r--r--  kernel/trace/trace_events.c | 4
-rw-r--r--  kernel/trace/trace_hwlat.c | 6
-rw-r--r--  kernel/trace/trace_output.c | 14
-rw-r--r--  kernel/trace/trace_uprobe.c | 1
-rw-r--r--  kernel/umh.c | 200
-rw-r--r--  kernel/usermode_driver.c | 182
116 files changed, 5099 insertions, 1937 deletions
diff --git a/kernel/Makefile b/kernel/Makefile
index f3218bc5ec69..9a20016d4900 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -5,13 +5,14 @@
obj-y = fork.o exec_domain.o panic.o \
cpu.o exit.o softirq.o resource.o \
- sysctl.o sysctl_binary.o capability.o ptrace.o user.o \
+ sysctl.o capability.o ptrace.o user.o \
signal.o sys.o umh.o workqueue.o pid.o task_work.o \
extable.o params.o \
kthread.o sys_ni.o nsproxy.o \
notifier.o ksysfs.o cred.o reboot.o \
- async.o range.o smpboot.o ucount.o
+ async.o range.o smpboot.o ucount.o regset.o
+obj-$(CONFIG_BPFILTER) += usermode_driver.o
obj-$(CONFIG_MODULES) += kmod.o
obj-$(CONFIG_MULTIUSER) += groups.o
@@ -35,7 +36,7 @@ KCOV_INSTRUMENT_stacktrace.o := n
KCOV_INSTRUMENT_kcov.o := n
KASAN_SANITIZE_kcov.o := n
KCSAN_SANITIZE_kcov.o := n
-CFLAGS_kcov.o := $(call cc-option, -fno-conserve-stack -fno-stack-protector)
+CFLAGS_kcov.o := $(call cc-option, -fno-conserve-stack) -fno-stack-protector
# cond_syscall is currently not LTO compatible
CFLAGS_sys_ni.o = $(DISABLE_LTO)
@@ -48,6 +49,7 @@ obj-y += irq/
obj-y += rcu/
obj-y += livepatch/
obj-y += dma/
+obj-y += entry/
obj-$(CONFIG_CHECKPOINT_RESTORE) += kcmp.o
obj-$(CONFIG_FREEZER) += freezer.o
@@ -125,6 +127,7 @@ obj-$(CONFIG_WATCH_QUEUE) += watch_queue.o
obj-$(CONFIG_SYSCTL_KUNIT_TEST) += sysctl-test.o
+CFLAGS_stackleak.o += $(DISABLE_STACKLEAK_PLUGIN)
obj-$(CONFIG_GCC_PLUGIN_STACKLEAK) += stackleak.o
KASAN_SANITIZE_stackleak.o := n
KCSAN_SANITIZE_stackleak.o := n
diff --git a/kernel/async.c b/kernel/async.c
index 4f9c1d614016..33258e6e20f8 100644
--- a/kernel/async.c
+++ b/kernel/async.c
@@ -111,7 +111,7 @@ static void async_run_entry_fn(struct work_struct *work)
struct async_entry *entry =
container_of(work, struct async_entry, work);
unsigned long flags;
- ktime_t uninitialized_var(calltime), delta, rettime;
+ ktime_t calltime, delta, rettime;
/* 1) run (and print duration) */
if (initcall_debug && system_state < SYSTEM_RUNNING) {
@@ -287,7 +287,7 @@ EXPORT_SYMBOL_GPL(async_synchronize_full_domain);
*/
void async_synchronize_cookie_domain(async_cookie_t cookie, struct async_domain *domain)
{
- ktime_t uninitialized_var(starttime), delta, endtime;
+ ktime_t starttime, delta, endtime;
if (initcall_debug && system_state < SYSTEM_RUNNING) {
pr_debug("async_waiting @ %i\n", task_pid_nr(current));
diff --git a/kernel/audit.c b/kernel/audit.c
index b2301bdc9773..7efaece534a9 100644
--- a/kernel/audit.c
+++ b/kernel/audit.c
@@ -136,6 +136,11 @@ u32 audit_sig_sid = 0;
*/
static atomic_t audit_lost = ATOMIC_INIT(0);
+/* Monotonically increasing sum of time the kernel has spent
+ * waiting while the backlog limit is exceeded.
+ */
+static atomic_t audit_backlog_wait_time_actual = ATOMIC_INIT(0);
+
/* Hash for inode-based rules */
struct list_head audit_inode_hash[AUDIT_INODE_BUCKETS];
@@ -1201,17 +1206,18 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
case AUDIT_GET: {
struct audit_status s;
memset(&s, 0, sizeof(s));
- s.enabled = audit_enabled;
- s.failure = audit_failure;
+ s.enabled = audit_enabled;
+ s.failure = audit_failure;
/* NOTE: use pid_vnr() so the PID is relative to the current
* namespace */
- s.pid = auditd_pid_vnr();
- s.rate_limit = audit_rate_limit;
- s.backlog_limit = audit_backlog_limit;
- s.lost = atomic_read(&audit_lost);
- s.backlog = skb_queue_len(&audit_queue);
- s.feature_bitmap = AUDIT_FEATURE_BITMAP_ALL;
- s.backlog_wait_time = audit_backlog_wait_time;
+ s.pid = auditd_pid_vnr();
+ s.rate_limit = audit_rate_limit;
+ s.backlog_limit = audit_backlog_limit;
+ s.lost = atomic_read(&audit_lost);
+ s.backlog = skb_queue_len(&audit_queue);
+ s.feature_bitmap = AUDIT_FEATURE_BITMAP_ALL;
+ s.backlog_wait_time = audit_backlog_wait_time;
+ s.backlog_wait_time_actual = atomic_read(&audit_backlog_wait_time_actual);
audit_send_reply(skb, seq, AUDIT_GET, 0, 0, &s, sizeof(s));
break;
}
@@ -1315,6 +1321,12 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
audit_log_config_change("lost", 0, lost, 1);
return lost;
}
+ if (s.mask == AUDIT_STATUS_BACKLOG_WAIT_TIME_ACTUAL) {
+ u32 actual = atomic_xchg(&audit_backlog_wait_time_actual, 0);
+
+ audit_log_config_change("backlog_wait_time_actual", 0, actual, 1);
+ return actual;
+ }
break;
}
case AUDIT_GET_FEATURE:
@@ -1800,7 +1812,7 @@ struct audit_buffer *audit_log_start(struct audit_context *ctx, gfp_t gfp_mask,
{
struct audit_buffer *ab;
struct timespec64 t;
- unsigned int uninitialized_var(serial);
+ unsigned int serial;
if (audit_initialized != AUDIT_INITIALIZED)
return NULL;
@@ -1826,12 +1838,15 @@ struct audit_buffer *audit_log_start(struct audit_context *ctx, gfp_t gfp_mask,
/* sleep if we are allowed and we haven't exhausted our
* backlog wait limit */
if (gfpflags_allow_blocking(gfp_mask) && (stime > 0)) {
+ long rtime = stime;
+
DECLARE_WAITQUEUE(wait, current);
add_wait_queue_exclusive(&audit_backlog_wait,
&wait);
set_current_state(TASK_UNINTERRUPTIBLE);
- stime = schedule_timeout(stime);
+ stime = schedule_timeout(rtime);
+ atomic_add(rtime - stime, &audit_backlog_wait_time_actual);
remove_wait_queue(&audit_backlog_wait, &wait);
} else {
if (audit_rate_check() && printk_ratelimit())
@@ -2079,13 +2094,13 @@ void audit_log_d_path(struct audit_buffer *ab, const char *prefix,
/* We will allow 11 spaces for ' (deleted)' to be appended */
pathname = kmalloc(PATH_MAX+11, ab->gfp_mask);
if (!pathname) {
- audit_log_string(ab, "<no_memory>");
+ audit_log_format(ab, "\"<no_memory>\"");
return;
}
p = d_path(path, pathname, PATH_MAX+11);
if (IS_ERR(p)) { /* Should never happen since we send PATH_MAX */
/* FIXME: can we save some information here? */
- audit_log_string(ab, "<too_long>");
+ audit_log_format(ab, "\"<too_long>\"");
} else
audit_log_untrustedstring(ab, p);
kfree(pathname);
diff --git a/kernel/audit_fsnotify.c b/kernel/audit_fsnotify.c
index 3596448bfdab..bfcfcd61adb6 100644
--- a/kernel/audit_fsnotify.c
+++ b/kernel/audit_fsnotify.c
@@ -36,7 +36,7 @@ static struct fsnotify_group *audit_fsnotify_group;
/* fsnotify events we care about. */
#define AUDIT_FS_EVENTS (FS_MOVE | FS_CREATE | FS_DELETE | FS_DELETE_SELF |\
- FS_MOVE_SELF | FS_EVENT_ON_CHILD)
+ FS_MOVE_SELF)
static void audit_fsnotify_mark_free(struct audit_fsnotify_mark *audit_mark)
{
@@ -152,35 +152,31 @@ static void audit_autoremove_mark_rule(struct audit_fsnotify_mark *audit_mark)
}
/* Update mark data in audit rules based on fsnotify events. */
-static int audit_mark_handle_event(struct fsnotify_group *group,
- struct inode *to_tell,
- u32 mask, const void *data, int data_type,
- const struct qstr *dname, u32 cookie,
- struct fsnotify_iter_info *iter_info)
+static int audit_mark_handle_event(struct fsnotify_mark *inode_mark, u32 mask,
+ struct inode *inode, struct inode *dir,
+ const struct qstr *dname)
{
- struct fsnotify_mark *inode_mark = fsnotify_iter_inode_mark(iter_info);
struct audit_fsnotify_mark *audit_mark;
- const struct inode *inode = fsnotify_data_inode(data, data_type);
audit_mark = container_of(inode_mark, struct audit_fsnotify_mark, mark);
- BUG_ON(group != audit_fsnotify_group);
-
- if (WARN_ON(!inode))
+ if (WARN_ON_ONCE(inode_mark->group != audit_fsnotify_group) ||
+ WARN_ON_ONCE(!inode))
return 0;
if (mask & (FS_CREATE|FS_MOVED_TO|FS_DELETE|FS_MOVED_FROM)) {
if (audit_compare_dname_path(dname, audit_mark->path, AUDIT_NAME_FULL))
return 0;
audit_update_mark(audit_mark, inode);
- } else if (mask & (FS_DELETE_SELF|FS_UNMOUNT|FS_MOVE_SELF))
+ } else if (mask & (FS_DELETE_SELF|FS_UNMOUNT|FS_MOVE_SELF)) {
audit_autoremove_mark_rule(audit_mark);
+ }
return 0;
}
static const struct fsnotify_ops audit_mark_fsnotify_ops = {
- .handle_event = audit_mark_handle_event,
+ .handle_inode_event = audit_mark_handle_event,
.free_mark = audit_fsnotify_free_mark,
};
diff --git a/kernel/audit_tree.c b/kernel/audit_tree.c
index e49c912f862d..83e1c07fc99e 100644
--- a/kernel/audit_tree.c
+++ b/kernel/audit_tree.c
@@ -188,11 +188,9 @@ static struct fsnotify_mark *alloc_mark(void)
static struct audit_chunk *alloc_chunk(int count)
{
struct audit_chunk *chunk;
- size_t size;
int i;
- size = offsetof(struct audit_chunk, owners) + count * sizeof(struct node);
- chunk = kzalloc(size, GFP_KERNEL);
+ chunk = kzalloc(struct_size(chunk, owners, count), GFP_KERNEL);
if (!chunk)
return NULL;
@@ -1037,11 +1035,9 @@ static void evict_chunk(struct audit_chunk *chunk)
audit_schedule_prune();
}
-static int audit_tree_handle_event(struct fsnotify_group *group,
- struct inode *to_tell,
- u32 mask, const void *data, int data_type,
- const struct qstr *file_name, u32 cookie,
- struct fsnotify_iter_info *iter_info)
+static int audit_tree_handle_event(struct fsnotify_mark *mark, u32 mask,
+ struct inode *inode, struct inode *dir,
+ const struct qstr *file_name)
{
return 0;
}
@@ -1070,7 +1066,7 @@ static void audit_tree_freeing_mark(struct fsnotify_mark *mark,
}
static const struct fsnotify_ops audit_tree_ops = {
- .handle_event = audit_tree_handle_event,
+ .handle_inode_event = audit_tree_handle_event,
.freeing_mark = audit_tree_freeing_mark,
.free_mark = audit_tree_destroy_watch,
};
diff --git a/kernel/audit_watch.c b/kernel/audit_watch.c
index e09c551ae52d..246e5ba704c0 100644
--- a/kernel/audit_watch.c
+++ b/kernel/audit_watch.c
@@ -53,7 +53,7 @@ static struct fsnotify_group *audit_watch_group;
/* fsnotify events we care about. */
#define AUDIT_FS_WATCH (FS_MOVE | FS_CREATE | FS_DELETE | FS_DELETE_SELF |\
- FS_MOVE_SELF | FS_EVENT_ON_CHILD | FS_UNMOUNT)
+ FS_MOVE_SELF | FS_UNMOUNT)
static void audit_free_parent(struct audit_parent *parent)
{
@@ -464,20 +464,17 @@ void audit_remove_watch_rule(struct audit_krule *krule)
}
/* Update watch data in audit rules based on fsnotify events. */
-static int audit_watch_handle_event(struct fsnotify_group *group,
- struct inode *to_tell,
- u32 mask, const void *data, int data_type,
- const struct qstr *dname, u32 cookie,
- struct fsnotify_iter_info *iter_info)
+static int audit_watch_handle_event(struct fsnotify_mark *inode_mark, u32 mask,
+ struct inode *inode, struct inode *dir,
+ const struct qstr *dname)
{
- struct fsnotify_mark *inode_mark = fsnotify_iter_inode_mark(iter_info);
- const struct inode *inode = fsnotify_data_inode(data, data_type);
struct audit_parent *parent;
parent = container_of(inode_mark, struct audit_parent, mark);
- BUG_ON(group != audit_watch_group);
- WARN_ON(!inode);
+ if (WARN_ON_ONCE(inode_mark->group != audit_watch_group) ||
+ WARN_ON_ONCE(!inode))
+ return 0;
if (mask & (FS_CREATE|FS_MOVED_TO) && inode)
audit_update_watch(parent, dname, inode->i_sb->s_dev, inode->i_ino, 0);
@@ -490,7 +487,7 @@ static int audit_watch_handle_event(struct fsnotify_group *group,
}
static const struct fsnotify_ops audit_watch_fsnotify_ops = {
- .handle_event = audit_watch_handle_event,
+ .handle_inode_event = audit_watch_handle_event,
.free_mark = audit_watch_free_mark,
};
diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index fd840c40abf7..8dba8f0983b5 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
@@ -75,6 +75,7 @@
#include <linux/uaccess.h>
#include <linux/fsnotify_backend.h>
#include <uapi/linux/limits.h>
+#include <uapi/linux/netfilter/nf_tables.h>
#include "audit.h"
@@ -136,9 +137,26 @@ struct audit_nfcfgop_tab {
};
static const struct audit_nfcfgop_tab audit_nfcfgs[] = {
- { AUDIT_XT_OP_REGISTER, "register" },
- { AUDIT_XT_OP_REPLACE, "replace" },
- { AUDIT_XT_OP_UNREGISTER, "unregister" },
+ { AUDIT_XT_OP_REGISTER, "xt_register" },
+ { AUDIT_XT_OP_REPLACE, "xt_replace" },
+ { AUDIT_XT_OP_UNREGISTER, "xt_unregister" },
+ { AUDIT_NFT_OP_TABLE_REGISTER, "nft_register_table" },
+ { AUDIT_NFT_OP_TABLE_UNREGISTER, "nft_unregister_table" },
+ { AUDIT_NFT_OP_CHAIN_REGISTER, "nft_register_chain" },
+ { AUDIT_NFT_OP_CHAIN_UNREGISTER, "nft_unregister_chain" },
+ { AUDIT_NFT_OP_RULE_REGISTER, "nft_register_rule" },
+ { AUDIT_NFT_OP_RULE_UNREGISTER, "nft_unregister_rule" },
+ { AUDIT_NFT_OP_SET_REGISTER, "nft_register_set" },
+ { AUDIT_NFT_OP_SET_UNREGISTER, "nft_unregister_set" },
+ { AUDIT_NFT_OP_SETELEM_REGISTER, "nft_register_setelem" },
+ { AUDIT_NFT_OP_SETELEM_UNREGISTER, "nft_unregister_setelem" },
+ { AUDIT_NFT_OP_GEN_REGISTER, "nft_register_gen" },
+ { AUDIT_NFT_OP_OBJ_REGISTER, "nft_register_obj" },
+ { AUDIT_NFT_OP_OBJ_UNREGISTER, "nft_unregister_obj" },
+ { AUDIT_NFT_OP_OBJ_RESET, "nft_reset_obj" },
+ { AUDIT_NFT_OP_FLOWTABLE_REGISTER, "nft_register_flowtable" },
+ { AUDIT_NFT_OP_FLOWTABLE_UNREGISTER, "nft_unregister_flowtable" },
+ { AUDIT_NFT_OP_INVALID, "nft_invalid" },
};
static int audit_match_perm(struct audit_context *ctx, int mask)
@@ -1876,6 +1894,20 @@ __audit_reusename(const __user char *uptr)
return NULL;
}
+inline void _audit_getcwd(struct audit_context *context)
+{
+ if (!context->pwd.dentry)
+ get_fs_pwd(current->fs, &context->pwd);
+}
+
+void __audit_getcwd(void)
+{
+ struct audit_context *context = audit_context();
+
+ if (context->in_syscall)
+ _audit_getcwd(context);
+}
+
/**
* __audit_getname - add a name to the list
* @name: name to add
@@ -1900,8 +1932,7 @@ void __audit_getname(struct filename *name)
name->aname = n;
name->refcnt++;
- if (!context->pwd.dentry)
- get_fs_pwd(current->fs, &context->pwd);
+ _audit_getcwd(context);
}
static inline int audit_copy_fcaps(struct audit_names *name,
@@ -2557,12 +2588,12 @@ void __audit_ntp_log(const struct audit_ntp_data *ad)
}
void __audit_log_nfcfg(const char *name, u8 af, unsigned int nentries,
- enum audit_nfcfgop op)
+ enum audit_nfcfgop op, gfp_t gfp)
{
struct audit_buffer *ab;
char comm[sizeof(current->comm)];
- ab = audit_log_start(audit_context(), GFP_KERNEL, AUDIT_NETFILTER_CFG);
+ ab = audit_log_start(audit_context(), gfp, AUDIT_NETFILTER_CFG);
if (!ab)
return;
audit_log_format(ab, "table=%s family=%u entries=%u op=%s",
diff --git a/kernel/backtracetest.c b/kernel/backtracetest.c
index a2a97fa3071b..370217dd7e39 100644
--- a/kernel/backtracetest.c
+++ b/kernel/backtracetest.c
@@ -29,7 +29,7 @@ static void backtrace_test_irq_callback(unsigned long data)
complete(&backtrace_work);
}
-static DECLARE_TASKLET(backtrace_tasklet, &backtrace_test_irq_callback, 0);
+static DECLARE_TASKLET_OLD(backtrace_tasklet, &backtrace_test_irq_callback);
static void backtrace_test_irq(void)
{
diff --git a/kernel/bpf/Makefile b/kernel/bpf/Makefile
index 1131a921e1a6..e6eb9c0402da 100644
--- a/kernel/bpf/Makefile
+++ b/kernel/bpf/Makefile
@@ -2,7 +2,7 @@
obj-y := core.o
CFLAGS_core.o += $(call cc-disable-warning, override-init)
-obj-$(CONFIG_BPF_SYSCALL) += syscall.o verifier.o inode.o helpers.o tnum.o bpf_iter.o map_iter.o task_iter.o
+obj-$(CONFIG_BPF_SYSCALL) += syscall.o verifier.o inode.o helpers.o tnum.o bpf_iter.o map_iter.o task_iter.o prog_iter.o
obj-$(CONFIG_BPF_SYSCALL) += hashtab.o arraymap.o percpu_freelist.o bpf_lru_list.o lpm_trie.o map_in_map.o
obj-$(CONFIG_BPF_SYSCALL) += local_storage.o queue_stack_maps.o ringbuf.o
obj-$(CONFIG_BPF_SYSCALL) += disasm.o
diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c
index 11584618e861..8ff419b632a6 100644
--- a/kernel/bpf/arraymap.c
+++ b/kernel/bpf/arraymap.c
@@ -386,13 +386,6 @@ static void array_map_free(struct bpf_map *map)
{
struct bpf_array *array = container_of(map, struct bpf_array, map);
- /* at this point bpf_prog->aux->refcnt == 0 and this map->refcnt == 0,
- * so the programs (can be more than one that used this map) were
- * disconnected from events. Wait for outstanding programs to complete
- * and free the array
- */
- synchronize_rcu();
-
if (array->map.map_type == BPF_MAP_TYPE_PERCPU_ARRAY)
bpf_array_free_percpu(array);
@@ -494,6 +487,143 @@ static int array_map_mmap(struct bpf_map *map, struct vm_area_struct *vma)
vma->vm_pgoff + pgoff);
}
+struct bpf_iter_seq_array_map_info {
+ struct bpf_map *map;
+ void *percpu_value_buf;
+ u32 index;
+};
+
+static void *bpf_array_map_seq_start(struct seq_file *seq, loff_t *pos)
+{
+ struct bpf_iter_seq_array_map_info *info = seq->private;
+ struct bpf_map *map = info->map;
+ struct bpf_array *array;
+ u32 index;
+
+ if (info->index >= map->max_entries)
+ return NULL;
+
+ if (*pos == 0)
+ ++*pos;
+ array = container_of(map, struct bpf_array, map);
+ index = info->index & array->index_mask;
+ if (info->percpu_value_buf)
+ return array->pptrs[index];
+ return array->value + array->elem_size * index;
+}
+
+static void *bpf_array_map_seq_next(struct seq_file *seq, void *v, loff_t *pos)
+{
+ struct bpf_iter_seq_array_map_info *info = seq->private;
+ struct bpf_map *map = info->map;
+ struct bpf_array *array;
+ u32 index;
+
+ ++*pos;
+ ++info->index;
+ if (info->index >= map->max_entries)
+ return NULL;
+
+ array = container_of(map, struct bpf_array, map);
+ index = info->index & array->index_mask;
+ if (info->percpu_value_buf)
+ return array->pptrs[index];
+ return array->value + array->elem_size * index;
+}
+
+static int __bpf_array_map_seq_show(struct seq_file *seq, void *v)
+{
+ struct bpf_iter_seq_array_map_info *info = seq->private;
+ struct bpf_iter__bpf_map_elem ctx = {};
+ struct bpf_map *map = info->map;
+ struct bpf_iter_meta meta;
+ struct bpf_prog *prog;
+ int off = 0, cpu = 0;
+ void __percpu **pptr;
+ u32 size;
+
+ meta.seq = seq;
+ prog = bpf_iter_get_info(&meta, v == NULL);
+ if (!prog)
+ return 0;
+
+ ctx.meta = &meta;
+ ctx.map = info->map;
+ if (v) {
+ ctx.key = &info->index;
+
+ if (!info->percpu_value_buf) {
+ ctx.value = v;
+ } else {
+ pptr = v;
+ size = round_up(map->value_size, 8);
+ for_each_possible_cpu(cpu) {
+ bpf_long_memcpy(info->percpu_value_buf + off,
+ per_cpu_ptr(pptr, cpu),
+ size);
+ off += size;
+ }
+ ctx.value = info->percpu_value_buf;
+ }
+ }
+
+ return bpf_iter_run_prog(prog, &ctx);
+}
+
+static int bpf_array_map_seq_show(struct seq_file *seq, void *v)
+{
+ return __bpf_array_map_seq_show(seq, v);
+}
+
+static void bpf_array_map_seq_stop(struct seq_file *seq, void *v)
+{
+ if (!v)
+ (void)__bpf_array_map_seq_show(seq, NULL);
+}
+
+static int bpf_iter_init_array_map(void *priv_data,
+ struct bpf_iter_aux_info *aux)
+{
+ struct bpf_iter_seq_array_map_info *seq_info = priv_data;
+ struct bpf_map *map = aux->map;
+ void *value_buf;
+ u32 buf_size;
+
+ if (map->map_type == BPF_MAP_TYPE_PERCPU_ARRAY) {
+ buf_size = round_up(map->value_size, 8) * num_possible_cpus();
+ value_buf = kmalloc(buf_size, GFP_USER | __GFP_NOWARN);
+ if (!value_buf)
+ return -ENOMEM;
+
+ seq_info->percpu_value_buf = value_buf;
+ }
+
+ seq_info->map = map;
+ return 0;
+}
+
+static void bpf_iter_fini_array_map(void *priv_data)
+{
+ struct bpf_iter_seq_array_map_info *seq_info = priv_data;
+
+ kfree(seq_info->percpu_value_buf);
+}
+
+static const struct seq_operations bpf_array_map_seq_ops = {
+ .start = bpf_array_map_seq_start,
+ .next = bpf_array_map_seq_next,
+ .stop = bpf_array_map_seq_stop,
+ .show = bpf_array_map_seq_show,
+};
+
+static const struct bpf_iter_seq_info iter_seq_info = {
+ .seq_ops = &bpf_array_map_seq_ops,
+ .init_seq_private = bpf_iter_init_array_map,
+ .fini_seq_private = bpf_iter_fini_array_map,
+ .seq_priv_size = sizeof(struct bpf_iter_seq_array_map_info),
+};
+
+static int array_map_btf_id;
const struct bpf_map_ops array_map_ops = {
.map_alloc_check = array_map_alloc_check,
.map_alloc = array_map_alloc,
@@ -510,8 +640,12 @@ const struct bpf_map_ops array_map_ops = {
.map_check_btf = array_map_check_btf,
.map_lookup_batch = generic_map_lookup_batch,
.map_update_batch = generic_map_update_batch,
+ .map_btf_name = "bpf_array",
+ .map_btf_id = &array_map_btf_id,
+ .iter_seq_info = &iter_seq_info,
};
+static int percpu_array_map_btf_id;
const struct bpf_map_ops percpu_array_map_ops = {
.map_alloc_check = array_map_alloc_check,
.map_alloc = array_map_alloc,
@@ -522,6 +656,9 @@ const struct bpf_map_ops percpu_array_map_ops = {
.map_delete_elem = array_map_delete_elem,
.map_seq_show_elem = percpu_array_map_seq_show_elem,
.map_check_btf = array_map_check_btf,
+ .map_btf_name = "bpf_array",
+ .map_btf_id = &percpu_array_map_btf_id,
+ .iter_seq_info = &iter_seq_info,
};
static int fd_array_map_alloc_check(union bpf_attr *attr)
@@ -540,8 +677,6 @@ static void fd_array_map_free(struct bpf_map *map)
struct bpf_array *array = container_of(map, struct bpf_array, map);
int i;
- synchronize_rcu();
-
/* make sure it's empty */
for (i = 0; i < array->map.max_entries; i++)
BUG_ON(array->ptrs[i] != NULL);
@@ -868,6 +1003,7 @@ static void prog_array_map_free(struct bpf_map *map)
fd_array_map_free(map);
}
+static int prog_array_map_btf_id;
const struct bpf_map_ops prog_array_map_ops = {
.map_alloc_check = fd_array_map_alloc_check,
.map_alloc = prog_array_map_alloc,
@@ -883,6 +1019,8 @@ const struct bpf_map_ops prog_array_map_ops = {
.map_fd_sys_lookup_elem = prog_fd_array_sys_lookup_elem,
.map_release_uref = prog_array_map_clear,
.map_seq_show_elem = prog_array_map_seq_show_elem,
+ .map_btf_name = "bpf_array",
+ .map_btf_id = &prog_array_map_btf_id,
};
static struct bpf_event_entry *bpf_event_entry_gen(struct file *perf_file,
@@ -961,6 +1099,7 @@ static void perf_event_fd_array_release(struct bpf_map *map,
rcu_read_unlock();
}
+static int perf_event_array_map_btf_id;
const struct bpf_map_ops perf_event_array_map_ops = {
.map_alloc_check = fd_array_map_alloc_check,
.map_alloc = array_map_alloc,
@@ -972,6 +1111,8 @@ const struct bpf_map_ops perf_event_array_map_ops = {
.map_fd_put_ptr = perf_event_fd_array_put_ptr,
.map_release = perf_event_fd_array_release,
.map_check_btf = map_check_no_btf,
+ .map_btf_name = "bpf_array",
+ .map_btf_id = &perf_event_array_map_btf_id,
};
#ifdef CONFIG_CGROUPS
@@ -994,6 +1135,7 @@ static void cgroup_fd_array_free(struct bpf_map *map)
fd_array_map_free(map);
}
+static int cgroup_array_map_btf_id;
const struct bpf_map_ops cgroup_array_map_ops = {
.map_alloc_check = fd_array_map_alloc_check,
.map_alloc = array_map_alloc,
@@ -1004,6 +1146,8 @@ const struct bpf_map_ops cgroup_array_map_ops = {
.map_fd_get_ptr = cgroup_fd_array_get_ptr,
.map_fd_put_ptr = cgroup_fd_array_put_ptr,
.map_check_btf = map_check_no_btf,
+ .map_btf_name = "bpf_array",
+ .map_btf_id = &cgroup_array_map_btf_id,
};
#endif
@@ -1077,6 +1221,7 @@ static u32 array_of_map_gen_lookup(struct bpf_map *map,
return insn - insn_buf;
}
+static int array_of_maps_map_btf_id;
const struct bpf_map_ops array_of_maps_map_ops = {
.map_alloc_check = fd_array_map_alloc_check,
.map_alloc = array_of_map_alloc,
@@ -1089,4 +1234,6 @@ const struct bpf_map_ops array_of_maps_map_ops = {
.map_fd_sys_lookup_elem = bpf_map_fd_sys_lookup_elem,
.map_gen_lookup = array_of_map_gen_lookup,
.map_check_btf = map_check_no_btf,
+ .map_btf_name = "bpf_array",
+ .map_btf_id = &array_of_maps_map_btf_id,
};
diff --git a/kernel/bpf/bpf_iter.c b/kernel/bpf/bpf_iter.c
index dd612b80b9fe..b6715964b685 100644
--- a/kernel/bpf/bpf_iter.c
+++ b/kernel/bpf/bpf_iter.c
@@ -14,11 +14,13 @@ struct bpf_iter_target_info {
struct bpf_iter_link {
struct bpf_link link;
+ struct bpf_iter_aux_info aux;
struct bpf_iter_target_info *tinfo;
};
struct bpf_iter_priv_data {
struct bpf_iter_target_info *tinfo;
+ const struct bpf_iter_seq_info *seq_info;
struct bpf_prog *prog;
u64 session_id;
u64 seq_num;
@@ -35,7 +37,8 @@ static DEFINE_MUTEX(link_mutex);
/* incremented on every opened seq_file */
static atomic64_t session_id;
-static int prepare_seq_file(struct file *file, struct bpf_iter_link *link);
+static int prepare_seq_file(struct file *file, struct bpf_iter_link *link,
+ const struct bpf_iter_seq_info *seq_info);
static void bpf_iter_inc_seq_num(struct seq_file *seq)
{
@@ -199,11 +202,25 @@ done:
return copied;
}
+static const struct bpf_iter_seq_info *
+__get_seq_info(struct bpf_iter_link *link)
+{
+ const struct bpf_iter_seq_info *seq_info;
+
+ if (link->aux.map) {
+ seq_info = link->aux.map->ops->iter_seq_info;
+ if (seq_info)
+ return seq_info;
+ }
+
+ return link->tinfo->reg_info->seq_info;
+}
+
static int iter_open(struct inode *inode, struct file *file)
{
struct bpf_iter_link *link = inode->i_private;
- return prepare_seq_file(file, link);
+ return prepare_seq_file(file, link, __get_seq_info(link));
}
static int iter_release(struct inode *inode, struct file *file)
@@ -218,8 +235,8 @@ static int iter_release(struct inode *inode, struct file *file)
iter_priv = container_of(seq->private, struct bpf_iter_priv_data,
target_private);
- if (iter_priv->tinfo->reg_info->fini_seq_private)
- iter_priv->tinfo->reg_info->fini_seq_private(seq->private);
+ if (iter_priv->seq_info->fini_seq_private)
+ iter_priv->seq_info->fini_seq_private(seq->private);
bpf_prog_put(iter_priv->prog);
seq->private = iter_priv;
@@ -318,6 +335,11 @@ bool bpf_iter_prog_supported(struct bpf_prog *prog)
static void bpf_iter_link_release(struct bpf_link *link)
{
+ struct bpf_iter_link *iter_link =
+ container_of(link, struct bpf_iter_link, link);
+
+ if (iter_link->tinfo->reg_info->detach_target)
+ iter_link->tinfo->reg_info->detach_target(&iter_link->aux);
}
static void bpf_iter_link_dealloc(struct bpf_link *link)
@@ -368,16 +390,35 @@ bool bpf_link_is_iter(struct bpf_link *link)
int bpf_iter_link_attach(const union bpf_attr *attr, struct bpf_prog *prog)
{
+ union bpf_iter_link_info __user *ulinfo;
struct bpf_link_primer link_primer;
struct bpf_iter_target_info *tinfo;
+ union bpf_iter_link_info linfo;
struct bpf_iter_link *link;
+ u32 prog_btf_id, linfo_len;
bool existed = false;
- u32 prog_btf_id;
int err;
if (attr->link_create.target_fd || attr->link_create.flags)
return -EINVAL;
+ memset(&linfo, 0, sizeof(union bpf_iter_link_info));
+
+ ulinfo = u64_to_user_ptr(attr->link_create.iter_info);
+ linfo_len = attr->link_create.iter_info_len;
+ if (!ulinfo ^ !linfo_len)
+ return -EINVAL;
+
+ if (ulinfo) {
+ err = bpf_check_uarg_tail_zero(ulinfo, sizeof(linfo),
+ linfo_len);
+ if (err)
+ return err;
+ linfo_len = min_t(u32, linfo_len, sizeof(linfo));
+ if (copy_from_user(&linfo, ulinfo, linfo_len))
+ return -EFAULT;
+ }
+
prog_btf_id = prog->aux->attach_btf_id;
mutex_lock(&targets_mutex);
list_for_each_entry(tinfo, &targets, list) {
@@ -403,21 +444,32 @@ int bpf_iter_link_attach(const union bpf_attr *attr, struct bpf_prog *prog)
return err;
}
+ if (tinfo->reg_info->attach_target) {
+ err = tinfo->reg_info->attach_target(prog, &linfo, &link->aux);
+ if (err) {
+ bpf_link_cleanup(&link_primer);
+ return err;
+ }
+ }
+
return bpf_link_settle(&link_primer);
}
static void init_seq_meta(struct bpf_iter_priv_data *priv_data,
struct bpf_iter_target_info *tinfo,
+ const struct bpf_iter_seq_info *seq_info,
struct bpf_prog *prog)
{
priv_data->tinfo = tinfo;
+ priv_data->seq_info = seq_info;
priv_data->prog = prog;
priv_data->session_id = atomic64_inc_return(&session_id);
priv_data->seq_num = 0;
priv_data->done_stop = false;
}
-static int prepare_seq_file(struct file *file, struct bpf_iter_link *link)
+static int prepare_seq_file(struct file *file, struct bpf_iter_link *link,
+ const struct bpf_iter_seq_info *seq_info)
{
struct bpf_iter_priv_data *priv_data;
struct bpf_iter_target_info *tinfo;
@@ -433,21 +485,21 @@ static int prepare_seq_file(struct file *file, struct bpf_iter_link *link)
tinfo = link->tinfo;
total_priv_dsize = offsetof(struct bpf_iter_priv_data, target_private) +
- tinfo->reg_info->seq_priv_size;
- priv_data = __seq_open_private(file, tinfo->reg_info->seq_ops,
+ seq_info->seq_priv_size;
+ priv_data = __seq_open_private(file, seq_info->seq_ops,
total_priv_dsize);
if (!priv_data) {
err = -ENOMEM;
goto release_prog;
}
- if (tinfo->reg_info->init_seq_private) {
- err = tinfo->reg_info->init_seq_private(priv_data->target_private);
+ if (seq_info->init_seq_private) {
+ err = seq_info->init_seq_private(priv_data->target_private, &link->aux);
if (err)
goto release_seq_file;
}
- init_seq_meta(priv_data, tinfo, prog);
+ init_seq_meta(priv_data, tinfo, seq_info, prog);
seq = file->private_data;
seq->private = priv_data->target_private;
@@ -463,6 +515,7 @@ release_prog:
int bpf_iter_new_fd(struct bpf_link *link)
{
+ struct bpf_iter_link *iter_link;
struct file *file;
unsigned int flags;
int err, fd;
@@ -481,8 +534,8 @@ int bpf_iter_new_fd(struct bpf_link *link)
goto free_fd;
}
- err = prepare_seq_file(file,
- container_of(link, struct bpf_iter_link, link));
+ iter_link = container_of(link, struct bpf_iter_link, link);
+ err = prepare_seq_file(file, iter_link, __get_seq_info(iter_link));
if (err)
goto free_file;
diff --git a/kernel/bpf/bpf_struct_ops.c b/kernel/bpf/bpf_struct_ops.c
index c6b0decaa46a..969c5d47f81f 100644
--- a/kernel/bpf/bpf_struct_ops.c
+++ b/kernel/bpf/bpf_struct_ops.c
@@ -611,6 +611,7 @@ static struct bpf_map *bpf_struct_ops_map_alloc(union bpf_attr *attr)
return map;
}
+static int bpf_struct_ops_map_btf_id;
const struct bpf_map_ops bpf_struct_ops_map_ops = {
.map_alloc_check = bpf_struct_ops_map_alloc_check,
.map_alloc = bpf_struct_ops_map_alloc,
@@ -620,6 +621,8 @@ const struct bpf_map_ops bpf_struct_ops_map_ops = {
.map_delete_elem = bpf_struct_ops_map_delete_elem,
.map_update_elem = bpf_struct_ops_map_update_elem,
.map_seq_show_elem = bpf_struct_ops_map_seq_show_elem,
+ .map_btf_name = "bpf_struct_ops_map",
+ .map_btf_id = &bpf_struct_ops_map_btf_id,
};
/* "const void *" because some subsystem is
diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c
index 0443600146dc..91afdd4c82e3 100644
--- a/kernel/bpf/btf.c
+++ b/kernel/bpf/btf.c
@@ -18,6 +18,7 @@
#include <linux/sort.h>
#include <linux/bpf_verifier.h>
#include <linux/btf.h>
+#include <linux/btf_ids.h>
#include <linux/skmsg.h>
#include <linux/perf_event.h>
#include <net/sock.h>
@@ -3571,6 +3572,41 @@ btf_get_prog_ctx_type(struct bpf_verifier_log *log, struct btf *btf,
return ctx_type;
}
+static const struct bpf_map_ops * const btf_vmlinux_map_ops[] = {
+#define BPF_PROG_TYPE(_id, _name, prog_ctx_type, kern_ctx_type)
+#define BPF_LINK_TYPE(_id, _name)
+#define BPF_MAP_TYPE(_id, _ops) \
+ [_id] = &_ops,
+#include <linux/bpf_types.h>
+#undef BPF_PROG_TYPE
+#undef BPF_LINK_TYPE
+#undef BPF_MAP_TYPE
+};
+
+static int btf_vmlinux_map_ids_init(const struct btf *btf,
+ struct bpf_verifier_log *log)
+{
+ const struct bpf_map_ops *ops;
+ int i, btf_id;
+
+ for (i = 0; i < ARRAY_SIZE(btf_vmlinux_map_ops); ++i) {
+ ops = btf_vmlinux_map_ops[i];
+ if (!ops || (!ops->map_btf_name && !ops->map_btf_id))
+ continue;
+ if (!ops->map_btf_name || !ops->map_btf_id) {
+ bpf_log(log, "map type %d is misconfigured\n", i);
+ return -EINVAL;
+ }
+ btf_id = btf_find_by_name_kind(btf, ops->map_btf_name,
+ BTF_KIND_STRUCT);
+ if (btf_id < 0)
+ return btf_id;
+ *ops->map_btf_id = btf_id;
+ }
+
+ return 0;
+}
+
static int btf_translate_to_vmlinux(struct bpf_verifier_log *log,
struct btf *btf,
const struct btf_type *t,
@@ -3586,12 +3622,15 @@ static int btf_translate_to_vmlinux(struct bpf_verifier_log *log,
return kern_ctx_type->type;
}
+BTF_ID_LIST(bpf_ctx_convert_btf_id)
+BTF_ID(struct, bpf_ctx_convert)
+
struct btf *btf_parse_vmlinux(void)
{
struct btf_verifier_env *env = NULL;
struct bpf_verifier_log *log;
struct btf *btf = NULL;
- int err, i;
+ int err;
env = kzalloc(sizeof(*env), GFP_KERNEL | __GFP_NOWARN);
if (!env)
@@ -3624,25 +3663,13 @@ struct btf *btf_parse_vmlinux(void)
if (err)
goto errout;
- /* find struct bpf_ctx_convert for type checking later */
- for (i = 1; i <= btf->nr_types; i++) {
- const struct btf_type *t;
- const char *tname;
+ /* btf_parse_vmlinux() runs under bpf_verifier_lock */
+ bpf_ctx_convert.t = btf_type_by_id(btf, bpf_ctx_convert_btf_id[0]);
- t = btf_type_by_id(btf, i);
- if (!__btf_type_is_struct(t))
- continue;
- tname = __btf_name_by_offset(btf, t->name_off);
- if (!strcmp(tname, "bpf_ctx_convert")) {
- /* btf_parse_vmlinux() runs under bpf_verifier_lock */
- bpf_ctx_convert.t = t;
- break;
- }
- }
- if (i > btf->nr_types) {
- err = -ENOENT;
+ /* find bpf map structs for map_ptr access checking */
+ err = btf_vmlinux_map_ids_init(btf, log);
+ if (err < 0)
goto errout;
- }
bpf_struct_ops_init(btf, log);
@@ -3779,6 +3806,19 @@ bool btf_ctx_access(int off, int size, enum bpf_access_type type,
btf_kind_str[BTF_INFO_KIND(t->info)]);
return false;
}
+
+ /* check for PTR_TO_RDONLY_BUF_OR_NULL or PTR_TO_RDWR_BUF_OR_NULL */
+ for (i = 0; i < prog->aux->ctx_arg_info_size; i++) {
+ const struct bpf_ctx_arg_aux *ctx_arg_info = &prog->aux->ctx_arg_info[i];
+
+ if (ctx_arg_info->offset == off &&
+ (ctx_arg_info->reg_type == PTR_TO_RDONLY_BUF_OR_NULL ||
+ ctx_arg_info->reg_type == PTR_TO_RDWR_BUF_OR_NULL)) {
+ info->reg_type = ctx_arg_info->reg_type;
+ return true;
+ }
+ }
+
if (t->type == 0)
/* This is a pointer to void.
* It is the same as scalar from the verifier safety pov.
@@ -3790,16 +3830,17 @@ bool btf_ctx_access(int off, int size, enum bpf_access_type type,
return true;
/* this is a pointer to another type */
- info->reg_type = PTR_TO_BTF_ID;
for (i = 0; i < prog->aux->ctx_arg_info_size; i++) {
const struct bpf_ctx_arg_aux *ctx_arg_info = &prog->aux->ctx_arg_info[i];
if (ctx_arg_info->offset == off) {
info->reg_type = ctx_arg_info->reg_type;
- break;
+ info->btf_id = ctx_arg_info->btf_id;
+ return true;
}
}
+ info->reg_type = PTR_TO_BTF_ID;
if (tgt_prog) {
ret = btf_translate_to_vmlinux(log, btf, t, tgt_prog->type, arg);
if (ret > 0) {
@@ -4049,101 +4090,17 @@ error:
return -EINVAL;
}
-static int __btf_resolve_helper_id(struct bpf_verifier_log *log, void *fn,
- int arg)
-{
- char fnname[KSYM_SYMBOL_LEN + 4] = "btf_";
- const struct btf_param *args;
- const struct btf_type *t;
- const char *tname, *sym;
- u32 btf_id, i;
-
- if (!btf_vmlinux) {
- bpf_log(log, "btf_vmlinux doesn't exist\n");
- return -EINVAL;
- }
-
- if (IS_ERR(btf_vmlinux)) {
- bpf_log(log, "btf_vmlinux is malformed\n");
- return -EINVAL;
- }
-
- sym = kallsyms_lookup((long)fn, NULL, NULL, NULL, fnname + 4);
- if (!sym) {
- bpf_log(log, "kernel doesn't have kallsyms\n");
- return -EFAULT;
- }
-
- for (i = 1; i <= btf_vmlinux->nr_types; i++) {
- t = btf_type_by_id(btf_vmlinux, i);
- if (BTF_INFO_KIND(t->info) != BTF_KIND_TYPEDEF)
- continue;
- tname = __btf_name_by_offset(btf_vmlinux, t->name_off);
- if (!strcmp(tname, fnname))
- break;
- }
- if (i > btf_vmlinux->nr_types) {
- bpf_log(log, "helper %s type is not found\n", fnname);
- return -ENOENT;
- }
-
- t = btf_type_by_id(btf_vmlinux, t->type);
- if (!btf_type_is_ptr(t))
- return -EFAULT;
- t = btf_type_by_id(btf_vmlinux, t->type);
- if (!btf_type_is_func_proto(t))
- return -EFAULT;
-
- args = (const struct btf_param *)(t + 1);
- if (arg >= btf_type_vlen(t)) {
- bpf_log(log, "bpf helper %s doesn't have %d-th argument\n",
- fnname, arg);
- return -EINVAL;
- }
-
- t = btf_type_by_id(btf_vmlinux, args[arg].type);
- if (!btf_type_is_ptr(t) || !t->type) {
- /* anything but the pointer to struct is a helper config bug */
- bpf_log(log, "ARG_PTR_TO_BTF is misconfigured\n");
- return -EFAULT;
- }
- btf_id = t->type;
- t = btf_type_by_id(btf_vmlinux, t->type);
- /* skip modifiers */
- while (btf_type_is_modifier(t)) {
- btf_id = t->type;
- t = btf_type_by_id(btf_vmlinux, t->type);
- }
- if (!btf_type_is_struct(t)) {
- bpf_log(log, "ARG_PTR_TO_BTF is not a struct\n");
- return -EFAULT;
- }
- bpf_log(log, "helper %s arg%d has btf_id %d struct %s\n", fnname + 4,
- arg, btf_id, __btf_name_by_offset(btf_vmlinux, t->name_off));
- return btf_id;
-}
-
int btf_resolve_helper_id(struct bpf_verifier_log *log,
const struct bpf_func_proto *fn, int arg)
{
- int *btf_id = &fn->btf_id[arg];
- int ret;
+ int id;
- if (fn->arg_type[arg] != ARG_PTR_TO_BTF_ID)
+ if (fn->arg_type[arg] != ARG_PTR_TO_BTF_ID || !btf_vmlinux)
return -EINVAL;
-
- ret = READ_ONCE(*btf_id);
- if (ret)
- return ret;
- /* ok to race the search. The result is the same */
- ret = __btf_resolve_helper_id(log, fn->func, arg);
- if (!ret) {
- /* Function argument cannot be type 'void' */
- bpf_log(log, "BTF resolution bug\n");
- return -EFAULT;
- }
- WRITE_ONCE(*btf_id, ret);
- return ret;
+ id = fn->btf_id[arg];
+ if (!id || id > btf_vmlinux->nr_types)
+ return -EINVAL;
+ return id;
}
static int __get_type_size(struct btf *btf, u32 btf_id,
diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c
index ac53102e244a..83ff127ef7ae 100644
--- a/kernel/bpf/cgroup.c
+++ b/kernel/bpf/cgroup.c
@@ -37,17 +37,34 @@ static void bpf_cgroup_storages_free(struct bpf_cgroup_storage *storages[])
}
static int bpf_cgroup_storages_alloc(struct bpf_cgroup_storage *storages[],
- struct bpf_prog *prog)
+ struct bpf_cgroup_storage *new_storages[],
+ enum bpf_attach_type type,
+ struct bpf_prog *prog,
+ struct cgroup *cgrp)
{
enum bpf_cgroup_storage_type stype;
+ struct bpf_cgroup_storage_key key;
+ struct bpf_map *map;
+
+ key.cgroup_inode_id = cgroup_id(cgrp);
+ key.attach_type = type;
for_each_cgroup_storage_type(stype) {
+ map = prog->aux->cgroup_storage[stype];
+ if (!map)
+ continue;
+
+ storages[stype] = cgroup_storage_lookup((void *)map, &key, false);
+ if (storages[stype])
+ continue;
+
storages[stype] = bpf_cgroup_storage_alloc(prog, stype);
if (IS_ERR(storages[stype])) {
- storages[stype] = NULL;
- bpf_cgroup_storages_free(storages);
+ bpf_cgroup_storages_free(new_storages);
return -ENOMEM;
}
+
+ new_storages[stype] = storages[stype];
}
return 0;
@@ -63,7 +80,7 @@ static void bpf_cgroup_storages_assign(struct bpf_cgroup_storage *dst[],
}
static void bpf_cgroup_storages_link(struct bpf_cgroup_storage *storages[],
- struct cgroup* cgrp,
+ struct cgroup *cgrp,
enum bpf_attach_type attach_type)
{
enum bpf_cgroup_storage_type stype;
@@ -72,14 +89,6 @@ static void bpf_cgroup_storages_link(struct bpf_cgroup_storage *storages[],
bpf_cgroup_storage_link(storages[stype], cgrp, attach_type);
}
-static void bpf_cgroup_storages_unlink(struct bpf_cgroup_storage *storages[])
-{
- enum bpf_cgroup_storage_type stype;
-
- for_each_cgroup_storage_type(stype)
- bpf_cgroup_storage_unlink(storages[stype]);
-}
-
/* Called when bpf_cgroup_link is auto-detached from dying cgroup.
* It drops cgroup and bpf_prog refcounts, and marks bpf_link as defunct. It
* doesn't free link memory, which will eventually be done by bpf_link's
@@ -101,22 +110,23 @@ static void cgroup_bpf_release(struct work_struct *work)
struct cgroup *p, *cgrp = container_of(work, struct cgroup,
bpf.release_work);
struct bpf_prog_array *old_array;
+ struct list_head *storages = &cgrp->bpf.storages;
+ struct bpf_cgroup_storage *storage, *stmp;
+
unsigned int type;
mutex_lock(&cgroup_mutex);
for (type = 0; type < ARRAY_SIZE(cgrp->bpf.progs); type++) {
struct list_head *progs = &cgrp->bpf.progs[type];
- struct bpf_prog_list *pl, *tmp;
+ struct bpf_prog_list *pl, *pltmp;
- list_for_each_entry_safe(pl, tmp, progs, node) {
+ list_for_each_entry_safe(pl, pltmp, progs, node) {
list_del(&pl->node);
if (pl->prog)
bpf_prog_put(pl->prog);
if (pl->link)
bpf_cgroup_link_auto_detach(pl->link);
- bpf_cgroup_storages_unlink(pl->storage);
- bpf_cgroup_storages_free(pl->storage);
kfree(pl);
static_branch_dec(&cgroup_bpf_enabled_key);
}
@@ -126,6 +136,11 @@ static void cgroup_bpf_release(struct work_struct *work)
bpf_prog_array_free(old_array);
}
+ list_for_each_entry_safe(storage, stmp, storages, list_cg) {
+ bpf_cgroup_storage_unlink(storage);
+ bpf_cgroup_storage_free(storage);
+ }
+
mutex_unlock(&cgroup_mutex);
for (p = cgroup_parent(cgrp); p; p = cgroup_parent(p))
@@ -290,6 +305,8 @@ int cgroup_bpf_inherit(struct cgroup *cgrp)
for (i = 0; i < NR; i++)
INIT_LIST_HEAD(&cgrp->bpf.progs[i]);
+ INIT_LIST_HEAD(&cgrp->bpf.storages);
+
for (i = 0; i < NR; i++)
if (compute_effective_progs(cgrp, i, &arrays[i]))
goto cleanup;
@@ -422,7 +439,7 @@ int __cgroup_bpf_attach(struct cgroup *cgrp,
struct list_head *progs = &cgrp->bpf.progs[type];
struct bpf_prog *old_prog = NULL;
struct bpf_cgroup_storage *storage[MAX_BPF_CGROUP_STORAGE_TYPE] = {};
- struct bpf_cgroup_storage *old_storage[MAX_BPF_CGROUP_STORAGE_TYPE] = {};
+ struct bpf_cgroup_storage *new_storage[MAX_BPF_CGROUP_STORAGE_TYPE] = {};
struct bpf_prog_list *pl;
int err;
@@ -455,17 +472,16 @@ int __cgroup_bpf_attach(struct cgroup *cgrp,
if (IS_ERR(pl))
return PTR_ERR(pl);
- if (bpf_cgroup_storages_alloc(storage, prog ? : link->link.prog))
+ if (bpf_cgroup_storages_alloc(storage, new_storage, type,
+ prog ? : link->link.prog, cgrp))
return -ENOMEM;
if (pl) {
old_prog = pl->prog;
- bpf_cgroup_storages_unlink(pl->storage);
- bpf_cgroup_storages_assign(old_storage, pl->storage);
} else {
pl = kmalloc(sizeof(*pl), GFP_KERNEL);
if (!pl) {
- bpf_cgroup_storages_free(storage);
+ bpf_cgroup_storages_free(new_storage);
return -ENOMEM;
}
list_add_tail(&pl->node, progs);
@@ -480,12 +496,11 @@ int __cgroup_bpf_attach(struct cgroup *cgrp,
if (err)
goto cleanup;
- bpf_cgroup_storages_free(old_storage);
if (old_prog)
bpf_prog_put(old_prog);
else
static_branch_inc(&cgroup_bpf_enabled_key);
- bpf_cgroup_storages_link(pl->storage, cgrp, type);
+ bpf_cgroup_storages_link(new_storage, cgrp, type);
return 0;
cleanup:
@@ -493,9 +508,7 @@ cleanup:
pl->prog = old_prog;
pl->link = NULL;
}
- bpf_cgroup_storages_free(pl->storage);
- bpf_cgroup_storages_assign(pl->storage, old_storage);
- bpf_cgroup_storages_link(pl->storage, cgrp, type);
+ bpf_cgroup_storages_free(new_storage);
if (!old_prog) {
list_del(&pl->node);
kfree(pl);
@@ -679,8 +692,6 @@ int __cgroup_bpf_detach(struct cgroup *cgrp, struct bpf_prog *prog,
/* now can actually delete it from this cgroup list */
list_del(&pl->node);
- bpf_cgroup_storages_unlink(pl->storage);
- bpf_cgroup_storages_free(pl->storage);
kfree(pl);
if (list_empty(progs))
/* last program was detached, reset flags to zero */
@@ -803,6 +814,7 @@ static void bpf_cgroup_link_release(struct bpf_link *link)
{
struct bpf_cgroup_link *cg_link =
container_of(link, struct bpf_cgroup_link, link);
+ struct cgroup *cg;
/* link might have been auto-detached by dying cgroup already,
* in that case our work is done here
@@ -821,8 +833,12 @@ static void bpf_cgroup_link_release(struct bpf_link *link)
WARN_ON(__cgroup_bpf_detach(cg_link->cgroup, NULL, cg_link,
cg_link->type));
+ cg = cg_link->cgroup;
+ cg_link->cgroup = NULL;
+
mutex_unlock(&cgroup_mutex);
- cgroup_put(cg_link->cgroup);
+
+ cgroup_put(cg);
}
static void bpf_cgroup_link_dealloc(struct bpf_link *link)
@@ -833,6 +849,13 @@ static void bpf_cgroup_link_dealloc(struct bpf_link *link)
kfree(cg_link);
}
+static int bpf_cgroup_link_detach(struct bpf_link *link)
+{
+ bpf_cgroup_link_release(link);
+
+ return 0;
+}
+
static void bpf_cgroup_link_show_fdinfo(const struct bpf_link *link,
struct seq_file *seq)
{
@@ -872,6 +895,7 @@ static int bpf_cgroup_link_fill_link_info(const struct bpf_link *link,
static const struct bpf_link_ops bpf_cgroup_link_lops = {
.release = bpf_cgroup_link_release,
.dealloc = bpf_cgroup_link_dealloc,
+ .detach = bpf_cgroup_link_detach,
.update_prog = cgroup_bpf_replace,
.show_fdinfo = bpf_cgroup_link_show_fdinfo,
.fill_link_info = bpf_cgroup_link_fill_link_info,
diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index 9df4cc9a2907..ed0b3578867c 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -1958,6 +1958,61 @@ void bpf_prog_array_delete_safe(struct bpf_prog_array *array,
}
}
+/**
+ * bpf_prog_array_delete_safe_at() - Replaces the program at the given
+ * index into the program array with
+ * a dummy no-op program.
+ * @array: a bpf_prog_array
+ * @index: the index of the program to replace
+ *
+ * Skips over dummy programs, by not counting them, when calculating
+ * the position of the program to replace.
+ *
+ * Return:
+ * * 0 - Success
+ * * -EINVAL - Invalid index value. Must be a non-negative integer.
+ * * -ENOENT - Index out of range
+ */
+int bpf_prog_array_delete_safe_at(struct bpf_prog_array *array, int index)
+{
+ return bpf_prog_array_update_at(array, index, &dummy_bpf_prog.prog);
+}
+
+/**
+ * bpf_prog_array_update_at() - Updates the program at the given index
+ * into the program array.
+ * @array: a bpf_prog_array
+ * @index: the index of the program to update
+ * @prog: the program to insert into the array
+ *
+ * Skips over dummy programs, by not counting them, when calculating
+ * the position of the program to update.
+ *
+ * Return:
+ * * 0 - Success
+ * * -EINVAL - Invalid index value. Must be a non-negative integer.
+ * * -ENOENT - Index out of range
+ */
+int bpf_prog_array_update_at(struct bpf_prog_array *array, int index,
+ struct bpf_prog *prog)
+{
+ struct bpf_prog_array_item *item;
+
+ if (unlikely(index < 0))
+ return -EINVAL;
+
+ for (item = array->items; item->prog; item++) {
+ if (item->prog == &dummy_bpf_prog.prog)
+ continue;
+ if (!index) {
+ WRITE_ONCE(item->prog, prog);
+ return 0;
+ }
+ index--;
+ }
+ return -ENOENT;
+}
+
int bpf_prog_array_copy(struct bpf_prog_array *old_array,
struct bpf_prog *exclude_prog,
struct bpf_prog *include_prog,
@@ -2042,24 +2097,12 @@ int bpf_prog_array_copy_info(struct bpf_prog_array *array,
: 0;
}
-static void bpf_free_cgroup_storage(struct bpf_prog_aux *aux)
-{
- enum bpf_cgroup_storage_type stype;
-
- for_each_cgroup_storage_type(stype) {
- if (!aux->cgroup_storage[stype])
- continue;
- bpf_cgroup_storage_release(aux, aux->cgroup_storage[stype]);
- }
-}
-
void __bpf_free_used_maps(struct bpf_prog_aux *aux,
struct bpf_map **used_maps, u32 len)
{
struct bpf_map *map;
u32 i;
- bpf_free_cgroup_storage(aux);
for (i = 0; i < len; i++) {
map = used_maps[i];
if (map->ops->map_poke_untrack)
diff --git a/kernel/bpf/cpumap.c b/kernel/bpf/cpumap.c
index 27595fc6da56..f1c46529929b 100644
--- a/kernel/bpf/cpumap.c
+++ b/kernel/bpf/cpumap.c
@@ -52,7 +52,6 @@ struct xdp_bulk_queue {
struct bpf_cpu_map_entry {
u32 cpu; /* kthread CPU and map index */
int map_id; /* Back reference to map */
- u32 qsize; /* Queue size placeholder for map lookup */
/* XDP can run multiple RX-ring queues, need __percpu enqueue store */
struct xdp_bulk_queue __percpu *bulkq;
@@ -62,10 +61,14 @@ struct bpf_cpu_map_entry {
/* Queue with potential multi-producers, and single-consumer kthread */
struct ptr_ring *queue;
struct task_struct *kthread;
- struct work_struct kthread_stop_wq;
+
+ struct bpf_cpumap_val value;
+ struct bpf_prog *prog;
atomic_t refcnt; /* Control when this struct can be free'ed */
struct rcu_head rcu;
+
+ struct work_struct kthread_stop_wq;
};
struct bpf_cpu_map {
@@ -80,6 +83,7 @@ static int bq_flush_to_queue(struct xdp_bulk_queue *bq);
static struct bpf_map *cpu_map_alloc(union bpf_attr *attr)
{
+ u32 value_size = attr->value_size;
struct bpf_cpu_map *cmap;
int err = -ENOMEM;
u64 cost;
@@ -90,7 +94,9 @@ static struct bpf_map *cpu_map_alloc(union bpf_attr *attr)
/* check sanity of attributes */
if (attr->max_entries == 0 || attr->key_size != 4 ||
- attr->value_size != 4 || attr->map_flags & ~BPF_F_NUMA_NODE)
+ (value_size != offsetofend(struct bpf_cpumap_val, qsize) &&
+ value_size != offsetofend(struct bpf_cpumap_val, bpf_prog.fd)) ||
+ attr->map_flags & ~BPF_F_NUMA_NODE)
return ERR_PTR(-EINVAL);
cmap = kzalloc(sizeof(*cmap), GFP_USER);
@@ -212,6 +218,8 @@ static void __cpu_map_ring_cleanup(struct ptr_ring *ring)
static void put_cpu_map_entry(struct bpf_cpu_map_entry *rcpu)
{
if (atomic_dec_and_test(&rcpu->refcnt)) {
+ if (rcpu->prog)
+ bpf_prog_put(rcpu->prog);
/* The queue should be empty at this point */
__cpu_map_ring_cleanup(rcpu->queue);
ptr_ring_cleanup(rcpu->queue, NULL);
@@ -220,6 +228,75 @@ static void put_cpu_map_entry(struct bpf_cpu_map_entry *rcpu)
}
}
+static int cpu_map_bpf_prog_run_xdp(struct bpf_cpu_map_entry *rcpu,
+ void **frames, int n,
+ struct xdp_cpumap_stats *stats)
+{
+ struct xdp_rxq_info rxq;
+ struct xdp_buff xdp;
+ int i, nframes = 0;
+
+ if (!rcpu->prog)
+ return n;
+
+ rcu_read_lock_bh();
+
+ xdp_set_return_frame_no_direct();
+ xdp.rxq = &rxq;
+
+ for (i = 0; i < n; i++) {
+ struct xdp_frame *xdpf = frames[i];
+ u32 act;
+ int err;
+
+ rxq.dev = xdpf->dev_rx;
+ rxq.mem = xdpf->mem;
+ /* TODO: report queue_index to xdp_rxq_info */
+
+ xdp_convert_frame_to_buff(xdpf, &xdp);
+
+ act = bpf_prog_run_xdp(rcpu->prog, &xdp);
+ switch (act) {
+ case XDP_PASS:
+ err = xdp_update_frame_from_buff(&xdp, xdpf);
+ if (err < 0) {
+ xdp_return_frame(xdpf);
+ stats->drop++;
+ } else {
+ frames[nframes++] = xdpf;
+ stats->pass++;
+ }
+ break;
+ case XDP_REDIRECT:
+ err = xdp_do_redirect(xdpf->dev_rx, &xdp,
+ rcpu->prog);
+ if (unlikely(err)) {
+ xdp_return_frame(xdpf);
+ stats->drop++;
+ } else {
+ stats->redirect++;
+ }
+ break;
+ default:
+ bpf_warn_invalid_xdp_action(act);
+ /* fallthrough */
+ case XDP_DROP:
+ xdp_return_frame(xdpf);
+ stats->drop++;
+ break;
+ }
+ }
+
+ if (stats->redirect)
+ xdp_do_flush_map();
+
+ xdp_clear_return_frame_no_direct();
+
+ rcu_read_unlock_bh(); /* resched point, may call do_softirq() */
+
+ return nframes;
+}
+
#define CPUMAP_BATCH 8
static int cpu_map_kthread_run(void *data)
@@ -234,11 +311,12 @@ static int cpu_map_kthread_run(void *data)
* kthread_stop signal until queue is empty.
*/
while (!kthread_should_stop() || !__ptr_ring_empty(rcpu->queue)) {
+ struct xdp_cpumap_stats stats = {}; /* zero stats */
+ gfp_t gfp = __GFP_ZERO | GFP_ATOMIC;
unsigned int drops = 0, sched = 0;
void *frames[CPUMAP_BATCH];
void *skbs[CPUMAP_BATCH];
- gfp_t gfp = __GFP_ZERO | GFP_ATOMIC;
- int i, n, m;
+ int i, n, m, nframes;
/* Release CPU reschedule checks */
if (__ptr_ring_empty(rcpu->queue)) {
@@ -259,8 +337,8 @@ static int cpu_map_kthread_run(void *data)
* kthread CPU pinned. Lockless access to ptr_ring
* consume side valid as no-resize allowed of queue.
*/
- n = ptr_ring_consume_batched(rcpu->queue, frames, CPUMAP_BATCH);
-
+ n = __ptr_ring_consume_batched(rcpu->queue, frames,
+ CPUMAP_BATCH);
for (i = 0; i < n; i++) {
void *f = frames[i];
struct page *page = virt_to_page(f);
@@ -272,15 +350,19 @@ static int cpu_map_kthread_run(void *data)
prefetchw(page);
}
- m = kmem_cache_alloc_bulk(skbuff_head_cache, gfp, n, skbs);
- if (unlikely(m == 0)) {
- for (i = 0; i < n; i++)
- skbs[i] = NULL; /* effect: xdp_return_frame */
- drops = n;
+ /* Support running another XDP prog on this CPU */
+ nframes = cpu_map_bpf_prog_run_xdp(rcpu, frames, n, &stats);
+ if (nframes) {
+ m = kmem_cache_alloc_bulk(skbuff_head_cache, gfp, nframes, skbs);
+ if (unlikely(m == 0)) {
+ for (i = 0; i < nframes; i++)
+ skbs[i] = NULL; /* effect: xdp_return_frame */
+ drops += nframes;
+ }
}
local_bh_disable();
- for (i = 0; i < n; i++) {
+ for (i = 0; i < nframes; i++) {
struct xdp_frame *xdpf = frames[i];
struct sk_buff *skb = skbs[i];
int ret;
@@ -297,7 +379,7 @@ static int cpu_map_kthread_run(void *data)
drops++;
}
/* Feedback loop via tracepoint */
- trace_xdp_cpumap_kthread(rcpu->map_id, n, drops, sched);
+ trace_xdp_cpumap_kthread(rcpu->map_id, n, drops, sched, &stats);
local_bh_enable(); /* resched point, may call do_softirq() */
}
@@ -307,13 +389,38 @@ static int cpu_map_kthread_run(void *data)
return 0;
}
-static struct bpf_cpu_map_entry *__cpu_map_entry_alloc(u32 qsize, u32 cpu,
- int map_id)
+bool cpu_map_prog_allowed(struct bpf_map *map)
{
+ return map->map_type == BPF_MAP_TYPE_CPUMAP &&
+ map->value_size != offsetofend(struct bpf_cpumap_val, qsize);
+}
+
+static int __cpu_map_load_bpf_program(struct bpf_cpu_map_entry *rcpu, int fd)
+{
+ struct bpf_prog *prog;
+
+ prog = bpf_prog_get_type(fd, BPF_PROG_TYPE_XDP);
+ if (IS_ERR(prog))
+ return PTR_ERR(prog);
+
+ if (prog->expected_attach_type != BPF_XDP_CPUMAP) {
+ bpf_prog_put(prog);
+ return -EINVAL;
+ }
+
+ rcpu->value.bpf_prog.id = prog->aux->id;
+ rcpu->prog = prog;
+
+ return 0;
+}
+
+static struct bpf_cpu_map_entry *
+__cpu_map_entry_alloc(struct bpf_cpumap_val *value, u32 cpu, int map_id)
+{
+ int numa, err, i, fd = value->bpf_prog.fd;
gfp_t gfp = GFP_KERNEL | __GFP_NOWARN;
struct bpf_cpu_map_entry *rcpu;
struct xdp_bulk_queue *bq;
- int numa, err, i;
/* Have map->numa_node, but choose node of redirect target CPU */
numa = cpu_to_node(cpu);
@@ -338,19 +445,22 @@ static struct bpf_cpu_map_entry *__cpu_map_entry_alloc(u32 qsize, u32 cpu,
if (!rcpu->queue)
goto free_bulkq;
- err = ptr_ring_init(rcpu->queue, qsize, gfp);
+ err = ptr_ring_init(rcpu->queue, value->qsize, gfp);
if (err)
goto free_queue;
rcpu->cpu = cpu;
rcpu->map_id = map_id;
- rcpu->qsize = qsize;
+ rcpu->value.qsize = value->qsize;
+
+ if (fd > 0 && __cpu_map_load_bpf_program(rcpu, fd))
+ goto free_ptr_ring;
/* Setup kthread */
rcpu->kthread = kthread_create_on_node(cpu_map_kthread_run, rcpu, numa,
"cpumap/%d/map:%d", cpu, map_id);
if (IS_ERR(rcpu->kthread))
- goto free_ptr_ring;
+ goto free_prog;
get_cpu_map_entry(rcpu); /* 1-refcnt for being in cmap->cpu_map[] */
get_cpu_map_entry(rcpu); /* 1-refcnt for kthread */
@@ -361,6 +471,9 @@ static struct bpf_cpu_map_entry *__cpu_map_entry_alloc(u32 qsize, u32 cpu,
return rcpu;
+free_prog:
+ if (rcpu->prog)
+ bpf_prog_put(rcpu->prog);
free_ptr_ring:
ptr_ring_cleanup(rcpu->queue, NULL);
free_queue:
@@ -437,12 +550,12 @@ static int cpu_map_update_elem(struct bpf_map *map, void *key, void *value,
u64 map_flags)
{
struct bpf_cpu_map *cmap = container_of(map, struct bpf_cpu_map, map);
+ struct bpf_cpumap_val cpumap_value = {};
struct bpf_cpu_map_entry *rcpu;
-
/* Array index key correspond to CPU number */
u32 key_cpu = *(u32 *)key;
- /* Value is the queue size */
- u32 qsize = *(u32 *)value;
+
+ memcpy(&cpumap_value, value, map->value_size);
if (unlikely(map_flags > BPF_EXIST))
return -EINVAL;
@@ -450,18 +563,18 @@ static int cpu_map_update_elem(struct bpf_map *map, void *key, void *value,
return -E2BIG;
if (unlikely(map_flags == BPF_NOEXIST))
return -EEXIST;
- if (unlikely(qsize > 16384)) /* sanity limit on qsize */
+ if (unlikely(cpumap_value.qsize > 16384)) /* sanity limit on qsize */
return -EOVERFLOW;
/* Make sure CPU is a valid possible cpu */
if (key_cpu >= nr_cpumask_bits || !cpu_possible(key_cpu))
return -ENODEV;
- if (qsize == 0) {
+ if (cpumap_value.qsize == 0) {
rcpu = NULL; /* Same as deleting */
} else {
/* Updating qsize causes re-allocation of bpf_cpu_map_entry */
- rcpu = __cpu_map_entry_alloc(qsize, key_cpu, map->id);
+ rcpu = __cpu_map_entry_alloc(&cpumap_value, key_cpu, map->id);
if (!rcpu)
return -ENOMEM;
rcpu->cmap = cmap;
@@ -523,7 +636,7 @@ static void *cpu_map_lookup_elem(struct bpf_map *map, void *key)
struct bpf_cpu_map_entry *rcpu =
__cpu_map_lookup_elem(map, *(u32 *)key);
- return rcpu ? &rcpu->qsize : NULL;
+ return rcpu ? &rcpu->value : NULL;
}
static int cpu_map_get_next_key(struct bpf_map *map, void *key, void *next_key)
@@ -543,6 +656,7 @@ static int cpu_map_get_next_key(struct bpf_map *map, void *key, void *next_key)
return 0;
}
+static int cpu_map_btf_id;
const struct bpf_map_ops cpu_map_ops = {
.map_alloc = cpu_map_alloc,
.map_free = cpu_map_free,
@@ -551,6 +665,8 @@ const struct bpf_map_ops cpu_map_ops = {
.map_lookup_elem = cpu_map_lookup_elem,
.map_get_next_key = cpu_map_get_next_key,
.map_check_btf = map_check_no_btf,
+ .map_btf_name = "bpf_cpu_map",
+ .map_btf_id = &cpu_map_btf_id,
};
static int bq_flush_to_queue(struct xdp_bulk_queue *bq)
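
For orientation, a minimal user-space sketch of filling one CPUMAP slot with the extended bpf_cpumap_val layout used above; cpumap_fd, cpu and prog_fd are assumed to come from the caller, the queue size is arbitrary, and the program is expected to have been loaded with expected_attach_type BPF_XDP_CPUMAP (an illustration, not part of the patch):

#include <linux/bpf.h>
#include <bpf/bpf.h>

/* illustrative helper, not taken from the kernel tree */
static int cpumap_add_cpu(int cpumap_fd, __u32 cpu, int prog_fd)
{
        struct bpf_cpumap_val val = {
                .qsize = 2048,          /* ptr_ring size, must be <= 16384 */
                .bpf_prog.fd = prog_fd, /* optional XDP prog run on the remote CPU */
        };

        return bpf_map_update_elem(cpumap_fd, &cpu, &val, 0);
}
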
diff --git a/kernel/bpf/devmap.c b/kernel/bpf/devmap.c
index 5fdbc776a760..10abb06065bb 100644
--- a/kernel/bpf/devmap.c
+++ b/kernel/bpf/devmap.c
@@ -749,6 +749,7 @@ static int dev_map_hash_update_elem(struct bpf_map *map, void *key, void *value,
map, key, value, map_flags);
}
+static int dev_map_btf_id;
const struct bpf_map_ops dev_map_ops = {
.map_alloc = dev_map_alloc,
.map_free = dev_map_free,
@@ -757,8 +758,11 @@ const struct bpf_map_ops dev_map_ops = {
.map_update_elem = dev_map_update_elem,
.map_delete_elem = dev_map_delete_elem,
.map_check_btf = map_check_no_btf,
+ .map_btf_name = "bpf_dtab",
+ .map_btf_id = &dev_map_btf_id,
};
+static int dev_map_hash_map_btf_id;
const struct bpf_map_ops dev_map_hash_ops = {
.map_alloc = dev_map_alloc,
.map_free = dev_map_free,
@@ -767,6 +771,8 @@ const struct bpf_map_ops dev_map_hash_ops = {
.map_update_elem = dev_map_hash_update_elem,
.map_delete_elem = dev_map_hash_delete_elem,
.map_check_btf = map_check_no_btf,
+ .map_btf_name = "bpf_dtab",
+ .map_btf_id = &dev_map_hash_map_btf_id,
};
static void dev_map_hash_remove_netdev(struct bpf_dtab *dtab,
diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c
index b32cc8ce8ff6..78dfff6a501b 100644
--- a/kernel/bpf/hashtab.c
+++ b/kernel/bpf/hashtab.c
@@ -1296,12 +1296,10 @@ static void htab_map_free(struct bpf_map *map)
{
struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
- /* at this point bpf_prog->aux->refcnt == 0 and this map->refcnt == 0,
- * so the programs (can be more than one that used this map) were
- * disconnected from events. Wait for outstanding critical sections in
- * these programs to complete
+ /* bpf_free_used_maps() or close(map_fd) will trigger this map_free callback.
+ * bpf_free_used_maps() is called after bpf prog is no longer executing.
+ * There is no need to synchronize_rcu() here to protect map elements.
*/
- synchronize_rcu();
/* some of free_htab_elem() callbacks for elements of this map may
* not have executed. Wait for them.
@@ -1620,6 +1618,197 @@ htab_lru_map_lookup_and_delete_batch(struct bpf_map *map,
true, false);
}
+struct bpf_iter_seq_hash_map_info {
+ struct bpf_map *map;
+ struct bpf_htab *htab;
+ void *percpu_value_buf; // non-zero means percpu hash
+ unsigned long flags;
+ u32 bucket_id;
+ u32 skip_elems;
+};
+
+static struct htab_elem *
+bpf_hash_map_seq_find_next(struct bpf_iter_seq_hash_map_info *info,
+ struct htab_elem *prev_elem)
+{
+ const struct bpf_htab *htab = info->htab;
+ unsigned long flags = info->flags;
+ u32 skip_elems = info->skip_elems;
+ u32 bucket_id = info->bucket_id;
+ struct hlist_nulls_head *head;
+ struct hlist_nulls_node *n;
+ struct htab_elem *elem;
+ struct bucket *b;
+ u32 i, count;
+
+ if (bucket_id >= htab->n_buckets)
+ return NULL;
+
+ /* try to find next elem in the same bucket */
+ if (prev_elem) {
+ /* no update/deletion on this bucket, prev_elem should still be valid
+ * and we won't skip elements.
+ */
+ n = rcu_dereference_raw(hlist_nulls_next_rcu(&prev_elem->hash_node));
+ elem = hlist_nulls_entry_safe(n, struct htab_elem, hash_node);
+ if (elem)
+ return elem;
+
+ /* not found, unlock and go to the next bucket */
+ b = &htab->buckets[bucket_id++];
+ htab_unlock_bucket(htab, b, flags);
+ skip_elems = 0;
+ }
+
+ for (i = bucket_id; i < htab->n_buckets; i++) {
+ b = &htab->buckets[i];
+ flags = htab_lock_bucket(htab, b);
+
+ count = 0;
+ head = &b->head;
+ hlist_nulls_for_each_entry_rcu(elem, n, head, hash_node) {
+ if (count >= skip_elems) {
+ info->flags = flags;
+ info->bucket_id = i;
+ info->skip_elems = count;
+ return elem;
+ }
+ count++;
+ }
+
+ htab_unlock_bucket(htab, b, flags);
+ skip_elems = 0;
+ }
+
+ info->bucket_id = i;
+ info->skip_elems = 0;
+ return NULL;
+}
+
+static void *bpf_hash_map_seq_start(struct seq_file *seq, loff_t *pos)
+{
+ struct bpf_iter_seq_hash_map_info *info = seq->private;
+ struct htab_elem *elem;
+
+ elem = bpf_hash_map_seq_find_next(info, NULL);
+ if (!elem)
+ return NULL;
+
+ if (*pos == 0)
+ ++*pos;
+ return elem;
+}
+
+static void *bpf_hash_map_seq_next(struct seq_file *seq, void *v, loff_t *pos)
+{
+ struct bpf_iter_seq_hash_map_info *info = seq->private;
+
+ ++*pos;
+ ++info->skip_elems;
+ return bpf_hash_map_seq_find_next(info, v);
+}
+
+static int __bpf_hash_map_seq_show(struct seq_file *seq, struct htab_elem *elem)
+{
+ struct bpf_iter_seq_hash_map_info *info = seq->private;
+ u32 roundup_key_size, roundup_value_size;
+ struct bpf_iter__bpf_map_elem ctx = {};
+ struct bpf_map *map = info->map;
+ struct bpf_iter_meta meta;
+ int ret = 0, off = 0, cpu;
+ struct bpf_prog *prog;
+ void __percpu *pptr;
+
+ meta.seq = seq;
+ prog = bpf_iter_get_info(&meta, elem == NULL);
+ if (prog) {
+ ctx.meta = &meta;
+ ctx.map = info->map;
+ if (elem) {
+ roundup_key_size = round_up(map->key_size, 8);
+ ctx.key = elem->key;
+ if (!info->percpu_value_buf) {
+ ctx.value = elem->key + roundup_key_size;
+ } else {
+ roundup_value_size = round_up(map->value_size, 8);
+ pptr = htab_elem_get_ptr(elem, map->key_size);
+ for_each_possible_cpu(cpu) {
+ bpf_long_memcpy(info->percpu_value_buf + off,
+ per_cpu_ptr(pptr, cpu),
+ roundup_value_size);
+ off += roundup_value_size;
+ }
+ ctx.value = info->percpu_value_buf;
+ }
+ }
+ ret = bpf_iter_run_prog(prog, &ctx);
+ }
+
+ return ret;
+}
+
+static int bpf_hash_map_seq_show(struct seq_file *seq, void *v)
+{
+ return __bpf_hash_map_seq_show(seq, v);
+}
+
+static void bpf_hash_map_seq_stop(struct seq_file *seq, void *v)
+{
+ struct bpf_iter_seq_hash_map_info *info = seq->private;
+
+ if (!v)
+ (void)__bpf_hash_map_seq_show(seq, NULL);
+ else
+ htab_unlock_bucket(info->htab,
+ &info->htab->buckets[info->bucket_id],
+ info->flags);
+}
+
+static int bpf_iter_init_hash_map(void *priv_data,
+ struct bpf_iter_aux_info *aux)
+{
+ struct bpf_iter_seq_hash_map_info *seq_info = priv_data;
+ struct bpf_map *map = aux->map;
+ void *value_buf;
+ u32 buf_size;
+
+ if (map->map_type == BPF_MAP_TYPE_PERCPU_HASH ||
+ map->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH) {
+ buf_size = round_up(map->value_size, 8) * num_possible_cpus();
+ value_buf = kmalloc(buf_size, GFP_USER | __GFP_NOWARN);
+ if (!value_buf)
+ return -ENOMEM;
+
+ seq_info->percpu_value_buf = value_buf;
+ }
+
+ seq_info->map = map;
+ seq_info->htab = container_of(map, struct bpf_htab, map);
+ return 0;
+}
+
+static void bpf_iter_fini_hash_map(void *priv_data)
+{
+ struct bpf_iter_seq_hash_map_info *seq_info = priv_data;
+
+ kfree(seq_info->percpu_value_buf);
+}
+
+static const struct seq_operations bpf_hash_map_seq_ops = {
+ .start = bpf_hash_map_seq_start,
+ .next = bpf_hash_map_seq_next,
+ .stop = bpf_hash_map_seq_stop,
+ .show = bpf_hash_map_seq_show,
+};
+
+static const struct bpf_iter_seq_info iter_seq_info = {
+ .seq_ops = &bpf_hash_map_seq_ops,
+ .init_seq_private = bpf_iter_init_hash_map,
+ .fini_seq_private = bpf_iter_fini_hash_map,
+ .seq_priv_size = sizeof(struct bpf_iter_seq_hash_map_info),
+};
+
+static int htab_map_btf_id;
const struct bpf_map_ops htab_map_ops = {
.map_alloc_check = htab_map_alloc_check,
.map_alloc = htab_map_alloc,
@@ -1631,8 +1820,12 @@ const struct bpf_map_ops htab_map_ops = {
.map_gen_lookup = htab_map_gen_lookup,
.map_seq_show_elem = htab_map_seq_show_elem,
BATCH_OPS(htab),
+ .map_btf_name = "bpf_htab",
+ .map_btf_id = &htab_map_btf_id,
+ .iter_seq_info = &iter_seq_info,
};
+static int htab_lru_map_btf_id;
const struct bpf_map_ops htab_lru_map_ops = {
.map_alloc_check = htab_map_alloc_check,
.map_alloc = htab_map_alloc,
@@ -1645,6 +1838,9 @@ const struct bpf_map_ops htab_lru_map_ops = {
.map_gen_lookup = htab_lru_map_gen_lookup,
.map_seq_show_elem = htab_map_seq_show_elem,
BATCH_OPS(htab_lru),
+ .map_btf_name = "bpf_htab",
+ .map_btf_id = &htab_lru_map_btf_id,
+ .iter_seq_info = &iter_seq_info,
};
/* Called from eBPF program */
@@ -1749,6 +1945,7 @@ static void htab_percpu_map_seq_show_elem(struct bpf_map *map, void *key,
rcu_read_unlock();
}
+static int htab_percpu_map_btf_id;
const struct bpf_map_ops htab_percpu_map_ops = {
.map_alloc_check = htab_map_alloc_check,
.map_alloc = htab_map_alloc,
@@ -1759,8 +1956,12 @@ const struct bpf_map_ops htab_percpu_map_ops = {
.map_delete_elem = htab_map_delete_elem,
.map_seq_show_elem = htab_percpu_map_seq_show_elem,
BATCH_OPS(htab_percpu),
+ .map_btf_name = "bpf_htab",
+ .map_btf_id = &htab_percpu_map_btf_id,
+ .iter_seq_info = &iter_seq_info,
};
+static int htab_lru_percpu_map_btf_id;
const struct bpf_map_ops htab_lru_percpu_map_ops = {
.map_alloc_check = htab_map_alloc_check,
.map_alloc = htab_map_alloc,
@@ -1771,6 +1972,9 @@ const struct bpf_map_ops htab_lru_percpu_map_ops = {
.map_delete_elem = htab_lru_map_delete_elem,
.map_seq_show_elem = htab_percpu_map_seq_show_elem,
BATCH_OPS(htab_lru_percpu),
+ .map_btf_name = "bpf_htab",
+ .map_btf_id = &htab_lru_percpu_map_btf_id,
+ .iter_seq_info = &iter_seq_info,
};
static int fd_htab_map_alloc_check(union bpf_attr *attr)
@@ -1893,6 +2097,7 @@ static void htab_of_map_free(struct bpf_map *map)
fd_htab_map_free(map);
}
+static int htab_of_maps_map_btf_id;
const struct bpf_map_ops htab_of_maps_map_ops = {
.map_alloc_check = fd_htab_map_alloc_check,
.map_alloc = htab_of_map_alloc,
@@ -1905,4 +2110,6 @@ const struct bpf_map_ops htab_of_maps_map_ops = {
.map_fd_sys_lookup_elem = bpf_map_fd_sys_lookup_elem,
.map_gen_lookup = htab_of_map_gen_lookup,
.map_check_btf = map_check_no_btf,
+ .map_btf_name = "bpf_htab",
+ .map_btf_id = &htab_of_maps_map_btf_id,
};
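
The seq_ops added above back the new "bpf_map_elem" iterator target; a minimal BPF-side consumer might look like the sketch below (assumes vmlinux.h plus libbpf's bpf_helpers.h/bpf_tracing.h and a hash map with __u32 keys and __u64 values; all names are illustrative):

#include "vmlinux.h"
#include <bpf/bpf_helpers.h>
#include <bpf/bpf_tracing.h>

char _license[] SEC("license") = "GPL";

SEC("iter/bpf_map_elem")
int dump_hash_elem(struct bpf_iter__bpf_map_elem *ctx)
{
        __u32 *key = ctx->key;
        __u64 *val = ctx->value;

        /* the prog is invoked once more from stop() with NULL key/value */
        if (!key || !val)
                return 0;

        BPF_SEQ_PRINTF(ctx->meta->seq, "key %u -> value %llu\n", *key, *val);
        return 0;
}
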
diff --git a/kernel/bpf/local_storage.c b/kernel/bpf/local_storage.c
index 33d01866bcc2..571bb351ed3b 100644
--- a/kernel/bpf/local_storage.c
+++ b/kernel/bpf/local_storage.c
@@ -13,6 +13,8 @@ DEFINE_PER_CPU(struct bpf_cgroup_storage*, bpf_cgroup_storage[MAX_BPF_CGROUP_STO
#ifdef CONFIG_CGROUP_BPF
+#include "../cgroup/cgroup-internal.h"
+
#define LOCAL_STORAGE_CREATE_FLAG_MASK \
(BPF_F_NUMA_NODE | BPF_F_ACCESS_MASK)
@@ -20,7 +22,6 @@ struct bpf_cgroup_storage_map {
struct bpf_map map;
spinlock_t lock;
- struct bpf_prog_aux *aux;
struct rb_root root;
struct list_head list;
};
@@ -30,24 +31,41 @@ static struct bpf_cgroup_storage_map *map_to_storage(struct bpf_map *map)
return container_of(map, struct bpf_cgroup_storage_map, map);
}
-static int bpf_cgroup_storage_key_cmp(
- const struct bpf_cgroup_storage_key *key1,
- const struct bpf_cgroup_storage_key *key2)
+static bool attach_type_isolated(const struct bpf_map *map)
{
- if (key1->cgroup_inode_id < key2->cgroup_inode_id)
- return -1;
- else if (key1->cgroup_inode_id > key2->cgroup_inode_id)
- return 1;
- else if (key1->attach_type < key2->attach_type)
- return -1;
- else if (key1->attach_type > key2->attach_type)
- return 1;
+ return map->key_size == sizeof(struct bpf_cgroup_storage_key);
+}
+
+static int bpf_cgroup_storage_key_cmp(const struct bpf_cgroup_storage_map *map,
+ const void *_key1, const void *_key2)
+{
+ if (attach_type_isolated(&map->map)) {
+ const struct bpf_cgroup_storage_key *key1 = _key1;
+ const struct bpf_cgroup_storage_key *key2 = _key2;
+
+ if (key1->cgroup_inode_id < key2->cgroup_inode_id)
+ return -1;
+ else if (key1->cgroup_inode_id > key2->cgroup_inode_id)
+ return 1;
+ else if (key1->attach_type < key2->attach_type)
+ return -1;
+ else if (key1->attach_type > key2->attach_type)
+ return 1;
+ } else {
+ const __u64 *cgroup_inode_id1 = _key1;
+ const __u64 *cgroup_inode_id2 = _key2;
+
+ if (*cgroup_inode_id1 < *cgroup_inode_id2)
+ return -1;
+ else if (*cgroup_inode_id1 > *cgroup_inode_id2)
+ return 1;
+ }
return 0;
}
-static struct bpf_cgroup_storage *cgroup_storage_lookup(
- struct bpf_cgroup_storage_map *map, struct bpf_cgroup_storage_key *key,
- bool locked)
+struct bpf_cgroup_storage *
+cgroup_storage_lookup(struct bpf_cgroup_storage_map *map,
+ void *key, bool locked)
{
struct rb_root *root = &map->root;
struct rb_node *node;
@@ -61,7 +79,7 @@ static struct bpf_cgroup_storage *cgroup_storage_lookup(
storage = container_of(node, struct bpf_cgroup_storage, node);
- switch (bpf_cgroup_storage_key_cmp(key, &storage->key)) {
+ switch (bpf_cgroup_storage_key_cmp(map, key, &storage->key)) {
case -1:
node = node->rb_left;
break;
@@ -93,7 +111,7 @@ static int cgroup_storage_insert(struct bpf_cgroup_storage_map *map,
this = container_of(*new, struct bpf_cgroup_storage, node);
parent = *new;
- switch (bpf_cgroup_storage_key_cmp(&storage->key, &this->key)) {
+ switch (bpf_cgroup_storage_key_cmp(map, &storage->key, &this->key)) {
case -1:
new = &((*new)->rb_left);
break;
@@ -111,10 +129,9 @@ static int cgroup_storage_insert(struct bpf_cgroup_storage_map *map,
return 0;
}
-static void *cgroup_storage_lookup_elem(struct bpf_map *_map, void *_key)
+static void *cgroup_storage_lookup_elem(struct bpf_map *_map, void *key)
{
struct bpf_cgroup_storage_map *map = map_to_storage(_map);
- struct bpf_cgroup_storage_key *key = _key;
struct bpf_cgroup_storage *storage;
storage = cgroup_storage_lookup(map, key, false);
@@ -124,17 +141,13 @@ static void *cgroup_storage_lookup_elem(struct bpf_map *_map, void *_key)
return &READ_ONCE(storage->buf)->data[0];
}
-static int cgroup_storage_update_elem(struct bpf_map *map, void *_key,
+static int cgroup_storage_update_elem(struct bpf_map *map, void *key,
void *value, u64 flags)
{
- struct bpf_cgroup_storage_key *key = _key;
struct bpf_cgroup_storage *storage;
struct bpf_storage_buffer *new;
- if (unlikely(flags & ~(BPF_F_LOCK | BPF_EXIST | BPF_NOEXIST)))
- return -EINVAL;
-
- if (unlikely(flags & BPF_NOEXIST))
+ if (unlikely(flags & ~(BPF_F_LOCK | BPF_EXIST)))
return -EINVAL;
if (unlikely((flags & BPF_F_LOCK) &&
@@ -167,11 +180,10 @@ static int cgroup_storage_update_elem(struct bpf_map *map, void *_key,
return 0;
}
-int bpf_percpu_cgroup_storage_copy(struct bpf_map *_map, void *_key,
+int bpf_percpu_cgroup_storage_copy(struct bpf_map *_map, void *key,
void *value)
{
struct bpf_cgroup_storage_map *map = map_to_storage(_map);
- struct bpf_cgroup_storage_key *key = _key;
struct bpf_cgroup_storage *storage;
int cpu, off = 0;
u32 size;
@@ -197,11 +209,10 @@ int bpf_percpu_cgroup_storage_copy(struct bpf_map *_map, void *_key,
return 0;
}
-int bpf_percpu_cgroup_storage_update(struct bpf_map *_map, void *_key,
+int bpf_percpu_cgroup_storage_update(struct bpf_map *_map, void *key,
void *value, u64 map_flags)
{
struct bpf_cgroup_storage_map *map = map_to_storage(_map);
- struct bpf_cgroup_storage_key *key = _key;
struct bpf_cgroup_storage *storage;
int cpu, off = 0;
u32 size;
@@ -232,12 +243,10 @@ int bpf_percpu_cgroup_storage_update(struct bpf_map *_map, void *_key,
return 0;
}
-static int cgroup_storage_get_next_key(struct bpf_map *_map, void *_key,
+static int cgroup_storage_get_next_key(struct bpf_map *_map, void *key,
void *_next_key)
{
struct bpf_cgroup_storage_map *map = map_to_storage(_map);
- struct bpf_cgroup_storage_key *key = _key;
- struct bpf_cgroup_storage_key *next = _next_key;
struct bpf_cgroup_storage *storage;
spin_lock_bh(&map->lock);
@@ -250,17 +259,23 @@ static int cgroup_storage_get_next_key(struct bpf_map *_map, void *_key,
if (!storage)
goto enoent;
- storage = list_next_entry(storage, list);
+ storage = list_next_entry(storage, list_map);
if (!storage)
goto enoent;
} else {
storage = list_first_entry(&map->list,
- struct bpf_cgroup_storage, list);
+ struct bpf_cgroup_storage, list_map);
}
spin_unlock_bh(&map->lock);
- next->attach_type = storage->key.attach_type;
- next->cgroup_inode_id = storage->key.cgroup_inode_id;
+
+ if (attach_type_isolated(&map->map)) {
+ struct bpf_cgroup_storage_key *next = _next_key;
+ *next = storage->key;
+ } else {
+ __u64 *next = _next_key;
+ *next = storage->key.cgroup_inode_id;
+ }
return 0;
enoent:
@@ -275,7 +290,8 @@ static struct bpf_map *cgroup_storage_map_alloc(union bpf_attr *attr)
struct bpf_map_memory mem;
int ret;
- if (attr->key_size != sizeof(struct bpf_cgroup_storage_key))
+ if (attr->key_size != sizeof(struct bpf_cgroup_storage_key) &&
+ attr->key_size != sizeof(__u64))
return ERR_PTR(-EINVAL);
if (attr->value_size == 0)
@@ -318,6 +334,17 @@ static struct bpf_map *cgroup_storage_map_alloc(union bpf_attr *attr)
static void cgroup_storage_map_free(struct bpf_map *_map)
{
struct bpf_cgroup_storage_map *map = map_to_storage(_map);
+ struct list_head *storages = &map->list;
+ struct bpf_cgroup_storage *storage, *stmp;
+
+ mutex_lock(&cgroup_mutex);
+
+ list_for_each_entry_safe(storage, stmp, storages, list_map) {
+ bpf_cgroup_storage_unlink(storage);
+ bpf_cgroup_storage_free(storage);
+ }
+
+ mutex_unlock(&cgroup_mutex);
WARN_ON(!RB_EMPTY_ROOT(&map->root));
WARN_ON(!list_empty(&map->list));
@@ -335,49 +362,63 @@ static int cgroup_storage_check_btf(const struct bpf_map *map,
const struct btf_type *key_type,
const struct btf_type *value_type)
{
- struct btf_member *m;
- u32 offset, size;
-
- /* Key is expected to be of struct bpf_cgroup_storage_key type,
- * which is:
- * struct bpf_cgroup_storage_key {
- * __u64 cgroup_inode_id;
- * __u32 attach_type;
- * };
- */
+ if (attach_type_isolated(map)) {
+ struct btf_member *m;
+ u32 offset, size;
+
+ /* Key is expected to be of struct bpf_cgroup_storage_key type,
+ * which is:
+ * struct bpf_cgroup_storage_key {
+ * __u64 cgroup_inode_id;
+ * __u32 attach_type;
+ * };
+ */
+
+ /*
+ * Key_type must be a structure with two fields.
+ */
+ if (BTF_INFO_KIND(key_type->info) != BTF_KIND_STRUCT ||
+ BTF_INFO_VLEN(key_type->info) != 2)
+ return -EINVAL;
+
+ /*
+ * The first field must be a 64 bit integer at 0 offset.
+ */
+ m = (struct btf_member *)(key_type + 1);
+ size = sizeof_field(struct bpf_cgroup_storage_key, cgroup_inode_id);
+ if (!btf_member_is_reg_int(btf, key_type, m, 0, size))
+ return -EINVAL;
+
+ /*
+ * The second field must be a 32 bit integer at 64 bit offset.
+ */
+ m++;
+ offset = offsetof(struct bpf_cgroup_storage_key, attach_type);
+ size = sizeof_field(struct bpf_cgroup_storage_key, attach_type);
+ if (!btf_member_is_reg_int(btf, key_type, m, offset, size))
+ return -EINVAL;
+ } else {
+ u32 int_data;
- /*
- * Key_type must be a structure with two fields.
- */
- if (BTF_INFO_KIND(key_type->info) != BTF_KIND_STRUCT ||
- BTF_INFO_VLEN(key_type->info) != 2)
- return -EINVAL;
+ /*
+ * Key is expected to be u64, which stores the cgroup_inode_id
+ */
- /*
- * The first field must be a 64 bit integer at 0 offset.
- */
- m = (struct btf_member *)(key_type + 1);
- size = sizeof_field(struct bpf_cgroup_storage_key, cgroup_inode_id);
- if (!btf_member_is_reg_int(btf, key_type, m, 0, size))
- return -EINVAL;
+ if (BTF_INFO_KIND(key_type->info) != BTF_KIND_INT)
+ return -EINVAL;
- /*
- * The second field must be a 32 bit integer at 64 bit offset.
- */
- m++;
- offset = offsetof(struct bpf_cgroup_storage_key, attach_type);
- size = sizeof_field(struct bpf_cgroup_storage_key, attach_type);
- if (!btf_member_is_reg_int(btf, key_type, m, offset, size))
- return -EINVAL;
+ int_data = *(u32 *)(key_type + 1);
+ if (BTF_INT_BITS(int_data) != 64 || BTF_INT_OFFSET(int_data))
+ return -EINVAL;
+ }
return 0;
}
-static void cgroup_storage_seq_show_elem(struct bpf_map *map, void *_key,
+static void cgroup_storage_seq_show_elem(struct bpf_map *map, void *key,
struct seq_file *m)
{
enum bpf_cgroup_storage_type stype = cgroup_storage_type(map);
- struct bpf_cgroup_storage_key *key = _key;
struct bpf_cgroup_storage *storage;
int cpu;
@@ -409,6 +450,7 @@ static void cgroup_storage_seq_show_elem(struct bpf_map *map, void *_key,
rcu_read_unlock();
}
+static int cgroup_storage_map_btf_id;
const struct bpf_map_ops cgroup_storage_map_ops = {
.map_alloc = cgroup_storage_map_alloc,
.map_free = cgroup_storage_map_free,
@@ -418,43 +460,20 @@ const struct bpf_map_ops cgroup_storage_map_ops = {
.map_delete_elem = cgroup_storage_delete_elem,
.map_check_btf = cgroup_storage_check_btf,
.map_seq_show_elem = cgroup_storage_seq_show_elem,
+ .map_btf_name = "bpf_cgroup_storage_map",
+ .map_btf_id = &cgroup_storage_map_btf_id,
};
int bpf_cgroup_storage_assign(struct bpf_prog_aux *aux, struct bpf_map *_map)
{
enum bpf_cgroup_storage_type stype = cgroup_storage_type(_map);
- struct bpf_cgroup_storage_map *map = map_to_storage(_map);
- int ret = -EBUSY;
-
- spin_lock_bh(&map->lock);
- if (map->aux && map->aux != aux)
- goto unlock;
if (aux->cgroup_storage[stype] &&
aux->cgroup_storage[stype] != _map)
- goto unlock;
+ return -EBUSY;
- map->aux = aux;
aux->cgroup_storage[stype] = _map;
- ret = 0;
-unlock:
- spin_unlock_bh(&map->lock);
-
- return ret;
-}
-
-void bpf_cgroup_storage_release(struct bpf_prog_aux *aux, struct bpf_map *_map)
-{
- enum bpf_cgroup_storage_type stype = cgroup_storage_type(_map);
- struct bpf_cgroup_storage_map *map = map_to_storage(_map);
-
- spin_lock_bh(&map->lock);
- if (map->aux == aux) {
- WARN_ON(aux->cgroup_storage[stype] != _map);
- map->aux = NULL;
- aux->cgroup_storage[stype] = NULL;
- }
- spin_unlock_bh(&map->lock);
+ return 0;
}
static size_t bpf_cgroup_storage_calculate_size(struct bpf_map *map, u32 *pages)
@@ -575,7 +594,8 @@ void bpf_cgroup_storage_link(struct bpf_cgroup_storage *storage,
spin_lock_bh(&map->lock);
WARN_ON(cgroup_storage_insert(map, storage));
- list_add(&storage->list, &map->list);
+ list_add(&storage->list_map, &map->list);
+ list_add(&storage->list_cg, &cgroup->bpf.storages);
spin_unlock_bh(&map->lock);
}
@@ -593,7 +613,8 @@ void bpf_cgroup_storage_unlink(struct bpf_cgroup_storage *storage)
root = &map->root;
rb_erase(&storage->node, root);
- list_del(&storage->list);
+ list_del(&storage->list_map);
+ list_del(&storage->list_cg);
spin_unlock_bh(&map->lock);
}
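
With attach_type_isolated() keying off the map's key_size, a BPF object can now declare either flavour of cgroup storage; a sketch of both declarations (BTF-defined map syntax, illustrative names):

#include <linux/bpf.h>
#include <bpf/bpf_helpers.h>

/* one storage per (cgroup, attach_type): full bpf_cgroup_storage_key */
struct {
        __uint(type, BPF_MAP_TYPE_CGROUP_STORAGE);
        __type(key, struct bpf_cgroup_storage_key);
        __type(value, __u64);
} isolated_storage SEC(".maps");

/* shared across attach types: key is just the cgroup inode id */
struct {
        __uint(type, BPF_MAP_TYPE_CGROUP_STORAGE);
        __type(key, __u64);
        __type(value, __u64);
} shared_storage SEC(".maps");
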
diff --git a/kernel/bpf/lpm_trie.c b/kernel/bpf/lpm_trie.c
index c8cc4e4cf98d..44474bf3ab7a 100644
--- a/kernel/bpf/lpm_trie.c
+++ b/kernel/bpf/lpm_trie.c
@@ -589,11 +589,6 @@ static void trie_free(struct bpf_map *map)
struct lpm_trie_node __rcu **slot;
struct lpm_trie_node *node;
- /* Wait for outstanding programs to complete
- * update/lookup/delete/get_next_key and free the trie.
- */
- synchronize_rcu();
-
/* Always start at the root and walk down to a node that has no
* children. Then free that node, nullify its reference in the parent
* and start over.
@@ -735,6 +730,7 @@ static int trie_check_btf(const struct bpf_map *map,
-EINVAL : 0;
}
+static int trie_map_btf_id;
const struct bpf_map_ops trie_map_ops = {
.map_alloc = trie_alloc,
.map_free = trie_free,
@@ -743,4 +739,6 @@ const struct bpf_map_ops trie_map_ops = {
.map_update_elem = trie_update_elem,
.map_delete_elem = trie_delete_elem,
.map_check_btf = trie_check_btf,
+ .map_btf_name = "lpm_trie",
+ .map_btf_id = &trie_map_btf_id,
};
diff --git a/kernel/bpf/map_iter.c b/kernel/bpf/map_iter.c
index c69071e334bf..af86048e5afd 100644
--- a/kernel/bpf/map_iter.c
+++ b/kernel/bpf/map_iter.c
@@ -4,9 +4,10 @@
#include <linux/fs.h>
#include <linux/filter.h>
#include <linux/kernel.h>
+#include <linux/btf_ids.h>
struct bpf_iter_seq_map_info {
- u32 mid;
+ u32 map_id;
};
static void *bpf_map_seq_start(struct seq_file *seq, loff_t *pos)
@@ -14,27 +15,23 @@ static void *bpf_map_seq_start(struct seq_file *seq, loff_t *pos)
struct bpf_iter_seq_map_info *info = seq->private;
struct bpf_map *map;
- map = bpf_map_get_curr_or_next(&info->mid);
+ map = bpf_map_get_curr_or_next(&info->map_id);
if (!map)
return NULL;
- ++*pos;
+ if (*pos == 0)
+ ++*pos;
return map;
}
static void *bpf_map_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
struct bpf_iter_seq_map_info *info = seq->private;
- struct bpf_map *map;
++*pos;
- ++info->mid;
+ ++info->map_id;
bpf_map_put((struct bpf_map *)v);
- map = bpf_map_get_curr_or_next(&info->mid);
- if (!map)
- return NULL;
-
- return map;
+ return bpf_map_get_curr_or_next(&info->map_id);
}
struct bpf_iter__bpf_map {
@@ -81,22 +78,103 @@ static const struct seq_operations bpf_map_seq_ops = {
.show = bpf_map_seq_show,
};
-static const struct bpf_iter_reg bpf_map_reg_info = {
- .target = "bpf_map",
+BTF_ID_LIST(btf_bpf_map_id)
+BTF_ID(struct, bpf_map)
+
+static const struct bpf_iter_seq_info bpf_map_seq_info = {
.seq_ops = &bpf_map_seq_ops,
.init_seq_private = NULL,
.fini_seq_private = NULL,
.seq_priv_size = sizeof(struct bpf_iter_seq_map_info),
+};
+
+static struct bpf_iter_reg bpf_map_reg_info = {
+ .target = "bpf_map",
.ctx_arg_info_size = 1,
.ctx_arg_info = {
{ offsetof(struct bpf_iter__bpf_map, map),
PTR_TO_BTF_ID_OR_NULL },
},
+ .seq_info = &bpf_map_seq_info,
+};
+
+static int bpf_iter_attach_map(struct bpf_prog *prog,
+ union bpf_iter_link_info *linfo,
+ struct bpf_iter_aux_info *aux)
+{
+ u32 key_acc_size, value_acc_size, key_size, value_size;
+ struct bpf_map *map;
+ bool is_percpu = false;
+ int err = -EINVAL;
+
+ if (!linfo->map.map_fd)
+ return -EBADF;
+
+ map = bpf_map_get_with_uref(linfo->map.map_fd);
+ if (IS_ERR(map))
+ return PTR_ERR(map);
+
+ if (map->map_type == BPF_MAP_TYPE_PERCPU_HASH ||
+ map->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH ||
+ map->map_type == BPF_MAP_TYPE_PERCPU_ARRAY)
+ is_percpu = true;
+ else if (map->map_type != BPF_MAP_TYPE_HASH &&
+ map->map_type != BPF_MAP_TYPE_LRU_HASH &&
+ map->map_type != BPF_MAP_TYPE_ARRAY)
+ goto put_map;
+
+ key_acc_size = prog->aux->max_rdonly_access;
+ value_acc_size = prog->aux->max_rdwr_access;
+ key_size = map->key_size;
+ if (!is_percpu)
+ value_size = map->value_size;
+ else
+ value_size = round_up(map->value_size, 8) * num_possible_cpus();
+
+ if (key_acc_size > key_size || value_acc_size > value_size) {
+ err = -EACCES;
+ goto put_map;
+ }
+
+ aux->map = map;
+ return 0;
+
+put_map:
+ bpf_map_put_with_uref(map);
+ return err;
+}
+
+static void bpf_iter_detach_map(struct bpf_iter_aux_info *aux)
+{
+ bpf_map_put_with_uref(aux->map);
+}
+
+DEFINE_BPF_ITER_FUNC(bpf_map_elem, struct bpf_iter_meta *meta,
+ struct bpf_map *map, void *key, void *value)
+
+static const struct bpf_iter_reg bpf_map_elem_reg_info = {
+ .target = "bpf_map_elem",
+ .attach_target = bpf_iter_attach_map,
+ .detach_target = bpf_iter_detach_map,
+ .ctx_arg_info_size = 2,
+ .ctx_arg_info = {
+ { offsetof(struct bpf_iter__bpf_map_elem, key),
+ PTR_TO_RDONLY_BUF_OR_NULL },
+ { offsetof(struct bpf_iter__bpf_map_elem, value),
+ PTR_TO_RDWR_BUF_OR_NULL },
+ },
};
static int __init bpf_map_iter_init(void)
{
- return bpf_iter_reg_target(&bpf_map_reg_info);
+ int ret;
+
+ bpf_map_reg_info.ctx_arg_info[0].btf_id = *btf_bpf_map_id;
+ ret = bpf_iter_reg_target(&bpf_map_reg_info);
+ if (ret)
+ return ret;
+
+ return bpf_iter_reg_target(&bpf_map_elem_reg_info);
}
late_initcall(bpf_map_iter_init);
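
Pinning the element iterator to one particular map goes through the new attach_target hook; from user space that looks roughly like the libbpf-based sketch below (error handling trimmed, prog and map_fd assumed valid):

#include <linux/bpf.h>
#include <bpf/libbpf.h>

/* illustrative helper: returns an iterator fd that can be read() */
static int attach_map_elem_iter(struct bpf_program *prog, int map_fd)
{
        DECLARE_LIBBPF_OPTS(bpf_iter_attach_opts, opts);
        union bpf_iter_link_info linfo = {};
        struct bpf_link *link;

        linfo.map.map_fd = map_fd;
        opts.link_info = &linfo;
        opts.link_info_len = sizeof(linfo);

        link = bpf_program__attach_iter(prog, &opts);
        /* reading the returned fd drives the seq_file shown for hashtab.c */
        return bpf_iter_create(bpf_link__fd(link));
}
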
diff --git a/kernel/bpf/net_namespace.c b/kernel/bpf/net_namespace.c
index 310241ca7991..542f275bf252 100644
--- a/kernel/bpf/net_namespace.c
+++ b/kernel/bpf/net_namespace.c
@@ -25,6 +25,32 @@ struct bpf_netns_link {
/* Protects updates to netns_bpf */
DEFINE_MUTEX(netns_bpf_mutex);
+static void netns_bpf_attach_type_unneed(enum netns_bpf_attach_type type)
+{
+ switch (type) {
+#ifdef CONFIG_INET
+ case NETNS_BPF_SK_LOOKUP:
+ static_branch_dec(&bpf_sk_lookup_enabled);
+ break;
+#endif
+ default:
+ break;
+ }
+}
+
+static void netns_bpf_attach_type_need(enum netns_bpf_attach_type type)
+{
+ switch (type) {
+#ifdef CONFIG_INET
+ case NETNS_BPF_SK_LOOKUP:
+ static_branch_inc(&bpf_sk_lookup_enabled);
+ break;
+#endif
+ default:
+ break;
+ }
+}
+
/* Must be called with netns_bpf_mutex held. */
static void netns_bpf_run_array_detach(struct net *net,
enum netns_bpf_attach_type type)
@@ -36,12 +62,50 @@ static void netns_bpf_run_array_detach(struct net *net,
bpf_prog_array_free(run_array);
}
+static int link_index(struct net *net, enum netns_bpf_attach_type type,
+ struct bpf_netns_link *link)
+{
+ struct bpf_netns_link *pos;
+ int i = 0;
+
+ list_for_each_entry(pos, &net->bpf.links[type], node) {
+ if (pos == link)
+ return i;
+ i++;
+ }
+ return -ENOENT;
+}
+
+static int link_count(struct net *net, enum netns_bpf_attach_type type)
+{
+ struct list_head *pos;
+ int i = 0;
+
+ list_for_each(pos, &net->bpf.links[type])
+ i++;
+ return i;
+}
+
+static void fill_prog_array(struct net *net, enum netns_bpf_attach_type type,
+ struct bpf_prog_array *prog_array)
+{
+ struct bpf_netns_link *pos;
+ unsigned int i = 0;
+
+ list_for_each_entry(pos, &net->bpf.links[type], node) {
+ prog_array->items[i].prog = pos->link.prog;
+ i++;
+ }
+}
+
static void bpf_netns_link_release(struct bpf_link *link)
{
struct bpf_netns_link *net_link =
container_of(link, struct bpf_netns_link, link);
enum netns_bpf_attach_type type = net_link->netns_type;
+ struct bpf_prog_array *old_array, *new_array;
struct net *net;
+ int cnt, idx;
mutex_lock(&netns_bpf_mutex);
@@ -53,13 +117,41 @@ static void bpf_netns_link_release(struct bpf_link *link)
if (!net)
goto out_unlock;
- netns_bpf_run_array_detach(net, type);
+ /* Mark attach point as unused */
+ netns_bpf_attach_type_unneed(type);
+
+ /* Remember link position in case of safe delete */
+ idx = link_index(net, type, net_link);
list_del(&net_link->node);
+ cnt = link_count(net, type);
+ if (!cnt) {
+ netns_bpf_run_array_detach(net, type);
+ goto out_unlock;
+ }
+
+ old_array = rcu_dereference_protected(net->bpf.run_array[type],
+ lockdep_is_held(&netns_bpf_mutex));
+ new_array = bpf_prog_array_alloc(cnt, GFP_KERNEL);
+ if (!new_array) {
+ WARN_ON(bpf_prog_array_delete_safe_at(old_array, idx));
+ goto out_unlock;
+ }
+ fill_prog_array(net, type, new_array);
+ rcu_assign_pointer(net->bpf.run_array[type], new_array);
+ bpf_prog_array_free(old_array);
+
out_unlock:
+ net_link->net = NULL;
mutex_unlock(&netns_bpf_mutex);
}
+static int bpf_netns_link_detach(struct bpf_link *link)
+{
+ bpf_netns_link_release(link);
+ return 0;
+}
+
static void bpf_netns_link_dealloc(struct bpf_link *link)
{
struct bpf_netns_link *net_link =
@@ -77,7 +169,7 @@ static int bpf_netns_link_update_prog(struct bpf_link *link,
enum netns_bpf_attach_type type = net_link->netns_type;
struct bpf_prog_array *run_array;
struct net *net;
- int ret = 0;
+ int idx, ret;
if (old_prog && old_prog != link->prog)
return -EPERM;
@@ -95,7 +187,10 @@ static int bpf_netns_link_update_prog(struct bpf_link *link,
run_array = rcu_dereference_protected(net->bpf.run_array[type],
lockdep_is_held(&netns_bpf_mutex));
- WRITE_ONCE(run_array->items[0].prog, new_prog);
+ idx = link_index(net, type, net_link);
+ ret = bpf_prog_array_update_at(run_array, idx, new_prog);
+ if (ret)
+ goto out_unlock;
old_prog = xchg(&link->prog, new_prog);
bpf_prog_put(old_prog);
@@ -140,6 +235,7 @@ static void bpf_netns_link_show_fdinfo(const struct bpf_link *link,
static const struct bpf_link_ops bpf_netns_link_ops = {
.release = bpf_netns_link_release,
.dealloc = bpf_netns_link_dealloc,
+ .detach = bpf_netns_link_detach,
.update_prog = bpf_netns_link_update_prog,
.fill_link_info = bpf_netns_link_fill_info,
.show_fdinfo = bpf_netns_link_show_fdinfo,
@@ -309,18 +405,30 @@ int netns_bpf_prog_detach(const union bpf_attr *attr, enum bpf_prog_type ptype)
return ret;
}
+static int netns_bpf_max_progs(enum netns_bpf_attach_type type)
+{
+ switch (type) {
+ case NETNS_BPF_FLOW_DISSECTOR:
+ return 1;
+ case NETNS_BPF_SK_LOOKUP:
+ return 64;
+ default:
+ return 0;
+ }
+}
+
static int netns_bpf_link_attach(struct net *net, struct bpf_link *link,
enum netns_bpf_attach_type type)
{
struct bpf_netns_link *net_link =
container_of(link, struct bpf_netns_link, link);
struct bpf_prog_array *run_array;
- int err;
+ int cnt, err;
mutex_lock(&netns_bpf_mutex);
- /* Allow attaching only one prog or link for now */
- if (!list_empty(&net->bpf.links[type])) {
+ cnt = link_count(net, type);
+ if (cnt >= netns_bpf_max_progs(type)) {
err = -E2BIG;
goto out_unlock;
}
@@ -334,6 +442,9 @@ static int netns_bpf_link_attach(struct net *net, struct bpf_link *link,
case NETNS_BPF_FLOW_DISSECTOR:
err = flow_dissector_bpf_prog_attach_check(net, link->prog);
break;
+ case NETNS_BPF_SK_LOOKUP:
+ err = 0; /* nothing to check */
+ break;
default:
err = -EINVAL;
break;
@@ -341,16 +452,22 @@ static int netns_bpf_link_attach(struct net *net, struct bpf_link *link,
if (err)
goto out_unlock;
- run_array = bpf_prog_array_alloc(1, GFP_KERNEL);
+ run_array = bpf_prog_array_alloc(cnt + 1, GFP_KERNEL);
if (!run_array) {
err = -ENOMEM;
goto out_unlock;
}
- run_array->items[0].prog = link->prog;
- rcu_assign_pointer(net->bpf.run_array[type], run_array);
list_add_tail(&net_link->node, &net->bpf.links[type]);
+ fill_prog_array(net, type, run_array);
+ run_array = rcu_replace_pointer(net->bpf.run_array[type], run_array,
+ lockdep_is_held(&netns_bpf_mutex));
+ bpf_prog_array_free(run_array);
+
+ /* Mark attach point as used */
+ netns_bpf_attach_type_need(type);
+
out_unlock:
mutex_unlock(&netns_bpf_mutex);
return err;
@@ -426,8 +543,10 @@ static void __net_exit netns_bpf_pernet_pre_exit(struct net *net)
mutex_lock(&netns_bpf_mutex);
for (type = 0; type < MAX_NETNS_BPF_ATTACH_TYPE; type++) {
netns_bpf_run_array_detach(net, type);
- list_for_each_entry(net_link, &net->bpf.links[type], node)
+ list_for_each_entry(net_link, &net->bpf.links[type], node) {
net_link->net = NULL; /* auto-detach link */
+ netns_bpf_attach_type_unneed(type);
+ }
if (net->bpf.progs[type])
bpf_prog_put(net->bpf.progs[type]);
}
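
The 64-link budget above exists for the new NETNS_BPF_SK_LOOKUP attach point; a minimal program for it could look like this sketch (port and verdict policy are made up, section naming follows libbpf conventions):

#include <linux/bpf.h>
#include <bpf/bpf_helpers.h>

char _license[] SEC("license") = "GPL";

SEC("sk_lookup")
int block_example_port(struct bpf_sk_lookup *ctx)
{
        /* refuse lookups that target an example port, pass everything else */
        if (ctx->local_port == 4444)
                return SK_DROP;
        return SK_PASS;
}
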
diff --git a/kernel/bpf/prog_iter.c b/kernel/bpf/prog_iter.c
new file mode 100644
index 000000000000..53a73c841c13
--- /dev/null
+++ b/kernel/bpf/prog_iter.c
@@ -0,0 +1,107 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/* Copyright (c) 2020 Facebook */
+#include <linux/bpf.h>
+#include <linux/fs.h>
+#include <linux/filter.h>
+#include <linux/kernel.h>
+#include <linux/btf_ids.h>
+
+struct bpf_iter_seq_prog_info {
+ u32 prog_id;
+};
+
+static void *bpf_prog_seq_start(struct seq_file *seq, loff_t *pos)
+{
+ struct bpf_iter_seq_prog_info *info = seq->private;
+ struct bpf_prog *prog;
+
+ prog = bpf_prog_get_curr_or_next(&info->prog_id);
+ if (!prog)
+ return NULL;
+
+ if (*pos == 0)
+ ++*pos;
+ return prog;
+}
+
+static void *bpf_prog_seq_next(struct seq_file *seq, void *v, loff_t *pos)
+{
+ struct bpf_iter_seq_prog_info *info = seq->private;
+
+ ++*pos;
+ ++info->prog_id;
+ bpf_prog_put((struct bpf_prog *)v);
+ return bpf_prog_get_curr_or_next(&info->prog_id);
+}
+
+struct bpf_iter__bpf_prog {
+ __bpf_md_ptr(struct bpf_iter_meta *, meta);
+ __bpf_md_ptr(struct bpf_prog *, prog);
+};
+
+DEFINE_BPF_ITER_FUNC(bpf_prog, struct bpf_iter_meta *meta, struct bpf_prog *prog)
+
+static int __bpf_prog_seq_show(struct seq_file *seq, void *v, bool in_stop)
+{
+ struct bpf_iter__bpf_prog ctx;
+ struct bpf_iter_meta meta;
+ struct bpf_prog *prog;
+ int ret = 0;
+
+ ctx.meta = &meta;
+ ctx.prog = v;
+ meta.seq = seq;
+ prog = bpf_iter_get_info(&meta, in_stop);
+ if (prog)
+ ret = bpf_iter_run_prog(prog, &ctx);
+
+ return ret;
+}
+
+static int bpf_prog_seq_show(struct seq_file *seq, void *v)
+{
+ return __bpf_prog_seq_show(seq, v, false);
+}
+
+static void bpf_prog_seq_stop(struct seq_file *seq, void *v)
+{
+ if (!v)
+ (void)__bpf_prog_seq_show(seq, v, true);
+ else
+ bpf_prog_put((struct bpf_prog *)v);
+}
+
+static const struct seq_operations bpf_prog_seq_ops = {
+ .start = bpf_prog_seq_start,
+ .next = bpf_prog_seq_next,
+ .stop = bpf_prog_seq_stop,
+ .show = bpf_prog_seq_show,
+};
+
+BTF_ID_LIST(btf_bpf_prog_id)
+BTF_ID(struct, bpf_prog)
+
+static const struct bpf_iter_seq_info bpf_prog_seq_info = {
+ .seq_ops = &bpf_prog_seq_ops,
+ .init_seq_private = NULL,
+ .fini_seq_private = NULL,
+ .seq_priv_size = sizeof(struct bpf_iter_seq_prog_info),
+};
+
+static struct bpf_iter_reg bpf_prog_reg_info = {
+ .target = "bpf_prog",
+ .ctx_arg_info_size = 1,
+ .ctx_arg_info = {
+ { offsetof(struct bpf_iter__bpf_prog, prog),
+ PTR_TO_BTF_ID_OR_NULL },
+ },
+ .seq_info = &bpf_prog_seq_info,
+};
+
+static int __init bpf_prog_iter_init(void)
+{
+ bpf_prog_reg_info.ctx_arg_info[0].btf_id = *btf_bpf_prog_id;
+ return bpf_iter_reg_target(&bpf_prog_reg_info);
+}
+
+late_initcall(bpf_prog_iter_init);
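
A matching consumer of the new "bpf_prog" target, sketched on the BPF side (assumes vmlinux.h so that struct bpf_prog and its aux member are visible through BTF; output format is illustrative):

#include "vmlinux.h"
#include <bpf/bpf_helpers.h>
#include <bpf/bpf_tracing.h>

char _license[] SEC("license") = "GPL";

SEC("iter/bpf_prog")
int dump_prog(struct bpf_iter__bpf_prog *ctx)
{
        struct bpf_prog *prog = ctx->prog;

        /* NULL on the final stop() invocation */
        if (!prog)
                return 0;

        BPF_SEQ_PRINTF(ctx->meta->seq, "prog id %u\n", prog->aux->id);
        return 0;
}
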
diff --git a/kernel/bpf/queue_stack_maps.c b/kernel/bpf/queue_stack_maps.c
index 05c8e043b9d2..44184f82916a 100644
--- a/kernel/bpf/queue_stack_maps.c
+++ b/kernel/bpf/queue_stack_maps.c
@@ -101,13 +101,6 @@ static void queue_stack_map_free(struct bpf_map *map)
{
struct bpf_queue_stack *qs = bpf_queue_stack(map);
- /* at this point bpf_prog->aux->refcnt == 0 and this map->refcnt == 0,
- * so the programs (can be more than one that used this map) were
- * disconnected from events. Wait for outstanding critical sections in
- * these programs to complete
- */
- synchronize_rcu();
-
bpf_map_area_free(qs);
}
@@ -262,6 +255,7 @@ static int queue_stack_map_get_next_key(struct bpf_map *map, void *key,
return -EINVAL;
}
+static int queue_map_btf_id;
const struct bpf_map_ops queue_map_ops = {
.map_alloc_check = queue_stack_map_alloc_check,
.map_alloc = queue_stack_map_alloc,
@@ -273,8 +267,11 @@ const struct bpf_map_ops queue_map_ops = {
.map_pop_elem = queue_map_pop_elem,
.map_peek_elem = queue_map_peek_elem,
.map_get_next_key = queue_stack_map_get_next_key,
+ .map_btf_name = "bpf_queue_stack",
+ .map_btf_id = &queue_map_btf_id,
};
+static int stack_map_btf_id;
const struct bpf_map_ops stack_map_ops = {
.map_alloc_check = queue_stack_map_alloc_check,
.map_alloc = queue_stack_map_alloc,
@@ -286,4 +283,6 @@ const struct bpf_map_ops stack_map_ops = {
.map_pop_elem = stack_map_pop_elem,
.map_peek_elem = stack_map_peek_elem,
.map_get_next_key = queue_stack_map_get_next_key,
+ .map_btf_name = "bpf_queue_stack",
+ .map_btf_id = &stack_map_btf_id,
};
diff --git a/kernel/bpf/reuseport_array.c b/kernel/bpf/reuseport_array.c
index cae9d505e04a..90b29c5b1da7 100644
--- a/kernel/bpf/reuseport_array.c
+++ b/kernel/bpf/reuseport_array.c
@@ -99,8 +99,6 @@ static void reuseport_array_free(struct bpf_map *map)
struct sock *sk;
u32 i;
- synchronize_rcu();
-
/*
* ops->map_*_elem() will not be able to access this
* array now. Hence, this function only races with
@@ -351,6 +349,7 @@ static int reuseport_array_get_next_key(struct bpf_map *map, void *key,
return 0;
}
+static int reuseport_array_map_btf_id;
const struct bpf_map_ops reuseport_array_ops = {
.map_alloc_check = reuseport_array_alloc_check,
.map_alloc = reuseport_array_alloc,
@@ -358,4 +357,6 @@ const struct bpf_map_ops reuseport_array_ops = {
.map_lookup_elem = reuseport_array_lookup_elem,
.map_get_next_key = reuseport_array_get_next_key,
.map_delete_elem = reuseport_array_delete_elem,
+ .map_btf_name = "reuseport_array",
+ .map_btf_id = &reuseport_array_map_btf_id,
};
diff --git a/kernel/bpf/ringbuf.c b/kernel/bpf/ringbuf.c
index 0af88bbc1c15..002f8a5c9e51 100644
--- a/kernel/bpf/ringbuf.c
+++ b/kernel/bpf/ringbuf.c
@@ -213,13 +213,6 @@ static void ringbuf_map_free(struct bpf_map *map)
{
struct bpf_ringbuf_map *rb_map;
- /* at this point bpf_prog->aux->refcnt == 0 and this map->refcnt == 0,
- * so the programs (can be more than one that used this map) were
- * disconnected from events. Wait for outstanding critical sections in
- * these programs to complete
- */
- synchronize_rcu();
-
rb_map = container_of(map, struct bpf_ringbuf_map, map);
bpf_ringbuf_free(rb_map->rb);
kfree(rb_map);
@@ -292,6 +285,7 @@ static __poll_t ringbuf_map_poll(struct bpf_map *map, struct file *filp,
return 0;
}
+static int ringbuf_map_btf_id;
const struct bpf_map_ops ringbuf_map_ops = {
.map_alloc = ringbuf_map_alloc,
.map_free = ringbuf_map_free,
@@ -301,6 +295,8 @@ const struct bpf_map_ops ringbuf_map_ops = {
.map_update_elem = ringbuf_map_update_elem,
.map_delete_elem = ringbuf_map_delete_elem,
.map_get_next_key = ringbuf_map_get_next_key,
+ .map_btf_name = "bpf_ringbuf_map",
+ .map_btf_id = &ringbuf_map_btf_id,
};
/* Given pointer to ring buffer record metadata and struct bpf_ringbuf itself,
diff --git a/kernel/bpf/stackmap.c b/kernel/bpf/stackmap.c
index 599488f25e40..4fd830a62be2 100644
--- a/kernel/bpf/stackmap.c
+++ b/kernel/bpf/stackmap.c
@@ -4,11 +4,13 @@
#include <linux/bpf.h>
#include <linux/jhash.h>
#include <linux/filter.h>
+#include <linux/kernel.h>
#include <linux/stacktrace.h>
#include <linux/perf_event.h>
#include <linux/elf.h>
#include <linux/pagemap.h>
#include <linux/irq_work.h>
+#include <linux/btf_ids.h>
#include "percpu_freelist.h"
#define STACK_CREATE_FLAG_MASK \
@@ -348,11 +350,48 @@ static void stack_map_get_build_id_offset(struct bpf_stack_build_id *id_offs,
}
}
-BPF_CALL_3(bpf_get_stackid, struct pt_regs *, regs, struct bpf_map *, map,
- u64, flags)
+static struct perf_callchain_entry *
+get_callchain_entry_for_task(struct task_struct *task, u32 init_nr)
+{
+#ifdef CONFIG_STACKTRACE
+ struct perf_callchain_entry *entry;
+ int rctx;
+
+ entry = get_callchain_entry(&rctx);
+
+ if (!entry)
+ return NULL;
+
+ entry->nr = init_nr +
+ stack_trace_save_tsk(task, (unsigned long *)(entry->ip + init_nr),
+ sysctl_perf_event_max_stack - init_nr, 0);
+
+ /* stack_trace_save_tsk() works on unsigned long array, while
+ * perf_callchain_entry uses u64 array. For 32-bit systems, it is
+ * necessary to fix this mismatch.
+ */
+ if (__BITS_PER_LONG != 64) {
+ unsigned long *from = (unsigned long *) entry->ip;
+ u64 *to = entry->ip;
+ int i;
+
+ /* copy data from the end to avoid using extra buffer */
+ for (i = entry->nr - 1; i >= (int)init_nr; i--)
+ to[i] = (u64)(from[i]);
+ }
+
+ put_callchain_entry(rctx);
+
+ return entry;
+#else /* CONFIG_STACKTRACE */
+ return NULL;
+#endif
+}
+
+static long __bpf_get_stackid(struct bpf_map *map,
+ struct perf_callchain_entry *trace, u64 flags)
{
struct bpf_stack_map *smap = container_of(map, struct bpf_stack_map, map);
- struct perf_callchain_entry *trace;
struct stack_map_bucket *bucket, *new_bucket, *old_bucket;
u32 max_depth = map->value_size / stack_map_data_size(map);
/* stack_map_alloc() checks that max_depth <= sysctl_perf_event_max_stack */
@@ -360,21 +399,9 @@ BPF_CALL_3(bpf_get_stackid, struct pt_regs *, regs, struct bpf_map *, map,
u32 skip = flags & BPF_F_SKIP_FIELD_MASK;
u32 hash, id, trace_nr, trace_len;
bool user = flags & BPF_F_USER_STACK;
- bool kernel = !user;
u64 *ips;
bool hash_matches;
- if (unlikely(flags & ~(BPF_F_SKIP_FIELD_MASK | BPF_F_USER_STACK |
- BPF_F_FAST_STACK_CMP | BPF_F_REUSE_STACKID)))
- return -EINVAL;
-
- trace = get_perf_callchain(regs, init_nr, kernel, user,
- sysctl_perf_event_max_stack, false, false);
-
- if (unlikely(!trace))
- /* couldn't fetch the stack trace */
- return -EFAULT;
-
/* get_perf_callchain() guarantees that trace->nr >= init_nr
* and trace->nr <= sysctl_perf_event_max_stack, so trace_nr <= max_depth
*/
@@ -439,6 +466,30 @@ BPF_CALL_3(bpf_get_stackid, struct pt_regs *, regs, struct bpf_map *, map,
return id;
}
+BPF_CALL_3(bpf_get_stackid, struct pt_regs *, regs, struct bpf_map *, map,
+ u64, flags)
+{
+ u32 max_depth = map->value_size / stack_map_data_size(map);
+ /* stack_map_alloc() checks that max_depth <= sysctl_perf_event_max_stack */
+ u32 init_nr = sysctl_perf_event_max_stack - max_depth;
+ bool user = flags & BPF_F_USER_STACK;
+ struct perf_callchain_entry *trace;
+ bool kernel = !user;
+
+ if (unlikely(flags & ~(BPF_F_SKIP_FIELD_MASK | BPF_F_USER_STACK |
+ BPF_F_FAST_STACK_CMP | BPF_F_REUSE_STACKID)))
+ return -EINVAL;
+
+ trace = get_perf_callchain(regs, init_nr, kernel, user,
+ sysctl_perf_event_max_stack, false, false);
+
+ if (unlikely(!trace))
+ /* couldn't fetch the stack trace */
+ return -EFAULT;
+
+ return __bpf_get_stackid(map, trace, flags);
+}
+
const struct bpf_func_proto bpf_get_stackid_proto = {
.func = bpf_get_stackid,
.gpl_only = true,
@@ -448,8 +499,78 @@ const struct bpf_func_proto bpf_get_stackid_proto = {
.arg3_type = ARG_ANYTHING,
};
-BPF_CALL_4(bpf_get_stack, struct pt_regs *, regs, void *, buf, u32, size,
- u64, flags)
+static __u64 count_kernel_ip(struct perf_callchain_entry *trace)
+{
+ __u64 nr_kernel = 0;
+
+ while (nr_kernel < trace->nr) {
+ if (trace->ip[nr_kernel] == PERF_CONTEXT_USER)
+ break;
+ nr_kernel++;
+ }
+ return nr_kernel;
+}
+
+BPF_CALL_3(bpf_get_stackid_pe, struct bpf_perf_event_data_kern *, ctx,
+ struct bpf_map *, map, u64, flags)
+{
+ struct perf_event *event = ctx->event;
+ struct perf_callchain_entry *trace;
+ bool kernel, user;
+ __u64 nr_kernel;
+ int ret;
+
+ /* perf_sample_data doesn't have callchain, use bpf_get_stackid */
+ if (!(event->attr.sample_type & __PERF_SAMPLE_CALLCHAIN_EARLY))
+ return bpf_get_stackid((unsigned long)(ctx->regs),
+ (unsigned long) map, flags, 0, 0);
+
+ if (unlikely(flags & ~(BPF_F_SKIP_FIELD_MASK | BPF_F_USER_STACK |
+ BPF_F_FAST_STACK_CMP | BPF_F_REUSE_STACKID)))
+ return -EINVAL;
+
+ user = flags & BPF_F_USER_STACK;
+ kernel = !user;
+
+ trace = ctx->data->callchain;
+ if (unlikely(!trace))
+ return -EFAULT;
+
+ nr_kernel = count_kernel_ip(trace);
+
+ if (kernel) {
+ __u64 nr = trace->nr;
+
+ trace->nr = nr_kernel;
+ ret = __bpf_get_stackid(map, trace, flags);
+
+ /* restore nr */
+ trace->nr = nr;
+ } else { /* user */
+ u64 skip = flags & BPF_F_SKIP_FIELD_MASK;
+
+ skip += nr_kernel;
+ if (skip > BPF_F_SKIP_FIELD_MASK)
+ return -EFAULT;
+
+ flags = (flags & ~BPF_F_SKIP_FIELD_MASK) | skip;
+ ret = __bpf_get_stackid(map, trace, flags);
+ }
+ return ret;
+}
+
+const struct bpf_func_proto bpf_get_stackid_proto_pe = {
+ .func = bpf_get_stackid_pe,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_CTX,
+ .arg2_type = ARG_CONST_MAP_PTR,
+ .arg3_type = ARG_ANYTHING,
+};
+
+static long __bpf_get_stack(struct pt_regs *regs, struct task_struct *task,
+ struct perf_callchain_entry *trace_in,
+ void *buf, u32 size, u64 flags)
{
u32 init_nr, trace_nr, copy_len, elem_size, num_elem;
bool user_build_id = flags & BPF_F_USER_BUILD_ID;
@@ -471,13 +592,24 @@ BPF_CALL_4(bpf_get_stack, struct pt_regs *, regs, void *, buf, u32, size,
if (unlikely(size % elem_size))
goto clear;
+ /* cannot get valid user stack for task without user_mode regs */
+ if (task && user && !user_mode(regs))
+ goto err_fault;
+
num_elem = size / elem_size;
if (sysctl_perf_event_max_stack < num_elem)
init_nr = 0;
else
init_nr = sysctl_perf_event_max_stack - num_elem;
- trace = get_perf_callchain(regs, init_nr, kernel, user,
- sysctl_perf_event_max_stack, false, false);
+
+ if (trace_in)
+ trace = trace_in;
+ else if (kernel && task)
+ trace = get_callchain_entry_for_task(task, init_nr);
+ else
+ trace = get_perf_callchain(regs, init_nr, kernel, user,
+ sysctl_perf_event_max_stack,
+ false, false);
if (unlikely(!trace))
goto err_fault;
@@ -505,6 +637,12 @@ clear:
return err;
}
+BPF_CALL_4(bpf_get_stack, struct pt_regs *, regs, void *, buf, u32, size,
+ u64, flags)
+{
+ return __bpf_get_stack(regs, NULL, NULL, buf, size, flags);
+}
+
const struct bpf_func_proto bpf_get_stack_proto = {
.func = bpf_get_stack,
.gpl_only = true,
@@ -515,6 +653,91 @@ const struct bpf_func_proto bpf_get_stack_proto = {
.arg4_type = ARG_ANYTHING,
};
+BPF_CALL_4(bpf_get_task_stack, struct task_struct *, task, void *, buf,
+ u32, size, u64, flags)
+{
+ struct pt_regs *regs = task_pt_regs(task);
+
+ return __bpf_get_stack(regs, task, NULL, buf, size, flags);
+}
+
+BTF_ID_LIST(bpf_get_task_stack_btf_ids)
+BTF_ID(struct, task_struct)
+
+const struct bpf_func_proto bpf_get_task_stack_proto = {
+ .func = bpf_get_task_stack,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_BTF_ID,
+ .arg2_type = ARG_PTR_TO_UNINIT_MEM,
+ .arg3_type = ARG_CONST_SIZE_OR_ZERO,
+ .arg4_type = ARG_ANYTHING,
+ .btf_id = bpf_get_task_stack_btf_ids,
+};
+
+BPF_CALL_4(bpf_get_stack_pe, struct bpf_perf_event_data_kern *, ctx,
+ void *, buf, u32, size, u64, flags)
+{
+ struct pt_regs *regs = (struct pt_regs *)(ctx->regs);
+ struct perf_event *event = ctx->event;
+ struct perf_callchain_entry *trace;
+ bool kernel, user;
+ int err = -EINVAL;
+ __u64 nr_kernel;
+
+ if (!(event->attr.sample_type & __PERF_SAMPLE_CALLCHAIN_EARLY))
+ return __bpf_get_stack(regs, NULL, NULL, buf, size, flags);
+
+ if (unlikely(flags & ~(BPF_F_SKIP_FIELD_MASK | BPF_F_USER_STACK |
+ BPF_F_USER_BUILD_ID)))
+ goto clear;
+
+ user = flags & BPF_F_USER_STACK;
+ kernel = !user;
+
+ err = -EFAULT;
+ trace = ctx->data->callchain;
+ if (unlikely(!trace))
+ goto clear;
+
+ nr_kernel = count_kernel_ip(trace);
+
+ if (kernel) {
+ __u64 nr = trace->nr;
+
+ trace->nr = nr_kernel;
+ err = __bpf_get_stack(regs, NULL, trace, buf, size, flags);
+
+ /* restore nr */
+ trace->nr = nr;
+ } else { /* user */
+ u64 skip = flags & BPF_F_SKIP_FIELD_MASK;
+
+ skip += nr_kernel;
+ if (skip > BPF_F_SKIP_FIELD_MASK)
+ goto clear;
+
+ flags = (flags & ~BPF_F_SKIP_FIELD_MASK) | skip;
+ err = __bpf_get_stack(regs, NULL, trace, buf, size, flags);
+ }
+ return err;
+
+clear:
+ memset(buf, 0, size);
+ return err;
+
+}
+
+const struct bpf_func_proto bpf_get_stack_proto_pe = {
+ .func = bpf_get_stack_pe,
+ .gpl_only = true,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_CTX,
+ .arg2_type = ARG_PTR_TO_UNINIT_MEM,
+ .arg3_type = ARG_CONST_SIZE_OR_ZERO,
+ .arg4_type = ARG_ANYTHING,
+};
+
/* Called from eBPF program */
static void *stack_map_lookup_elem(struct bpf_map *map, void *key)
{
@@ -604,15 +827,13 @@ static void stack_map_free(struct bpf_map *map)
{
struct bpf_stack_map *smap = container_of(map, struct bpf_stack_map, map);
- /* wait for bpf programs to complete before freeing stack map */
- synchronize_rcu();
-
bpf_map_area_free(smap->elems);
pcpu_freelist_destroy(&smap->freelist);
bpf_map_area_free(smap);
put_callchain_buffers();
}
+static int stack_trace_map_btf_id;
const struct bpf_map_ops stack_trace_map_ops = {
.map_alloc = stack_map_alloc,
.map_free = stack_map_free,
@@ -621,6 +842,8 @@ const struct bpf_map_ops stack_trace_map_ops = {
.map_update_elem = stack_map_update_elem,
.map_delete_elem = stack_map_delete_elem,
.map_check_btf = map_check_no_btf,
+ .map_btf_name = "bpf_stack_map",
+ .map_btf_id = &stack_trace_map_btf_id,
};
static int __init stack_map_init(void)
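
bpf_get_task_stack() is aimed at contexts that already hold a task pointer, such as the task iterator; a rough sketch of that pairing (buffer depth and output are illustrative, error handling kept minimal):

#include "vmlinux.h"
#include <bpf/bpf_helpers.h>
#include <bpf/bpf_tracing.h>

char _license[] SEC("license") = "GPL";

#define MAX_STACK_DEPTH 16

/* scratch buffer for the stack dump, one u64 per frame */
__u64 entries[MAX_STACK_DEPTH];

SEC("iter/task")
int dump_task_stack(struct bpf_iter__task *ctx)
{
        struct task_struct *task = ctx->task;
        long nbytes;

        if (!task)
                return 0;

        nbytes = bpf_get_task_stack(task, entries, sizeof(entries), 0);
        if (nbytes > 0)
                BPF_SEQ_PRINTF(ctx->meta->seq, "pid %d: %ld stack bytes\n",
                               task->pid, nbytes);
        return 0;
}
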
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 0fd80ac81f70..86299a292214 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -1981,6 +1981,7 @@ bpf_prog_load_check_attach(enum bpf_prog_type prog_type,
case BPF_PROG_TYPE_CGROUP_SOCK:
switch (expected_attach_type) {
case BPF_CGROUP_INET_SOCK_CREATE:
+ case BPF_CGROUP_INET_SOCK_RELEASE:
case BPF_CGROUP_INET4_POST_BIND:
case BPF_CGROUP_INET6_POST_BIND:
return 0;
@@ -2021,6 +2022,10 @@ bpf_prog_load_check_attach(enum bpf_prog_type prog_type,
default:
return -EINVAL;
}
+ case BPF_PROG_TYPE_SK_LOOKUP:
+ if (expected_attach_type == BPF_SK_LOOKUP)
+ return 0;
+ return -EINVAL;
case BPF_PROG_TYPE_EXT:
if (expected_attach_type)
return -EINVAL;
@@ -2755,6 +2760,7 @@ static int bpf_prog_attach_check_attach_type(const struct bpf_prog *prog,
case BPF_PROG_TYPE_CGROUP_SOCK:
case BPF_PROG_TYPE_CGROUP_SOCK_ADDR:
case BPF_PROG_TYPE_CGROUP_SOCKOPT:
+ case BPF_PROG_TYPE_SK_LOOKUP:
return attach_type == prog->expected_attach_type ? 0 : -EINVAL;
case BPF_PROG_TYPE_CGROUP_SKB:
if (!capable(CAP_NET_ADMIN))
@@ -2779,6 +2785,7 @@ attach_type_to_prog_type(enum bpf_attach_type attach_type)
return BPF_PROG_TYPE_CGROUP_SKB;
break;
case BPF_CGROUP_INET_SOCK_CREATE:
+ case BPF_CGROUP_INET_SOCK_RELEASE:
case BPF_CGROUP_INET4_POST_BIND:
case BPF_CGROUP_INET6_POST_BIND:
return BPF_PROG_TYPE_CGROUP_SOCK;
@@ -2815,6 +2822,10 @@ attach_type_to_prog_type(enum bpf_attach_type attach_type)
return BPF_PROG_TYPE_CGROUP_SOCKOPT;
case BPF_TRACE_ITER:
return BPF_PROG_TYPE_TRACING;
+ case BPF_SK_LOOKUP:
+ return BPF_PROG_TYPE_SK_LOOKUP;
+ case BPF_XDP:
+ return BPF_PROG_TYPE_XDP;
default:
return BPF_PROG_TYPE_UNSPEC;
}
@@ -2927,6 +2938,7 @@ static int bpf_prog_query(const union bpf_attr *attr,
case BPF_CGROUP_INET_INGRESS:
case BPF_CGROUP_INET_EGRESS:
case BPF_CGROUP_INET_SOCK_CREATE:
+ case BPF_CGROUP_INET_SOCK_RELEASE:
case BPF_CGROUP_INET4_BIND:
case BPF_CGROUP_INET6_BIND:
case BPF_CGROUP_INET4_POST_BIND:
@@ -2950,6 +2962,7 @@ static int bpf_prog_query(const union bpf_attr *attr,
case BPF_LIRC_MODE2:
return lirc_prog_query(attr, uattr);
case BPF_FLOW_DISSECTOR:
+ case BPF_SK_LOOKUP:
return netns_bpf_prog_query(attr, uattr);
default:
return -EINVAL;
@@ -3033,6 +3046,25 @@ again:
return map;
}
+struct bpf_prog *bpf_prog_get_curr_or_next(u32 *id)
+{
+ struct bpf_prog *prog;
+
+ spin_lock_bh(&prog_idr_lock);
+again:
+ prog = idr_get_next(&prog_idr, id);
+ if (prog) {
+ prog = bpf_prog_inc_not_zero(prog);
+ if (IS_ERR(prog)) {
+ (*id)++;
+ goto again;
+ }
+ }
+ spin_unlock_bh(&prog_idr_lock);
+
+ return prog;
+}
+
#define BPF_PROG_GET_FD_BY_ID_LAST_FIELD prog_id
struct bpf_prog *bpf_prog_by_id(u32 id)
@@ -3851,7 +3883,7 @@ static int tracing_bpf_link_attach(const union bpf_attr *attr, struct bpf_prog *
return -EINVAL;
}
-#define BPF_LINK_CREATE_LAST_FIELD link_create.flags
+#define BPF_LINK_CREATE_LAST_FIELD link_create.iter_info_len
static int link_create(union bpf_attr *attr)
{
enum bpf_prog_type ptype;
@@ -3888,8 +3920,14 @@ static int link_create(union bpf_attr *attr)
ret = tracing_bpf_link_attach(attr, prog);
break;
case BPF_PROG_TYPE_FLOW_DISSECTOR:
+ case BPF_PROG_TYPE_SK_LOOKUP:
ret = netns_bpf_link_create(attr, prog);
break;
+#ifdef CONFIG_NET
+ case BPF_PROG_TYPE_XDP:
+ ret = bpf_xdp_link_attach(attr, prog);
+ break;
+#endif
default:
ret = -EINVAL;
}
@@ -3953,6 +3991,29 @@ out_put_link:
return ret;
}
+#define BPF_LINK_DETACH_LAST_FIELD link_detach.link_fd
+
+static int link_detach(union bpf_attr *attr)
+{
+ struct bpf_link *link;
+ int ret;
+
+ if (CHECK_ATTR(BPF_LINK_DETACH))
+ return -EINVAL;
+
+ link = bpf_link_get_from_fd(attr->link_detach.link_fd);
+ if (IS_ERR(link))
+ return PTR_ERR(link);
+
+ if (link->ops->detach)
+ ret = link->ops->detach(link);
+ else
+ ret = -EOPNOTSUPP;
+
+ bpf_link_put(link);
+ return ret;
+}
+
static int bpf_link_inc_not_zero(struct bpf_link *link)
{
return atomic64_fetch_add_unless(&link->refcnt, 1, 0) ? 0 : -ENOENT;
@@ -4202,6 +4263,9 @@ SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, siz
case BPF_ITER_CREATE:
err = bpf_iter_create(&attr);
break;
+ case BPF_LINK_DETACH:
+ err = link_detach(&attr);
+ break;
default:
err = -EINVAL;
break;
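
Driving the new command from user space only needs the link fd; a bare-syscall sketch (link_fd assumed to come from an earlier BPF_LINK_CREATE; a libbpf wrapper would do the same thing):

#include <string.h>
#include <unistd.h>
#include <sys/syscall.h>
#include <linux/bpf.h>

/* illustrative wrapper around the new BPF_LINK_DETACH command */
static int link_detach_fd(int link_fd)
{
        union bpf_attr attr;

        memset(&attr, 0, sizeof(attr));
        attr.link_detach.link_fd = link_fd;
        return syscall(__NR_bpf, BPF_LINK_DETACH, &attr, sizeof(attr));
}
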
diff --git a/kernel/bpf/task_iter.c b/kernel/bpf/task_iter.c
index 4dbf2b6035f8..232df29793e9 100644
--- a/kernel/bpf/task_iter.c
+++ b/kernel/bpf/task_iter.c
@@ -7,6 +7,7 @@
#include <linux/fs.h>
#include <linux/fdtable.h>
#include <linux/filter.h>
+#include <linux/btf_ids.h>
struct bpf_iter_seq_task_common {
struct pid_namespace *ns;
@@ -50,7 +51,8 @@ static void *task_seq_start(struct seq_file *seq, loff_t *pos)
if (!task)
return NULL;
- ++*pos;
+ if (*pos == 0)
+ ++*pos;
return task;
}
@@ -209,7 +211,8 @@ static void *task_file_seq_start(struct seq_file *seq, loff_t *pos)
return NULL;
}
- ++*pos;
+ if (*pos == 0)
+ ++*pos;
info->task = task;
info->files = files;
@@ -290,7 +293,7 @@ static void task_file_seq_stop(struct seq_file *seq, void *v)
}
}
-static int init_seq_pidns(void *priv_data)
+static int init_seq_pidns(void *priv_data, struct bpf_iter_aux_info *aux)
{
struct bpf_iter_seq_task_common *common = priv_data;
@@ -312,25 +315,36 @@ static const struct seq_operations task_file_seq_ops = {
.show = task_file_seq_show,
};
-static const struct bpf_iter_reg task_reg_info = {
- .target = "task",
+BTF_ID_LIST(btf_task_file_ids)
+BTF_ID(struct, task_struct)
+BTF_ID(struct, file)
+
+static const struct bpf_iter_seq_info task_seq_info = {
.seq_ops = &task_seq_ops,
.init_seq_private = init_seq_pidns,
.fini_seq_private = fini_seq_pidns,
.seq_priv_size = sizeof(struct bpf_iter_seq_task_info),
+};
+
+static struct bpf_iter_reg task_reg_info = {
+ .target = "task",
.ctx_arg_info_size = 1,
.ctx_arg_info = {
{ offsetof(struct bpf_iter__task, task),
PTR_TO_BTF_ID_OR_NULL },
},
+ .seq_info = &task_seq_info,
};
-static const struct bpf_iter_reg task_file_reg_info = {
- .target = "task_file",
+static const struct bpf_iter_seq_info task_file_seq_info = {
.seq_ops = &task_file_seq_ops,
.init_seq_private = init_seq_pidns,
.fini_seq_private = fini_seq_pidns,
.seq_priv_size = sizeof(struct bpf_iter_seq_task_file_info),
+};
+
+static struct bpf_iter_reg task_file_reg_info = {
+ .target = "task_file",
.ctx_arg_info_size = 2,
.ctx_arg_info = {
{ offsetof(struct bpf_iter__task_file, task),
@@ -338,16 +352,20 @@ static const struct bpf_iter_reg task_file_reg_info = {
{ offsetof(struct bpf_iter__task_file, file),
PTR_TO_BTF_ID_OR_NULL },
},
+ .seq_info = &task_file_seq_info,
};
static int __init task_iter_init(void)
{
int ret;
+ task_reg_info.ctx_arg_info[0].btf_id = btf_task_file_ids[0];
ret = bpf_iter_reg_target(&task_reg_info);
if (ret)
return ret;
+ task_file_reg_info.ctx_arg_info[0].btf_id = btf_task_file_ids[0];
+ task_file_reg_info.ctx_arg_info[1].btf_id = btf_task_file_ids[1];
return bpf_iter_reg_target(&task_file_reg_info);
}
late_initcall(task_iter_init);
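For reference, a minimal BPF-side consumer of the "task" iterator registered above could look like the sketch below, loosely following the kernel selftests; vmlinux.h, bpf_helpers.h and the BPF_SEQ_PRINTF macro come from the libbpf workflow and are assumptions of this sketch, not of the patch.

	#include "vmlinux.h"
	#include <bpf/bpf_helpers.h>
	#include <bpf/bpf_tracing.h>

	char LICENSE[] SEC("license") = "GPL";

	SEC("iter/task")
	int dump_task(struct bpf_iter__task *ctx)
	{
		struct seq_file *seq = ctx->meta->seq;
		struct task_struct *task = ctx->task;

		if (!task)	/* a NULL task marks the end of the iteration */
			return 0;
		BPF_SEQ_PRINTF(seq, "%8d %16s\n", task->pid, task->comm);
		return 0;
	}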
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 94cead5a43e5..ef938f17b944 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -409,7 +409,9 @@ static bool reg_type_may_be_null(enum bpf_reg_type type)
type == PTR_TO_SOCK_COMMON_OR_NULL ||
type == PTR_TO_TCP_SOCK_OR_NULL ||
type == PTR_TO_BTF_ID_OR_NULL ||
- type == PTR_TO_MEM_OR_NULL;
+ type == PTR_TO_MEM_OR_NULL ||
+ type == PTR_TO_RDONLY_BUF_OR_NULL ||
+ type == PTR_TO_RDWR_BUF_OR_NULL;
}
static bool reg_may_point_to_spin_lock(const struct bpf_reg_state *reg)
@@ -503,6 +505,10 @@ static const char * const reg_type_str[] = {
[PTR_TO_BTF_ID_OR_NULL] = "ptr_or_null_",
[PTR_TO_MEM] = "mem",
[PTR_TO_MEM_OR_NULL] = "mem_or_null",
+ [PTR_TO_RDONLY_BUF] = "rdonly_buf",
+ [PTR_TO_RDONLY_BUF_OR_NULL] = "rdonly_buf_or_null",
+ [PTR_TO_RDWR_BUF] = "rdwr_buf",
+ [PTR_TO_RDWR_BUF_OR_NULL] = "rdwr_buf_or_null",
};
static char slot_type_char[] = {
@@ -1350,6 +1356,19 @@ static void mark_reg_not_init(struct bpf_verifier_env *env,
__mark_reg_not_init(env, regs + regno);
}
+static void mark_btf_ld_reg(struct bpf_verifier_env *env,
+ struct bpf_reg_state *regs, u32 regno,
+ enum bpf_reg_type reg_type, u32 btf_id)
+{
+ if (reg_type == SCALAR_VALUE) {
+ mark_reg_unknown(env, regs, regno);
+ return;
+ }
+ mark_reg_known_zero(env, regs, regno);
+ regs[regno].type = PTR_TO_BTF_ID;
+ regs[regno].btf_id = btf_id;
+}
+
#define DEF_NOT_SUBREG (0)
static void init_reg_state(struct bpf_verifier_env *env,
struct bpf_func_state *state)
@@ -2160,6 +2179,10 @@ static bool is_spillable_regtype(enum bpf_reg_type type)
case PTR_TO_XDP_SOCK:
case PTR_TO_BTF_ID:
case PTR_TO_BTF_ID_OR_NULL:
+ case PTR_TO_RDONLY_BUF:
+ case PTR_TO_RDONLY_BUF_OR_NULL:
+ case PTR_TO_RDWR_BUF:
+ case PTR_TO_RDWR_BUF_OR_NULL:
return true;
default:
return false;
@@ -3039,14 +3062,15 @@ int check_ctx_reg(struct bpf_verifier_env *env,
return 0;
}
-static int check_tp_buffer_access(struct bpf_verifier_env *env,
- const struct bpf_reg_state *reg,
- int regno, int off, int size)
+static int __check_buffer_access(struct bpf_verifier_env *env,
+ const char *buf_info,
+ const struct bpf_reg_state *reg,
+ int regno, int off, int size)
{
if (off < 0) {
verbose(env,
- "R%d invalid tracepoint buffer access: off=%d, size=%d",
- regno, off, size);
+ "R%d invalid %s buffer access: off=%d, size=%d\n",
+ regno, buf_info, off, size);
return -EACCES;
}
if (!tnum_is_const(reg->var_off) || reg->var_off.value) {
@@ -3054,16 +3078,49 @@ static int check_tp_buffer_access(struct bpf_verifier_env *env,
tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off);
verbose(env,
- "R%d invalid variable buffer offset: off=%d, var_off=%s",
+ "R%d invalid variable buffer offset: off=%d, var_off=%s\n",
regno, off, tn_buf);
return -EACCES;
}
+
+ return 0;
+}
+
+static int check_tp_buffer_access(struct bpf_verifier_env *env,
+ const struct bpf_reg_state *reg,
+ int regno, int off, int size)
+{
+ int err;
+
+ err = __check_buffer_access(env, "tracepoint", reg, regno, off, size);
+ if (err)
+ return err;
+
if (off + size > env->prog->aux->max_tp_access)
env->prog->aux->max_tp_access = off + size;
return 0;
}
+static int check_buffer_access(struct bpf_verifier_env *env,
+ const struct bpf_reg_state *reg,
+ int regno, int off, int size,
+ bool zero_size_allowed,
+ const char *buf_info,
+ u32 *max_access)
+{
+ int err;
+
+ err = __check_buffer_access(env, buf_info, reg, regno, off, size);
+ if (err)
+ return err;
+
+ if (off + size > *max_access)
+ *max_access = off + size;
+
+ return 0;
+}
+
/* BPF architecture zero extends alu32 ops into 64-bit registers */
static void zext_32_to_64(struct bpf_reg_state *reg)
{
@@ -3181,19 +3238,68 @@ static int check_ptr_to_btf_access(struct bpf_verifier_env *env,
if (ret < 0)
return ret;
- if (atype == BPF_READ && value_regno >= 0) {
- if (ret == SCALAR_VALUE) {
- mark_reg_unknown(env, regs, value_regno);
- return 0;
- }
- mark_reg_known_zero(env, regs, value_regno);
- regs[value_regno].type = PTR_TO_BTF_ID;
- regs[value_regno].btf_id = btf_id;
+ if (atype == BPF_READ && value_regno >= 0)
+ mark_btf_ld_reg(env, regs, value_regno, ret, btf_id);
+
+ return 0;
+}
+
+static int check_ptr_to_map_access(struct bpf_verifier_env *env,
+ struct bpf_reg_state *regs,
+ int regno, int off, int size,
+ enum bpf_access_type atype,
+ int value_regno)
+{
+ struct bpf_reg_state *reg = regs + regno;
+ struct bpf_map *map = reg->map_ptr;
+ const struct btf_type *t;
+ const char *tname;
+ u32 btf_id;
+ int ret;
+
+ if (!btf_vmlinux) {
+ verbose(env, "map_ptr access not supported without CONFIG_DEBUG_INFO_BTF\n");
+ return -ENOTSUPP;
+ }
+
+ if (!map->ops->map_btf_id || !*map->ops->map_btf_id) {
+ verbose(env, "map_ptr access not supported for map type %d\n",
+ map->map_type);
+ return -ENOTSUPP;
+ }
+
+ t = btf_type_by_id(btf_vmlinux, *map->ops->map_btf_id);
+ tname = btf_name_by_offset(btf_vmlinux, t->name_off);
+
+ if (!env->allow_ptr_to_map_access) {
+ verbose(env,
+ "%s access is allowed only to CAP_PERFMON and CAP_SYS_ADMIN\n",
+ tname);
+ return -EPERM;
+ }
+
+ if (off < 0) {
+ verbose(env, "R%d is %s invalid negative access: off=%d\n",
+ regno, tname, off);
+ return -EACCES;
+ }
+
+ if (atype != BPF_READ) {
+ verbose(env, "only read from %s is supported\n", tname);
+ return -EACCES;
}
+ ret = btf_struct_access(&env->log, t, off, size, atype, &btf_id);
+ if (ret < 0)
+ return ret;
+
+ if (value_regno >= 0)
+ mark_btf_ld_reg(env, regs, value_regno, ret, btf_id);
+
return 0;
}
+
/* check whether memory at (regno + off) is accessible for t = (read | write)
* if t==write, value_regno is a register which value is stored into memory
* if t==read, value_regno is a register which will receive the value from memory
@@ -3362,6 +3468,26 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn
} else if (reg->type == PTR_TO_BTF_ID) {
err = check_ptr_to_btf_access(env, regs, regno, off, size, t,
value_regno);
+ } else if (reg->type == CONST_PTR_TO_MAP) {
+ err = check_ptr_to_map_access(env, regs, regno, off, size, t,
+ value_regno);
+ } else if (reg->type == PTR_TO_RDONLY_BUF) {
+ if (t == BPF_WRITE) {
+ verbose(env, "R%d cannot write into %s\n",
+ regno, reg_type_str[reg->type]);
+ return -EACCES;
+ }
+ err = check_buffer_access(env, reg, regno, off, size, false,
+ "rdonly",
+ &env->prog->aux->max_rdonly_access);
+ if (!err && value_regno >= 0)
+ mark_reg_unknown(env, regs, value_regno);
+ } else if (reg->type == PTR_TO_RDWR_BUF) {
+ err = check_buffer_access(env, reg, regno, off, size, false,
+ "rdwr",
+ &env->prog->aux->max_rdwr_access);
+ if (!err && t == BPF_READ && value_regno >= 0)
+ mark_reg_unknown(env, regs, value_regno);
} else {
verbose(env, "R%d invalid mem access '%s'\n", regno,
reg_type_str[reg->type]);
@@ -3603,6 +3729,18 @@ static int check_helper_mem_access(struct bpf_verifier_env *env, int regno,
return check_mem_region_access(env, regno, reg->off,
access_size, reg->mem_size,
zero_size_allowed);
+ case PTR_TO_RDONLY_BUF:
+ if (meta && meta->raw_mode)
+ return -EACCES;
+ return check_buffer_access(env, reg, regno, reg->off,
+ access_size, zero_size_allowed,
+ "rdonly",
+ &env->prog->aux->max_rdonly_access);
+ case PTR_TO_RDWR_BUF:
+ return check_buffer_access(env, reg, regno, reg->off,
+ access_size, zero_size_allowed,
+ "rdwr",
+ &env->prog->aux->max_rdwr_access);
default: /* scalar_value|ptr_to_stack or invalid ptr */
return check_stack_boundary(env, regno, access_size,
zero_size_allowed, meta);
@@ -3734,12 +3872,14 @@ static int int_ptr_type_to_size(enum bpf_arg_type type)
return -EINVAL;
}
-static int check_func_arg(struct bpf_verifier_env *env, u32 regno,
- enum bpf_arg_type arg_type,
- struct bpf_call_arg_meta *meta)
+static int check_func_arg(struct bpf_verifier_env *env, u32 arg,
+ struct bpf_call_arg_meta *meta,
+ const struct bpf_func_proto *fn)
{
+ u32 regno = BPF_REG_1 + arg;
struct bpf_reg_state *regs = cur_regs(env), *reg = &regs[regno];
enum bpf_reg_type expected_type, type = reg->type;
+ enum bpf_arg_type arg_type = fn->arg_type[arg];
int err = 0;
if (arg_type == ARG_DONTCARE)
@@ -3811,17 +3951,28 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno,
}
meta->ref_obj_id = reg->ref_obj_id;
}
- } else if (arg_type == ARG_PTR_TO_SOCKET) {
+ } else if (arg_type == ARG_PTR_TO_SOCKET ||
+ arg_type == ARG_PTR_TO_SOCKET_OR_NULL) {
expected_type = PTR_TO_SOCKET;
- if (type != expected_type)
- goto err_type;
+ if (!(register_is_null(reg) &&
+ arg_type == ARG_PTR_TO_SOCKET_OR_NULL)) {
+ if (type != expected_type)
+ goto err_type;
+ }
} else if (arg_type == ARG_PTR_TO_BTF_ID) {
expected_type = PTR_TO_BTF_ID;
if (type != expected_type)
goto err_type;
- if (reg->btf_id != meta->btf_id) {
- verbose(env, "Helper has type %s got %s in R%d\n",
- kernel_type_name(meta->btf_id),
+ if (!fn->check_btf_id) {
+ if (reg->btf_id != meta->btf_id) {
+ verbose(env, "Helper has type %s got %s in R%d\n",
+ kernel_type_name(meta->btf_id),
+ kernel_type_name(reg->btf_id), regno);
+
+ return -EACCES;
+ }
+ } else if (!fn->check_btf_id(reg->btf_id, arg)) {
+ verbose(env, "Helper does not support %s in R%d\n",
kernel_type_name(reg->btf_id), regno);
return -EACCES;
@@ -3855,6 +4006,8 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno,
else if (!type_is_pkt_pointer(type) &&
type != PTR_TO_MAP_VALUE &&
type != PTR_TO_MEM &&
+ type != PTR_TO_RDONLY_BUF &&
+ type != PTR_TO_RDWR_BUF &&
type != expected_type)
goto err_type;
meta->raw_mode = arg_type == ARG_PTR_TO_UNINIT_MEM;
@@ -4643,10 +4796,12 @@ static int check_helper_call(struct bpf_verifier_env *env, int func_id, int insn
meta.func_id = func_id;
/* check args */
for (i = 0; i < 5; i++) {
- err = btf_resolve_helper_id(&env->log, fn, i);
- if (err > 0)
- meta.btf_id = err;
- err = check_func_arg(env, BPF_REG_1 + i, fn->arg_type[i], &meta);
+ if (!fn->check_btf_id) {
+ err = btf_resolve_helper_id(&env->log, fn, i);
+ if (err > 0)
+ meta.btf_id = err;
+ }
+ err = check_func_arg(env, i, &meta, fn);
if (err)
return err;
}
@@ -4749,6 +4904,18 @@ static int check_helper_call(struct bpf_verifier_env *env, int func_id, int insn
regs[BPF_REG_0].type = PTR_TO_MEM_OR_NULL;
regs[BPF_REG_0].id = ++env->id_gen;
regs[BPF_REG_0].mem_size = meta.mem_size;
+ } else if (fn->ret_type == RET_PTR_TO_BTF_ID_OR_NULL) {
+ int ret_btf_id;
+
+ mark_reg_known_zero(env, regs, BPF_REG_0);
+ regs[BPF_REG_0].type = PTR_TO_BTF_ID_OR_NULL;
+ ret_btf_id = *fn->ret_btf_id;
+ if (ret_btf_id == 0) {
+ verbose(env, "invalid return type %d of func %s#%d\n",
+ fn->ret_type, func_id_name(func_id), func_id);
+ return -EINVAL;
+ }
+ regs[BPF_REG_0].btf_id = ret_btf_id;
} else {
verbose(env, "unknown return type %d of func %s#%d\n",
fn->ret_type, func_id_name(func_id), func_id);
@@ -4775,7 +4942,9 @@ static int check_helper_call(struct bpf_verifier_env *env, int func_id, int insn
if (err)
return err;
- if (func_id == BPF_FUNC_get_stack && !env->prog->has_callchain_buf) {
+ if ((func_id == BPF_FUNC_get_stack ||
+ func_id == BPF_FUNC_get_task_stack) &&
+ !env->prog->has_callchain_buf) {
const char *err_str;
#ifdef CONFIG_PERF_EVENTS
@@ -4793,6 +4962,9 @@ static int check_helper_call(struct bpf_verifier_env *env, int func_id, int insn
env->prog->has_callchain_buf = true;
}
+ if (func_id == BPF_FUNC_get_stackid || func_id == BPF_FUNC_get_stack)
+ env->prog->call_get_stack = true;
+
if (changes_data)
clear_all_pkt_pointers(env);
return 0;
@@ -5030,6 +5202,11 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env,
if (BPF_CLASS(insn->code) != BPF_ALU64) {
/* 32-bit ALU ops on pointers produce (meaningless) scalars */
+ if (opcode == BPF_SUB && env->allow_ptr_leaks) {
+ __mark_reg_unknown(env, dst_reg);
+ return 0;
+ }
+
verbose(env,
"R%d 32-bit pointer arithmetic prohibited\n",
dst);
@@ -6707,6 +6884,10 @@ static void mark_ptr_or_null_reg(struct bpf_func_state *state,
reg->type = PTR_TO_BTF_ID;
} else if (reg->type == PTR_TO_MEM_OR_NULL) {
reg->type = PTR_TO_MEM;
+ } else if (reg->type == PTR_TO_RDONLY_BUF_OR_NULL) {
+ reg->type = PTR_TO_RDONLY_BUF;
+ } else if (reg->type == PTR_TO_RDWR_BUF_OR_NULL) {
+ reg->type = PTR_TO_RDWR_BUF;
}
if (is_null) {
/* We don't need id and ref_obj_id from this point
@@ -7259,6 +7440,9 @@ static int check_return_code(struct bpf_verifier_env *env)
return -ENOTSUPP;
}
break;
+ case BPF_PROG_TYPE_SK_LOOKUP:
+ range = tnum_range(SK_DROP, SK_PASS);
+ break;
case BPF_PROG_TYPE_EXT:
/* freplace program can return anything as its return value
* depends on the to-be-replaced kernel func or bpf program.
@@ -8110,7 +8294,7 @@ static bool stacksafe(struct bpf_func_state *old,
if (old->stack[spi].slot_type[i % BPF_REG_SIZE] !=
cur->stack[spi].slot_type[i % BPF_REG_SIZE])
/* Ex: old explored (safe) state has STACK_SPILL in
- * this stack slot, but current has has STACK_MISC ->
+ * this stack slot, but current has STACK_MISC ->
* this verifier states are not equivalent,
* return false to continue verification of this path
*/
@@ -10953,6 +11137,7 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr,
env->strict_alignment = false;
env->allow_ptr_leaks = bpf_allow_ptr_leaks();
+ env->allow_ptr_to_map_access = bpf_allow_ptr_to_map_access();
env->bypass_spec_v1 = bpf_bypass_spec_v1();
env->bypass_spec_v4 = bpf_bypass_spec_v4();
env->bpf_capable = bpf_capable();
diff --git a/kernel/crash_core.c b/kernel/crash_core.c
index 18175687133a..106e4500fd53 100644
--- a/kernel/crash_core.c
+++ b/kernel/crash_core.c
@@ -11,6 +11,8 @@
#include <asm/page.h>
#include <asm/sections.h>
+#include <crypto/sha.h>
+
/* vmcoreinfo stuff */
unsigned char *vmcoreinfo_data;
size_t vmcoreinfo_size;
@@ -376,6 +378,53 @@ phys_addr_t __weak paddr_vmcoreinfo_note(void)
}
EXPORT_SYMBOL(paddr_vmcoreinfo_note);
+#define NOTES_SIZE (&__stop_notes - &__start_notes)
+#define BUILD_ID_MAX SHA1_DIGEST_SIZE
+#define NT_GNU_BUILD_ID 3
+
+struct elf_note_section {
+ struct elf_note n_hdr;
+ u8 n_data[];
+};
+
+/*
+ * Add build ID from .notes section as generated by the GNU ld(1)
+ * or LLVM lld(1) --build-id option.
+ */
+static void add_build_id_vmcoreinfo(void)
+{
+ char build_id[BUILD_ID_MAX * 2 + 1];
+ int n_remain = NOTES_SIZE;
+
+ while (n_remain >= sizeof(struct elf_note)) {
+ const struct elf_note_section *note_sec =
+ &__start_notes + NOTES_SIZE - n_remain;
+ const u32 n_namesz = note_sec->n_hdr.n_namesz;
+
+ if (note_sec->n_hdr.n_type == NT_GNU_BUILD_ID &&
+ n_namesz != 0 &&
+ !strcmp((char *)&note_sec->n_data[0], "GNU")) {
+ if (note_sec->n_hdr.n_descsz <= BUILD_ID_MAX) {
+ const u32 n_descsz = note_sec->n_hdr.n_descsz;
+ const u8 *s = &note_sec->n_data[n_namesz];
+
+ s = PTR_ALIGN(s, 4);
+ bin2hex(build_id, s, n_descsz);
+ build_id[2 * n_descsz] = '\0';
+ VMCOREINFO_BUILD_ID(build_id);
+ return;
+ }
+ pr_warn("Build ID is too large to include in vmcoreinfo: %u > %u\n",
+ note_sec->n_hdr.n_descsz,
+ BUILD_ID_MAX);
+ return;
+ }
+ n_remain -= sizeof(struct elf_note) +
+ ALIGN(note_sec->n_hdr.n_namesz, 4) +
+ ALIGN(note_sec->n_hdr.n_descsz, 4);
+ }
+}
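The same note walk can be done from userspace; a hedged sketch over a buffer that already holds the contents of a .note.gnu.build-id (or combined .notes) section — loading that buffer from the ELF file, and full bounds checking, are left out for brevity.

	#include <elf.h>
	#include <stdint.h>
	#include <stdio.h>
	#include <string.h>

	static void print_build_id(const uint8_t *notes, size_t len)
	{
		size_t off = 0;

		while (off + sizeof(Elf64_Nhdr) <= len) {
			const Elf64_Nhdr *nhdr = (const void *)(notes + off);
			const char *name = (const char *)(nhdr + 1);
			const uint8_t *desc = (const uint8_t *)name +
					      ((nhdr->n_namesz + 3) & ~3u);

			if (nhdr->n_type == NT_GNU_BUILD_ID &&
			    nhdr->n_namesz == 4 && !memcmp(name, "GNU", 4)) {
				for (uint32_t i = 0; i < nhdr->n_descsz; i++)
					printf("%02x", desc[i]);
				putchar('\n');
				return;
			}
			off += sizeof(*nhdr) + ((nhdr->n_namesz + 3) & ~3u) +
			       ((nhdr->n_descsz + 3) & ~3u);
		}
	}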
+
static int __init crash_save_vmcoreinfo_init(void)
{
vmcoreinfo_data = (unsigned char *)get_zeroed_page(GFP_KERNEL);
@@ -394,6 +443,7 @@ static int __init crash_save_vmcoreinfo_init(void)
}
VMCOREINFO_OSRELEASE(init_uts_ns.name.release);
+ add_build_id_vmcoreinfo();
VMCOREINFO_PAGESIZE(PAGE_SIZE);
VMCOREINFO_SYMBOL(init_uts_ns);
diff --git a/kernel/debug/debug_core.c b/kernel/debug/debug_core.c
index 9e5934780f41..b16dbc1bf056 100644
--- a/kernel/debug/debug_core.c
+++ b/kernel/debug/debug_core.c
@@ -1068,7 +1068,7 @@ static void kgdb_tasklet_bpt(unsigned long ing)
atomic_set(&kgdb_break_tasklet_var, 0);
}
-static DECLARE_TASKLET(kgdb_tasklet_breakpoint, kgdb_tasklet_bpt, 0);
+static DECLARE_TASKLET_OLD(kgdb_tasklet_breakpoint, kgdb_tasklet_bpt);
void kgdb_schedule_breakpoint(void)
{
diff --git a/kernel/debug/kdb/kdb_io.c b/kernel/debug/kdb/kdb_io.c
index 683a799618ad..9d847ab851db 100644
--- a/kernel/debug/kdb/kdb_io.c
+++ b/kernel/debug/kdb/kdb_io.c
@@ -591,7 +591,7 @@ int vkdb_printf(enum kdb_msgsrc src, const char *fmt, va_list ap)
int this_cpu, old_cpu;
char *cp, *cp2, *cphold = NULL, replaced_byte = ' ';
char *moreprompt = "more> ";
- unsigned long uninitialized_var(flags);
+ unsigned long flags;
/* Serialize kdb_printf if multiple cpus try to write at once.
* But if any cpu goes recursive in kdb, just print the output,
diff --git a/kernel/dma/Kconfig b/kernel/dma/Kconfig
index 1da3f44f2565..847a9d1fa634 100644
--- a/kernel/dma/Kconfig
+++ b/kernel/dma/Kconfig
@@ -1,10 +1,24 @@
# SPDX-License-Identifier: GPL-2.0-only
+config NO_DMA
+ bool
+
config HAS_DMA
bool
depends on !NO_DMA
default y
+config DMA_OPS
+ bool
+
+#
+# IOMMU drivers that can bypass the IOMMU code and optionally use the direct
+# mapping fast path should select this option and set the dma_ops_bypass
+# flag in struct device where applicable
+#
+config DMA_OPS_BYPASS
+ bool
+
config NEED_SG_DMA_LENGTH
bool
@@ -60,6 +74,7 @@ config DMA_NONCOHERENT_CACHE_SYNC
config DMA_VIRT_OPS
bool
depends on HAS_DMA
+ select DMA_OPS
config SWIOTLB
bool
@@ -174,11 +189,6 @@ config DMA_API_DEBUG
drivers like double-freeing of DMA mappings or freeing mappings that
were never allocated.
- This also attempts to catch cases where a page owned by DMA is
- accessed by the cpu in a way that could cause data corruption. For
- example, this enables cow_user_page() to check that the source page is
- not undergoing DMA.
-
This option causes a performance degradation. Use only if you want to
debug device drivers and dma interactions.
diff --git a/kernel/dma/Makefile b/kernel/dma/Makefile
index 370f63344e9c..32c7c1942bbd 100644
--- a/kernel/dma/Makefile
+++ b/kernel/dma/Makefile
@@ -1,6 +1,7 @@
# SPDX-License-Identifier: GPL-2.0
-obj-$(CONFIG_HAS_DMA) += mapping.o direct.o dummy.o
+obj-$(CONFIG_HAS_DMA) += mapping.o direct.o
+obj-$(CONFIG_DMA_OPS) += dummy.o
obj-$(CONFIG_DMA_CMA) += contiguous.o
obj-$(CONFIG_DMA_DECLARE_COHERENT) += coherent.o
obj-$(CONFIG_DMA_VIRT_OPS) += virt.o
diff --git a/kernel/dma/contiguous.c b/kernel/dma/contiguous.c
index 15bc5026c485..cff7e60968b9 100644
--- a/kernel/dma/contiguous.c
+++ b/kernel/dma/contiguous.c
@@ -215,6 +215,13 @@ bool dma_release_from_contiguous(struct device *dev, struct page *pages,
return cma_release(dev_get_cma_area(dev), pages, count);
}
+static struct page *cma_alloc_aligned(struct cma *cma, size_t size, gfp_t gfp)
+{
+ unsigned int align = min(get_order(size), CONFIG_CMA_ALIGNMENT);
+
+ return cma_alloc(cma, size >> PAGE_SHIFT, align, gfp & __GFP_NOWARN);
+}
+
/**
* dma_alloc_contiguous() - allocate contiguous pages
* @dev: Pointer to device for which the allocation is performed.
@@ -231,24 +238,14 @@ bool dma_release_from_contiguous(struct device *dev, struct page *pages,
*/
struct page *dma_alloc_contiguous(struct device *dev, size_t size, gfp_t gfp)
{
- size_t count = size >> PAGE_SHIFT;
- struct page *page = NULL;
- struct cma *cma = NULL;
-
- if (dev && dev->cma_area)
- cma = dev->cma_area;
- else if (count > 1)
- cma = dma_contiguous_default_area;
-
/* CMA can be used only in the context which permits sleeping */
- if (cma && gfpflags_allow_blocking(gfp)) {
- size_t align = get_order(size);
- size_t cma_align = min_t(size_t, align, CONFIG_CMA_ALIGNMENT);
-
- page = cma_alloc(cma, count, cma_align, gfp & __GFP_NOWARN);
- }
-
- return page;
+ if (!gfpflags_allow_blocking(gfp))
+ return NULL;
+ if (dev->cma_area)
+ return cma_alloc_aligned(dev->cma_area, size, gfp);
+ if (size <= PAGE_SIZE || !dma_contiguous_default_area)
+ return NULL;
+ return cma_alloc_aligned(dma_contiguous_default_area, size, gfp);
}
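In plain terms, the helper above asks CMA for an allocation aligned to min(get_order(size), CONFIG_CMA_ALIGNMENT); a hedged standalone illustration of the order computation, assuming 4 KiB pages:

	/* ceil(log2(size in pages)), i.e. what get_order() computes for 4K pages */
	static unsigned int order_for_size(unsigned long size)
	{
		unsigned long pages = (size + 4095) / 4096;
		unsigned int order = 0;

		while ((1UL << order) < pages)
			order++;
		return order;
	}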
/**
diff --git a/kernel/dma/debug.c b/kernel/dma/debug.c
index 36c962a86bf2..8e9f7b301c6d 100644
--- a/kernel/dma/debug.c
+++ b/kernel/dma/debug.c
@@ -144,8 +144,12 @@ static const char *type2name[] = {
[dma_debug_resource] = "resource",
};
-static const char *dir2name[4] = { "DMA_BIDIRECTIONAL", "DMA_TO_DEVICE",
- "DMA_FROM_DEVICE", "DMA_NONE" };
+static const char *dir2name[] = {
+ [DMA_BIDIRECTIONAL] = "DMA_BIDIRECTIONAL",
+ [DMA_TO_DEVICE] = "DMA_TO_DEVICE",
+ [DMA_FROM_DEVICE] = "DMA_FROM_DEVICE",
+ [DMA_NONE] = "DMA_NONE",
+};
/*
* The access to some variables in this macro is racy. We can't use atomic_t
@@ -444,9 +448,6 @@ void debug_dma_dump_mappings(struct device *dev)
* dma_active_cacheline entry to track per event. dma_map_sg(), on the
* other hand, consumes a single dma_debug_entry, but inserts 'nents'
* entries into the tree.
- *
- * At any time debug_dma_assert_idle() can be called to trigger a
- * warning if any cachelines in the given page are in the active set.
*/
static RADIX_TREE(dma_active_cacheline, GFP_NOWAIT);
static DEFINE_SPINLOCK(radix_lock);
@@ -493,10 +494,7 @@ static void active_cacheline_inc_overlap(phys_addr_t cln)
overlap = active_cacheline_set_overlap(cln, ++overlap);
/* If we overflowed the overlap counter then we're potentially
- * leaking dma-mappings. Otherwise, if maps and unmaps are
- * balanced then this overflow may cause false negatives in
- * debug_dma_assert_idle() as the cacheline may be marked idle
- * prematurely.
+ * leaking dma-mappings.
*/
WARN_ONCE(overlap > ACTIVE_CACHELINE_MAX_OVERLAP,
pr_fmt("exceeded %d overlapping mappings of cacheline %pa\n"),
@@ -551,53 +549,6 @@ static void active_cacheline_remove(struct dma_debug_entry *entry)
spin_unlock_irqrestore(&radix_lock, flags);
}
-/**
- * debug_dma_assert_idle() - assert that a page is not undergoing dma
- * @page: page to lookup in the dma_active_cacheline tree
- *
- * Place a call to this routine in cases where the cpu touching the page
- * before the dma completes (page is dma_unmapped) will lead to data
- * corruption.
- */
-void debug_dma_assert_idle(struct page *page)
-{
- static struct dma_debug_entry *ents[CACHELINES_PER_PAGE];
- struct dma_debug_entry *entry = NULL;
- void **results = (void **) &ents;
- unsigned int nents, i;
- unsigned long flags;
- phys_addr_t cln;
-
- if (dma_debug_disabled())
- return;
-
- if (!page)
- return;
-
- cln = (phys_addr_t) page_to_pfn(page) << CACHELINE_PER_PAGE_SHIFT;
- spin_lock_irqsave(&radix_lock, flags);
- nents = radix_tree_gang_lookup(&dma_active_cacheline, results, cln,
- CACHELINES_PER_PAGE);
- for (i = 0; i < nents; i++) {
- phys_addr_t ent_cln = to_cacheline_number(ents[i]);
-
- if (ent_cln == cln) {
- entry = ents[i];
- break;
- } else if (ent_cln >= cln + CACHELINES_PER_PAGE)
- break;
- }
- spin_unlock_irqrestore(&radix_lock, flags);
-
- if (!entry)
- return;
-
- cln = to_cacheline_number(entry);
- err_printk(entry->dev, entry,
- "cpu touching an active dma mapped cacheline [cln=%pa]\n",
- &cln);
-}
-
/*
* Wrapper function for adding an entry to the hash.
* This function takes care of locking itself.
@@ -882,7 +833,7 @@ static int device_dma_allocations(struct device *dev, struct dma_debug_entry **o
static int dma_debug_device_change(struct notifier_block *nb, unsigned long action, void *data)
{
struct device *dev = data;
- struct dma_debug_entry *uninitialized_var(entry);
+ struct dma_debug_entry *entry;
int count;
if (dma_debug_disabled())
@@ -1071,7 +1022,7 @@ static void check_unmap(struct dma_debug_entry *ref)
/*
* Drivers should use dma_mapping_error() to check the returned
* addresses of dma_map_single() and dma_map_page().
- * If not, print this warning message. See Documentation/DMA-API.txt.
+ * If not, print this warning message. See Documentation/core-api/dma-api.rst.
*/
if (entry->map_err_type == MAP_ERR_NOT_CHECKED) {
err_printk(ref->dev, entry,
diff --git a/kernel/dma/direct.c b/kernel/dma/direct.c
index 67f060b86a73..bb0041e99659 100644
--- a/kernel/dma/direct.c
+++ b/kernel/dma/direct.c
@@ -10,11 +10,9 @@
#include <linux/dma-direct.h>
#include <linux/scatterlist.h>
#include <linux/dma-contiguous.h>
-#include <linux/dma-noncoherent.h>
#include <linux/pfn.h>
#include <linux/vmalloc.h>
#include <linux/set_memory.h>
-#include <linux/swiotlb.h>
/*
* Most architectures use ZONE_DMA for the first 16 Megabytes, but some use it
@@ -304,19 +302,6 @@ void dma_direct_free(struct device *dev, size_t size,
#if defined(CONFIG_ARCH_HAS_SYNC_DMA_FOR_DEVICE) || \
defined(CONFIG_SWIOTLB)
-void dma_direct_sync_single_for_device(struct device *dev,
- dma_addr_t addr, size_t size, enum dma_data_direction dir)
-{
- phys_addr_t paddr = dma_to_phys(dev, addr);
-
- if (unlikely(is_swiotlb_buffer(paddr)))
- swiotlb_tbl_sync_single(dev, paddr, size, dir, SYNC_FOR_DEVICE);
-
- if (!dev_is_dma_coherent(dev))
- arch_sync_dma_for_device(paddr, size, dir);
-}
-EXPORT_SYMBOL(dma_direct_sync_single_for_device);
-
void dma_direct_sync_sg_for_device(struct device *dev,
struct scatterlist *sgl, int nents, enum dma_data_direction dir)
{
@@ -335,27 +320,11 @@ void dma_direct_sync_sg_for_device(struct device *dev,
dir);
}
}
-EXPORT_SYMBOL(dma_direct_sync_sg_for_device);
#endif
#if defined(CONFIG_ARCH_HAS_SYNC_DMA_FOR_CPU) || \
defined(CONFIG_ARCH_HAS_SYNC_DMA_FOR_CPU_ALL) || \
defined(CONFIG_SWIOTLB)
-void dma_direct_sync_single_for_cpu(struct device *dev,
- dma_addr_t addr, size_t size, enum dma_data_direction dir)
-{
- phys_addr_t paddr = dma_to_phys(dev, addr);
-
- if (!dev_is_dma_coherent(dev)) {
- arch_sync_dma_for_cpu(paddr, size, dir);
- arch_sync_dma_for_cpu_all();
- }
-
- if (unlikely(is_swiotlb_buffer(paddr)))
- swiotlb_tbl_sync_single(dev, paddr, size, dir, SYNC_FOR_CPU);
-}
-EXPORT_SYMBOL(dma_direct_sync_single_for_cpu);
-
void dma_direct_sync_sg_for_cpu(struct device *dev,
struct scatterlist *sgl, int nents, enum dma_data_direction dir)
{
@@ -376,20 +345,6 @@ void dma_direct_sync_sg_for_cpu(struct device *dev,
if (!dev_is_dma_coherent(dev))
arch_sync_dma_for_cpu_all();
}
-EXPORT_SYMBOL(dma_direct_sync_sg_for_cpu);
-
-void dma_direct_unmap_page(struct device *dev, dma_addr_t addr,
- size_t size, enum dma_data_direction dir, unsigned long attrs)
-{
- phys_addr_t phys = dma_to_phys(dev, addr);
-
- if (!(attrs & DMA_ATTR_SKIP_CPU_SYNC))
- dma_direct_sync_single_for_cpu(dev, addr, size, dir);
-
- if (unlikely(is_swiotlb_buffer(phys)))
- swiotlb_tbl_unmap_single(dev, phys, size, size, dir, attrs);
-}
-EXPORT_SYMBOL(dma_direct_unmap_page);
void dma_direct_unmap_sg(struct device *dev, struct scatterlist *sgl,
int nents, enum dma_data_direction dir, unsigned long attrs)
@@ -401,35 +356,8 @@ void dma_direct_unmap_sg(struct device *dev, struct scatterlist *sgl,
dma_direct_unmap_page(dev, sg->dma_address, sg_dma_len(sg), dir,
attrs);
}
-EXPORT_SYMBOL(dma_direct_unmap_sg);
#endif
-dma_addr_t dma_direct_map_page(struct device *dev, struct page *page,
- unsigned long offset, size_t size, enum dma_data_direction dir,
- unsigned long attrs)
-{
- phys_addr_t phys = page_to_phys(page) + offset;
- dma_addr_t dma_addr = phys_to_dma(dev, phys);
-
- if (unlikely(swiotlb_force == SWIOTLB_FORCE))
- return swiotlb_map(dev, phys, size, dir, attrs);
-
- if (unlikely(!dma_capable(dev, dma_addr, size, true))) {
- if (swiotlb_force != SWIOTLB_NO_FORCE)
- return swiotlb_map(dev, phys, size, dir, attrs);
-
- dev_WARN_ONCE(dev, 1,
- "DMA addr %pad+%zu overflow (mask %llx, bus limit %llx).\n",
- &dma_addr, size, *dev->dma_mask, dev->bus_dma_limit);
- return DMA_MAPPING_ERROR;
- }
-
- if (!dev_is_dma_coherent(dev) && !(attrs & DMA_ATTR_SKIP_CPU_SYNC))
- arch_sync_dma_for_device(phys, size, dir);
- return dma_addr;
-}
-EXPORT_SYMBOL(dma_direct_map_page);
-
int dma_direct_map_sg(struct device *dev, struct scatterlist *sgl, int nents,
enum dma_data_direction dir, unsigned long attrs)
{
@@ -450,7 +378,6 @@ out_unmap:
dma_direct_unmap_sg(dev, sgl, i, dir, attrs | DMA_ATTR_SKIP_CPU_SYNC);
return 0;
}
-EXPORT_SYMBOL(dma_direct_map_sg);
dma_addr_t dma_direct_map_resource(struct device *dev, phys_addr_t paddr,
size_t size, enum dma_data_direction dir, unsigned long attrs)
@@ -467,7 +394,6 @@ dma_addr_t dma_direct_map_resource(struct device *dev, phys_addr_t paddr,
return dma_addr;
}
-EXPORT_SYMBOL(dma_direct_map_resource);
int dma_direct_get_sgtable(struct device *dev, struct sg_table *sgt,
void *cpu_addr, dma_addr_t dma_addr, size_t size,
diff --git a/kernel/dma/mapping.c b/kernel/dma/mapping.c
index a8c18c9a796f..0d129421e75f 100644
--- a/kernel/dma/mapping.c
+++ b/kernel/dma/mapping.c
@@ -105,6 +105,196 @@ void *dmam_alloc_attrs(struct device *dev, size_t size, dma_addr_t *dma_handle,
}
EXPORT_SYMBOL(dmam_alloc_attrs);
+static bool dma_go_direct(struct device *dev, dma_addr_t mask,
+ const struct dma_map_ops *ops)
+{
+ if (likely(!ops))
+ return true;
+#ifdef CONFIG_DMA_OPS_BYPASS
+ if (dev->dma_ops_bypass)
+ return min_not_zero(mask, dev->bus_dma_limit) >=
+ dma_direct_get_required_mask(dev);
+#endif
+ return false;
+}
+
+
+/*
+ * Check if the device uses a direct mapping for streaming DMA operations.
+ * This allows IOMMU drivers to set a bypass mode if the DMA mask is large
+ * enough.
+ */
+static inline bool dma_alloc_direct(struct device *dev,
+ const struct dma_map_ops *ops)
+{
+ return dma_go_direct(dev, dev->coherent_dma_mask, ops);
+}
+
+static inline bool dma_map_direct(struct device *dev,
+ const struct dma_map_ops *ops)
+{
+ return dma_go_direct(dev, *dev->dma_mask, ops);
+}
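Put differently, with dma_ops_bypass set the direct path is taken only if the smaller of the device DMA mask and the bus limit covers every address the direct mapping could hand out; a hedged standalone restatement of that check:

	#include <stdbool.h>
	#include <stdint.h>

	static bool would_bypass(uint64_t dma_mask, uint64_t bus_dma_limit,
				 uint64_t direct_required_mask)
	{
		/* min_not_zero(): a zero bus limit means "no extra restriction" */
		uint64_t limit = (bus_dma_limit && bus_dma_limit < dma_mask) ?
				 bus_dma_limit : dma_mask;

		return limit >= direct_required_mask;
	}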
+
+dma_addr_t dma_map_page_attrs(struct device *dev, struct page *page,
+ size_t offset, size_t size, enum dma_data_direction dir,
+ unsigned long attrs)
+{
+ const struct dma_map_ops *ops = get_dma_ops(dev);
+ dma_addr_t addr;
+
+ BUG_ON(!valid_dma_direction(dir));
+ if (dma_map_direct(dev, ops))
+ addr = dma_direct_map_page(dev, page, offset, size, dir, attrs);
+ else
+ addr = ops->map_page(dev, page, offset, size, dir, attrs);
+ debug_dma_map_page(dev, page, offset, size, dir, addr);
+
+ return addr;
+}
+EXPORT_SYMBOL(dma_map_page_attrs);
+
+void dma_unmap_page_attrs(struct device *dev, dma_addr_t addr, size_t size,
+ enum dma_data_direction dir, unsigned long attrs)
+{
+ const struct dma_map_ops *ops = get_dma_ops(dev);
+
+ BUG_ON(!valid_dma_direction(dir));
+ if (dma_map_direct(dev, ops))
+ dma_direct_unmap_page(dev, addr, size, dir, attrs);
+ else if (ops->unmap_page)
+ ops->unmap_page(dev, addr, size, dir, attrs);
+ debug_dma_unmap_page(dev, addr, size, dir);
+}
+EXPORT_SYMBOL(dma_unmap_page_attrs);
+
+/*
+ * dma_map_sg_attrs returns 0 on error and > 0 on success.
+ * It should never return a value < 0.
+ */
+int dma_map_sg_attrs(struct device *dev, struct scatterlist *sg, int nents,
+ enum dma_data_direction dir, unsigned long attrs)
+{
+ const struct dma_map_ops *ops = get_dma_ops(dev);
+ int ents;
+
+ BUG_ON(!valid_dma_direction(dir));
+ if (dma_map_direct(dev, ops))
+ ents = dma_direct_map_sg(dev, sg, nents, dir, attrs);
+ else
+ ents = ops->map_sg(dev, sg, nents, dir, attrs);
+ BUG_ON(ents < 0);
+ debug_dma_map_sg(dev, sg, nents, ents, dir);
+
+ return ents;
+}
+EXPORT_SYMBOL(dma_map_sg_attrs);
+
+void dma_unmap_sg_attrs(struct device *dev, struct scatterlist *sg,
+ int nents, enum dma_data_direction dir,
+ unsigned long attrs)
+{
+ const struct dma_map_ops *ops = get_dma_ops(dev);
+
+ BUG_ON(!valid_dma_direction(dir));
+ debug_dma_unmap_sg(dev, sg, nents, dir);
+ if (dma_map_direct(dev, ops))
+ dma_direct_unmap_sg(dev, sg, nents, dir, attrs);
+ else if (ops->unmap_sg)
+ ops->unmap_sg(dev, sg, nents, dir, attrs);
+}
+EXPORT_SYMBOL(dma_unmap_sg_attrs);
+
+dma_addr_t dma_map_resource(struct device *dev, phys_addr_t phys_addr,
+ size_t size, enum dma_data_direction dir, unsigned long attrs)
+{
+ const struct dma_map_ops *ops = get_dma_ops(dev);
+ dma_addr_t addr = DMA_MAPPING_ERROR;
+
+ BUG_ON(!valid_dma_direction(dir));
+
+ /* Don't allow RAM to be mapped */
+ if (WARN_ON_ONCE(pfn_valid(PHYS_PFN(phys_addr))))
+ return DMA_MAPPING_ERROR;
+
+ if (dma_map_direct(dev, ops))
+ addr = dma_direct_map_resource(dev, phys_addr, size, dir, attrs);
+ else if (ops->map_resource)
+ addr = ops->map_resource(dev, phys_addr, size, dir, attrs);
+
+ debug_dma_map_resource(dev, phys_addr, size, dir, addr);
+ return addr;
+}
+EXPORT_SYMBOL(dma_map_resource);
+
+void dma_unmap_resource(struct device *dev, dma_addr_t addr, size_t size,
+ enum dma_data_direction dir, unsigned long attrs)
+{
+ const struct dma_map_ops *ops = get_dma_ops(dev);
+
+ BUG_ON(!valid_dma_direction(dir));
+ if (!dma_map_direct(dev, ops) && ops->unmap_resource)
+ ops->unmap_resource(dev, addr, size, dir, attrs);
+ debug_dma_unmap_resource(dev, addr, size, dir);
+}
+EXPORT_SYMBOL(dma_unmap_resource);
+
+void dma_sync_single_for_cpu(struct device *dev, dma_addr_t addr, size_t size,
+ enum dma_data_direction dir)
+{
+ const struct dma_map_ops *ops = get_dma_ops(dev);
+
+ BUG_ON(!valid_dma_direction(dir));
+ if (dma_map_direct(dev, ops))
+ dma_direct_sync_single_for_cpu(dev, addr, size, dir);
+ else if (ops->sync_single_for_cpu)
+ ops->sync_single_for_cpu(dev, addr, size, dir);
+ debug_dma_sync_single_for_cpu(dev, addr, size, dir);
+}
+EXPORT_SYMBOL(dma_sync_single_for_cpu);
+
+void dma_sync_single_for_device(struct device *dev, dma_addr_t addr,
+ size_t size, enum dma_data_direction dir)
+{
+ const struct dma_map_ops *ops = get_dma_ops(dev);
+
+ BUG_ON(!valid_dma_direction(dir));
+ if (dma_map_direct(dev, ops))
+ dma_direct_sync_single_for_device(dev, addr, size, dir);
+ else if (ops->sync_single_for_device)
+ ops->sync_single_for_device(dev, addr, size, dir);
+ debug_dma_sync_single_for_device(dev, addr, size, dir);
+}
+EXPORT_SYMBOL(dma_sync_single_for_device);
+
+void dma_sync_sg_for_cpu(struct device *dev, struct scatterlist *sg,
+ int nelems, enum dma_data_direction dir)
+{
+ const struct dma_map_ops *ops = get_dma_ops(dev);
+
+ BUG_ON(!valid_dma_direction(dir));
+ if (dma_map_direct(dev, ops))
+ dma_direct_sync_sg_for_cpu(dev, sg, nelems, dir);
+ else if (ops->sync_sg_for_cpu)
+ ops->sync_sg_for_cpu(dev, sg, nelems, dir);
+ debug_dma_sync_sg_for_cpu(dev, sg, nelems, dir);
+}
+EXPORT_SYMBOL(dma_sync_sg_for_cpu);
+
+void dma_sync_sg_for_device(struct device *dev, struct scatterlist *sg,
+ int nelems, enum dma_data_direction dir)
+{
+ const struct dma_map_ops *ops = get_dma_ops(dev);
+
+ BUG_ON(!valid_dma_direction(dir));
+ if (dma_map_direct(dev, ops))
+ dma_direct_sync_sg_for_device(dev, sg, nelems, dir);
+ else if (ops->sync_sg_for_device)
+ ops->sync_sg_for_device(dev, sg, nelems, dir);
+ debug_dma_sync_sg_for_device(dev, sg, nelems, dir);
+}
+EXPORT_SYMBOL(dma_sync_sg_for_device);
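For orientation (not part of the patch), a typical driver-side use of the streaming calls that now live out of line above; dev, buf and len are assumed to come from the surrounding driver.

	#include <linux/dma-mapping.h>

	static int send_buffer(struct device *dev, void *buf, size_t len)
	{
		dma_addr_t handle = dma_map_single(dev, buf, len, DMA_TO_DEVICE);

		if (dma_mapping_error(dev, handle))
			return -ENOMEM;
		/* ... hand "handle" to the hardware and wait for the transfer ... */
		dma_unmap_single(dev, handle, len, DMA_TO_DEVICE);
		return 0;
	}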
+
/*
* Create scatter-list for the already allocated DMA buffer.
*/
@@ -138,7 +328,7 @@ int dma_get_sgtable_attrs(struct device *dev, struct sg_table *sgt,
{
const struct dma_map_ops *ops = get_dma_ops(dev);
- if (dma_is_direct(ops))
+ if (dma_alloc_direct(dev, ops))
return dma_direct_get_sgtable(dev, sgt, cpu_addr, dma_addr,
size, attrs);
if (!ops->get_sgtable)
@@ -208,7 +398,7 @@ bool dma_can_mmap(struct device *dev)
{
const struct dma_map_ops *ops = get_dma_ops(dev);
- if (dma_is_direct(ops))
+ if (dma_alloc_direct(dev, ops))
return dma_direct_can_mmap(dev);
return ops->mmap != NULL;
}
@@ -233,7 +423,7 @@ int dma_mmap_attrs(struct device *dev, struct vm_area_struct *vma,
{
const struct dma_map_ops *ops = get_dma_ops(dev);
- if (dma_is_direct(ops))
+ if (dma_alloc_direct(dev, ops))
return dma_direct_mmap(dev, vma, cpu_addr, dma_addr, size,
attrs);
if (!ops->mmap)
@@ -246,7 +436,7 @@ u64 dma_get_required_mask(struct device *dev)
{
const struct dma_map_ops *ops = get_dma_ops(dev);
- if (dma_is_direct(ops))
+ if (dma_alloc_direct(dev, ops))
return dma_direct_get_required_mask(dev);
if (ops->get_required_mask)
return ops->get_required_mask(dev);
@@ -277,7 +467,7 @@ void *dma_alloc_attrs(struct device *dev, size_t size, dma_addr_t *dma_handle,
/* let the implementation decide on the zone to allocate from: */
flag &= ~(__GFP_DMA | __GFP_DMA32 | __GFP_HIGHMEM);
- if (dma_is_direct(ops))
+ if (dma_alloc_direct(dev, ops))
cpu_addr = dma_direct_alloc(dev, size, dma_handle, flag, attrs);
else if (ops->alloc)
cpu_addr = ops->alloc(dev, size, dma_handle, flag, attrs);
@@ -309,7 +499,7 @@ void dma_free_attrs(struct device *dev, size_t size, void *cpu_addr,
return;
debug_dma_free_coherent(dev, size, cpu_addr, dma_handle);
- if (dma_is_direct(ops))
+ if (dma_alloc_direct(dev, ops))
dma_direct_free(dev, size, cpu_addr, dma_handle, attrs);
else if (ops->free)
ops->free(dev, size, cpu_addr, dma_handle, attrs);
@@ -320,7 +510,11 @@ int dma_supported(struct device *dev, u64 mask)
{
const struct dma_map_ops *ops = get_dma_ops(dev);
- if (dma_is_direct(ops))
+ /*
+ * ->dma_supported sets the bypass flag, so we must always call
+ * into the method here unless the device is truly direct mapped.
+ */
+ if (!ops)
return dma_direct_supported(dev, mask);
if (!ops->dma_supported)
return 1;
@@ -376,7 +570,7 @@ void dma_cache_sync(struct device *dev, void *vaddr, size_t size,
BUG_ON(!valid_dma_direction(dir));
- if (dma_is_direct(ops))
+ if (dma_alloc_direct(dev, ops))
arch_dma_cache_sync(dev, vaddr, size, dir);
else if (ops->cache_sync)
ops->cache_sync(dev, vaddr, size, dir);
@@ -388,7 +582,7 @@ size_t dma_max_mapping_size(struct device *dev)
const struct dma_map_ops *ops = get_dma_ops(dev);
size_t size = SIZE_MAX;
- if (dma_is_direct(ops))
+ if (dma_map_direct(dev, ops))
size = dma_direct_max_mapping_size(dev);
else if (ops && ops->max_mapping_size)
size = ops->max_mapping_size(dev);
@@ -401,7 +595,7 @@ bool dma_need_sync(struct device *dev, dma_addr_t dma_addr)
{
const struct dma_map_ops *ops = get_dma_ops(dev);
- if (dma_is_direct(ops))
+ if (dma_map_direct(dev, ops))
return dma_direct_need_sync(dev, dma_addr);
return ops->sync_single_for_cpu || ops->sync_single_for_device;
}
diff --git a/kernel/entry/Makefile b/kernel/entry/Makefile
new file mode 100644
index 000000000000..34c8a3f1c735
--- /dev/null
+++ b/kernel/entry/Makefile
@@ -0,0 +1,13 @@
+# SPDX-License-Identifier: GPL-2.0
+
+# Prevent the noinstr section from being pestered by sanitizer and other goodies
+# as long as these things cannot be disabled per function.
+KASAN_SANITIZE := n
+UBSAN_SANITIZE := n
+KCOV_INSTRUMENT := n
+
+CFLAGS_REMOVE_common.o = -fstack-protector -fstack-protector-strong
+CFLAGS_common.o += -fno-stack-protector
+
+obj-$(CONFIG_GENERIC_ENTRY) += common.o
+obj-$(CONFIG_KVM_XFER_TO_GUEST_WORK) += kvm.o
diff --git a/kernel/entry/common.c b/kernel/entry/common.c
new file mode 100644
index 000000000000..9852e0d62d95
--- /dev/null
+++ b/kernel/entry/common.c
@@ -0,0 +1,374 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include <linux/context_tracking.h>
+#include <linux/entry-common.h>
+#include <linux/livepatch.h>
+#include <linux/audit.h>
+
+#define CREATE_TRACE_POINTS
+#include <trace/events/syscalls.h>
+
+/**
+ * enter_from_user_mode - Establish state when coming from user mode
+ *
+ * Syscall/interrupt entry disables interrupts, but user mode is traced as
+ * interrupts enabled. Also with NO_HZ_FULL RCU might be idle.
+ *
+ * 1) Tell lockdep that interrupts are disabled
+ * 2) Invoke context tracking if enabled to reactivate RCU
+ * 3) Trace interrupts off state
+ */
+static __always_inline void enter_from_user_mode(struct pt_regs *regs)
+{
+ arch_check_user_regs(regs);
+ lockdep_hardirqs_off(CALLER_ADDR0);
+
+ CT_WARN_ON(ct_state() != CONTEXT_USER);
+ user_exit_irqoff();
+
+ instrumentation_begin();
+ trace_hardirqs_off_finish();
+ instrumentation_end();
+}
+
+static inline void syscall_enter_audit(struct pt_regs *regs, long syscall)
+{
+ if (unlikely(audit_context())) {
+ unsigned long args[6];
+
+ syscall_get_arguments(current, regs, args);
+ audit_syscall_entry(syscall, args[0], args[1], args[2], args[3]);
+ }
+}
+
+static long syscall_trace_enter(struct pt_regs *regs, long syscall,
+ unsigned long ti_work)
+{
+ long ret = 0;
+
+ /* Handle ptrace */
+ if (ti_work & (_TIF_SYSCALL_TRACE | _TIF_SYSCALL_EMU)) {
+ ret = arch_syscall_enter_tracehook(regs);
+ if (ret || (ti_work & _TIF_SYSCALL_EMU))
+ return -1L;
+ }
+
+ /* Do seccomp after ptrace, to catch any tracer changes. */
+ if (ti_work & _TIF_SECCOMP) {
+ ret = __secure_computing(NULL);
+ if (ret == -1L)
+ return ret;
+ }
+
+ if (unlikely(ti_work & _TIF_SYSCALL_TRACEPOINT))
+ trace_sys_enter(regs, syscall);
+
+ syscall_enter_audit(regs, syscall);
+
+ return ret ? : syscall;
+}
+
+noinstr long syscall_enter_from_user_mode(struct pt_regs *regs, long syscall)
+{
+ unsigned long ti_work;
+
+ enter_from_user_mode(regs);
+ instrumentation_begin();
+
+ local_irq_enable();
+ ti_work = READ_ONCE(current_thread_info()->flags);
+ if (ti_work & SYSCALL_ENTER_WORK)
+ syscall = syscall_trace_enter(regs, syscall, ti_work);
+ instrumentation_end();
+
+ return syscall;
+}
+
+/**
+ * exit_to_user_mode - Fixup state when exiting to user mode
+ *
+ * Syscall/interrupt exit enables interrupts, but the kernel state is
+ * interrupts disabled when this is invoked. Also tell RCU about it.
+ *
+ * 1) Trace interrupts on state
+ * 2) Invoke context tracking if enabled to adjust RCU state
+ * 3) Invoke architecture specific last minute exit code, e.g. speculation
+ * mitigations, etc.
+ * 4) Tell lockdep that interrupts are enabled
+ */
+static __always_inline void exit_to_user_mode(void)
+{
+ instrumentation_begin();
+ trace_hardirqs_on_prepare();
+ lockdep_hardirqs_on_prepare(CALLER_ADDR0);
+ instrumentation_end();
+
+ user_enter_irqoff();
+ arch_exit_to_user_mode();
+ lockdep_hardirqs_on(CALLER_ADDR0);
+}
+
+/* Workaround to allow gradual conversion of architecture code */
+void __weak arch_do_signal(struct pt_regs *regs) { }
+
+static unsigned long exit_to_user_mode_loop(struct pt_regs *regs,
+ unsigned long ti_work)
+{
+ /*
+ * Before returning to user space ensure that all pending work
+ * items have been completed.
+ */
+ while (ti_work & EXIT_TO_USER_MODE_WORK) {
+
+ local_irq_enable_exit_to_user(ti_work);
+
+ if (ti_work & _TIF_NEED_RESCHED)
+ schedule();
+
+ if (ti_work & _TIF_UPROBE)
+ uprobe_notify_resume(regs);
+
+ if (ti_work & _TIF_PATCH_PENDING)
+ klp_update_patch_state(current);
+
+ if (ti_work & _TIF_SIGPENDING)
+ arch_do_signal(regs);
+
+ if (ti_work & _TIF_NOTIFY_RESUME) {
+ clear_thread_flag(TIF_NOTIFY_RESUME);
+ tracehook_notify_resume(regs);
+ rseq_handle_notify_resume(NULL, regs);
+ }
+
+ /* Architecture specific TIF work */
+ arch_exit_to_user_mode_work(regs, ti_work);
+
+ /*
+ * Disable interrupts and reevaluate the work flags as they
+ * might have changed while interrupts and preemption were
+ * enabled above.
+ */
+ local_irq_disable_exit_to_user();
+ ti_work = READ_ONCE(current_thread_info()->flags);
+ }
+
+ /* Return the latest work state for arch_exit_to_user_mode() */
+ return ti_work;
+}
+
+static void exit_to_user_mode_prepare(struct pt_regs *regs)
+{
+ unsigned long ti_work = READ_ONCE(current_thread_info()->flags);
+
+ lockdep_assert_irqs_disabled();
+
+ if (unlikely(ti_work & EXIT_TO_USER_MODE_WORK))
+ ti_work = exit_to_user_mode_loop(regs, ti_work);
+
+ arch_exit_to_user_mode_prepare(regs, ti_work);
+
+ /* Ensure that the address limit is intact and no locks are held */
+ addr_limit_user_check();
+ lockdep_assert_irqs_disabled();
+ lockdep_sys_exit();
+}
+
+#ifndef _TIF_SINGLESTEP
+static inline bool report_single_step(unsigned long ti_work)
+{
+ return false;
+}
+#else
+/*
+ * If TIF_SYSCALL_EMU is set, then the only reason to report is when
+ * TIF_SINGLESTEP is set (i.e. PTRACE_SYSEMU_SINGLESTEP). This syscall
+ * instruction has been already reported in syscall_enter_from_usermode().
+ */
+#define SYSEMU_STEP (_TIF_SINGLESTEP | _TIF_SYSCALL_EMU)
+
+static inline bool report_single_step(unsigned long ti_work)
+{
+ return (ti_work & SYSEMU_STEP) == _TIF_SINGLESTEP;
+}
+#endif
+
+static void syscall_exit_work(struct pt_regs *regs, unsigned long ti_work)
+{
+ bool step;
+
+ audit_syscall_exit(regs);
+
+ if (ti_work & _TIF_SYSCALL_TRACEPOINT)
+ trace_sys_exit(regs, syscall_get_return_value(current, regs));
+
+ step = report_single_step(ti_work);
+ if (step || ti_work & _TIF_SYSCALL_TRACE)
+ arch_syscall_exit_tracehook(regs, step);
+}
+
+/*
+ * Syscall specific exit to user mode preparation. Runs with interrupts
+ * enabled.
+ */
+static void syscall_exit_to_user_mode_prepare(struct pt_regs *regs)
+{
+ u32 cached_flags = READ_ONCE(current_thread_info()->flags);
+ unsigned long nr = syscall_get_nr(current, regs);
+
+ CT_WARN_ON(ct_state() != CONTEXT_KERNEL);
+
+ if (IS_ENABLED(CONFIG_PROVE_LOCKING)) {
+ if (WARN(irqs_disabled(), "syscall %lu left IRQs disabled", nr))
+ local_irq_enable();
+ }
+
+ rseq_syscall(regs);
+
+ /*
+ * Do one-time syscall specific work. If these work items are
+ * enabled, we want to run them exactly once per syscall exit with
+ * interrupts enabled.
+ */
+ if (unlikely(cached_flags & SYSCALL_EXIT_WORK))
+ syscall_exit_work(regs, cached_flags);
+}
+
+__visible noinstr void syscall_exit_to_user_mode(struct pt_regs *regs)
+{
+ instrumentation_begin();
+ syscall_exit_to_user_mode_prepare(regs);
+ local_irq_disable_exit_to_user();
+ exit_to_user_mode_prepare(regs);
+ instrumentation_end();
+ exit_to_user_mode();
+}
+
+noinstr void irqentry_enter_from_user_mode(struct pt_regs *regs)
+{
+ enter_from_user_mode(regs);
+}
+
+noinstr void irqentry_exit_to_user_mode(struct pt_regs *regs)
+{
+ instrumentation_begin();
+ exit_to_user_mode_prepare(regs);
+ instrumentation_end();
+ exit_to_user_mode();
+}
+
+noinstr irqentry_state_t irqentry_enter(struct pt_regs *regs)
+{
+ irqentry_state_t ret = {
+ .exit_rcu = false,
+ };
+
+ if (user_mode(regs)) {
+ irqentry_enter_from_user_mode(regs);
+ return ret;
+ }
+
+ /*
+ * If this entry hit the idle task, invoke rcu_irq_enter() whether
+ * RCU is watching or not.
+ *
+ * Interrupts can nest when the first interrupt invokes softirq
+ * processing on return which enables interrupts.
+ *
+ * Scheduler ticks in the idle task can mark quiescent state and
+ * terminate a grace period, if and only if the timer interrupt is
+ * not nested into another interrupt.
+ *
+ * Checking for __rcu_is_watching() here would prevent the nesting
+ * interrupt from invoking rcu_irq_enter(). If that nested interrupt is
+ * the tick then rcu_flavor_sched_clock_irq() would wrongfully
+ * assume that it is the first interrupt and eventually claim
+ * quiescent state and end grace periods prematurely.
+ *
+ * Unconditionally invoke rcu_irq_enter() so RCU state stays
+ * consistent.
+ *
+ * TINY_RCU does not support EQS, so let the compiler eliminate
+ * this part when enabled.
+ */
+ if (!IS_ENABLED(CONFIG_TINY_RCU) && is_idle_task(current)) {
+ /*
+ * If RCU is not watching then the same careful
+ * sequence vs. lockdep and tracing is required
+ * as in irqentry_enter_from_user_mode().
+ */
+ lockdep_hardirqs_off(CALLER_ADDR0);
+ rcu_irq_enter();
+ instrumentation_begin();
+ trace_hardirqs_off_finish();
+ instrumentation_end();
+
+ ret.exit_rcu = true;
+ return ret;
+ }
+
+ /*
+ * If RCU is watching then RCU only wants to check whether it needs
+ * to restart the tick in NOHZ mode. rcu_irq_enter_check_tick()
+ * already contains a warning when RCU is not watching, so no point
+ * in having another one here.
+ */
+ instrumentation_begin();
+ rcu_irq_enter_check_tick();
+ /* Use the combo lockdep/tracing function */
+ trace_hardirqs_off();
+ instrumentation_end();
+
+ return ret;
+}
+
+void irqentry_exit_cond_resched(void)
+{
+ if (!preempt_count()) {
+ /* Sanity check RCU and thread stack */
+ rcu_irq_exit_check_preempt();
+ if (IS_ENABLED(CONFIG_DEBUG_ENTRY))
+ WARN_ON_ONCE(!on_thread_stack());
+ if (need_resched())
+ preempt_schedule_irq();
+ }
+}
+
+noinstr void irqentry_exit(struct pt_regs *regs, irqentry_state_t state)
+{
+ lockdep_assert_irqs_disabled();
+
+ /* Check whether this returns to user mode */
+ if (user_mode(regs)) {
+ irqentry_exit_to_user_mode(regs);
+ } else if (!regs_irqs_disabled(regs)) {
+ /*
+ * If RCU was not watching on entry this needs to be done
+ * carefully and needs the same ordering of lockdep/tracing
+ * and RCU as the return to user mode path.
+ */
+ if (state.exit_rcu) {
+ instrumentation_begin();
+ /* Tell the tracer that IRET will enable interrupts */
+ trace_hardirqs_on_prepare();
+ lockdep_hardirqs_on_prepare(CALLER_ADDR0);
+ instrumentation_end();
+ rcu_irq_exit();
+ lockdep_hardirqs_on(CALLER_ADDR0);
+ return;
+ }
+
+ instrumentation_begin();
+ if (IS_ENABLED(CONFIG_PREEMPTION))
+ irqentry_exit_cond_resched();
+ /* Covers both tracing and lockdep */
+ trace_hardirqs_on();
+ instrumentation_end();
+ } else {
+ /*
+ * IRQ flags state is correct already. Just tell RCU if it
+ * was not watching on entry.
+ */
+ if (state.exit_rcu)
+ rcu_irq_exit();
+ }
+}
diff --git a/kernel/entry/kvm.c b/kernel/entry/kvm.c
new file mode 100644
index 000000000000..eb1a8a4c867c
--- /dev/null
+++ b/kernel/entry/kvm.c
@@ -0,0 +1,51 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include <linux/entry-kvm.h>
+#include <linux/kvm_host.h>
+
+static int xfer_to_guest_mode_work(struct kvm_vcpu *vcpu, unsigned long ti_work)
+{
+ do {
+ int ret;
+
+ if (ti_work & _TIF_SIGPENDING) {
+ kvm_handle_signal_exit(vcpu);
+ return -EINTR;
+ }
+
+ if (ti_work & _TIF_NEED_RESCHED)
+ schedule();
+
+ if (ti_work & _TIF_NOTIFY_RESUME) {
+ clear_thread_flag(TIF_NOTIFY_RESUME);
+ tracehook_notify_resume(NULL);
+ }
+
+ ret = arch_xfer_to_guest_mode_handle_work(vcpu, ti_work);
+ if (ret)
+ return ret;
+
+ ti_work = READ_ONCE(current_thread_info()->flags);
+ } while (ti_work & XFER_TO_GUEST_MODE_WORK || need_resched());
+ return 0;
+}
+
+int xfer_to_guest_mode_handle_work(struct kvm_vcpu *vcpu)
+{
+ unsigned long ti_work;
+
+ /*
+ * This is invoked from the outer guest loop with interrupts and
+ * preemption enabled.
+ *
+ * KVM invokes xfer_to_guest_mode_work_pending() with interrupts
+ * disabled in the inner loop before going into guest mode. No need
+ * to disable interrupts here.
+ */
+ ti_work = READ_ONCE(current_thread_info()->flags);
+ if (!(ti_work & XFER_TO_GUEST_MODE_WORK))
+ return 0;
+
+ return xfer_to_guest_mode_work(vcpu, ti_work);
+}
+EXPORT_SYMBOL_GPL(xfer_to_guest_mode_handle_work);
diff --git a/kernel/events/callchain.c b/kernel/events/callchain.c
index 334d48b16c36..58cbe357fb2b 100644
--- a/kernel/events/callchain.c
+++ b/kernel/events/callchain.c
@@ -149,7 +149,7 @@ void put_callchain_buffers(void)
}
}
-static struct perf_callchain_entry *get_callchain_entry(int *rctx)
+struct perf_callchain_entry *get_callchain_entry(int *rctx)
{
int cpu;
struct callchain_cpus_entries *entries;
@@ -159,8 +159,10 @@ static struct perf_callchain_entry *get_callchain_entry(int *rctx)
return NULL;
entries = rcu_dereference(callchain_cpus_entries);
- if (!entries)
+ if (!entries) {
+ put_recursion_context(this_cpu_ptr(callchain_recursion), *rctx);
return NULL;
+ }
cpu = smp_processor_id();
@@ -168,7 +170,7 @@ static struct perf_callchain_entry *get_callchain_entry(int *rctx)
(*rctx * perf_callchain_entry__sizeof()));
}
-static void
+void
put_callchain_entry(int rctx)
{
put_recursion_context(this_cpu_ptr(callchain_recursion), rctx);
@@ -183,11 +185,8 @@ get_perf_callchain(struct pt_regs *regs, u32 init_nr, bool kernel, bool user,
int rctx;
entry = get_callchain_entry(&rctx);
- if (rctx == -1)
- return NULL;
-
if (!entry)
- goto exit_put;
+ return NULL;
ctx.entry = entry;
ctx.max_stack = max_stack;
@@ -218,10 +217,9 @@ get_perf_callchain(struct pt_regs *regs, u32 init_nr, bool kernel, bool user,
if (add_mark)
perf_callchain_store_context(&ctx, PERF_CONTEXT_USER);
- fs = get_fs();
- set_fs(USER_DS);
+ fs = force_uaccess_begin();
perf_callchain_user(&ctx, regs);
- set_fs(fs);
+ force_uaccess_end(fs);
}
}
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 7c436d705fbd..5bfe8e3c6e44 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -6453,10 +6453,9 @@ perf_output_sample_ustack(struct perf_output_handle *handle, u64 dump_size,
/* Data. */
sp = perf_user_stack_pointer(regs);
- fs = get_fs();
- set_fs(USER_DS);
+ fs = force_uaccess_begin();
rem = __output_copy_user(handle, (void *) sp, dump_size);
- set_fs(fs);
+ force_uaccess_end(fs);
dyn_size = dump_size - rem;
perf_output_skip(handle, rem);
@@ -9644,6 +9643,24 @@ static int perf_event_set_bpf_handler(struct perf_event *event, u32 prog_fd)
if (IS_ERR(prog))
return PTR_ERR(prog);
+ if (event->attr.precise_ip &&
+ prog->call_get_stack &&
+ (!(event->attr.sample_type & __PERF_SAMPLE_CALLCHAIN_EARLY) ||
+ event->attr.exclude_callchain_kernel ||
+ event->attr.exclude_callchain_user)) {
+ /*
+ * On perf_event with precise_ip, calling bpf_get_stack()
+ * may trigger unwinder warnings and occasional crashes.
+ * bpf_get_[stack|stackid] works around this issue by using
+ * callchain attached to perf_sample_data. If the
+ * perf_event does not have a full (kernel and user) callchain
+ * attached to perf_sample_data, do not allow attaching a BPF
+ * program that calls bpf_get_[stack|stackid].
+ */
+ bpf_prog_put(prog);
+ return -EPROTO;
+ }
+
event->prog = prog;
event->orig_overflow_handler = READ_ONCE(event->overflow_handler);
WRITE_ONCE(event->overflow_handler, bpf_overflow_handler);
@@ -11585,7 +11602,7 @@ SYSCALL_DEFINE5(perf_event_open,
struct perf_event *group_leader = NULL, *output_event = NULL;
struct perf_event *event, *sibling;
struct perf_event_attr attr;
- struct perf_event_context *ctx, *uninitialized_var(gctx);
+ struct perf_event_context *ctx, *gctx;
struct file *event_file = NULL;
struct fd group = {NULL, 0};
struct task_struct *task = NULL;
@@ -11689,7 +11706,7 @@ SYSCALL_DEFINE5(perf_event_open,
goto err_task;
/*
- * Reuse ptrace permission checks for now.
+ * Preserve ptrace permission check for backwards compatibility.
*
* We must hold exec_update_mutex across this and any potential
* perf_install_in_context() call for this new event to
@@ -11697,7 +11714,7 @@ SYSCALL_DEFINE5(perf_event_open,
* perf_event_exit_task() that could imply).
*/
err = -EACCES;
- if (!ptrace_may_access(task, PTRACE_MODE_READ_REALCREDS))
+ if (!perfmon_capable() && !ptrace_may_access(task, PTRACE_MODE_READ_REALCREDS))
goto err_cred;
}
diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c
index 5f8b0c52fd2e..649fd53dc9ad 100644
--- a/kernel/events/uprobes.c
+++ b/kernel/events/uprobes.c
@@ -184,7 +184,7 @@ static int __replace_page(struct vm_area_struct *vma, unsigned long addr,
if (new_page) {
get_page(new_page);
page_add_new_anon_rmap(new_page, vma, addr, false);
- lru_cache_add_active_or_unevictable(new_page, vma);
+ lru_cache_add_inactive_or_unevictable(new_page, vma);
} else
/* no new page, just dec_mm_counter for old_page */
dec_mm_counter(mm, MM_ANONPAGES);
@@ -376,7 +376,7 @@ __update_ref_ctr(struct mm_struct *mm, unsigned long vaddr, short d)
if (!vaddr || !d)
return -EINVAL;
- ret = get_user_pages_remote(NULL, mm, vaddr, 1,
+ ret = get_user_pages_remote(mm, vaddr, 1,
FOLL_WRITE, &page, &vma, NULL);
if (unlikely(ret <= 0)) {
/*
@@ -477,7 +477,7 @@ retry:
if (is_register)
gup_flags |= FOLL_SPLIT_PMD;
/* Read the page with vaddr into memory */
- ret = get_user_pages_remote(NULL, mm, vaddr, 1, gup_flags,
+ ret = get_user_pages_remote(mm, vaddr, 1, gup_flags,
&old_page, &vma, NULL);
if (ret <= 0)
return ret;
@@ -2029,7 +2029,7 @@ static int is_trap_at_addr(struct mm_struct *mm, unsigned long vaddr)
* but we treat this as a 'remote' access since it is
* essentially a kernel access to the memory.
*/
- result = get_user_pages_remote(NULL, mm, vaddr, 1, FOLL_FORCE, &page,
+ result = get_user_pages_remote(mm, vaddr, 1, FOLL_FORCE, &page,
NULL, NULL);
if (result < 0)
return result;
@@ -2189,7 +2189,7 @@ static void handle_swbp(struct pt_regs *regs)
{
struct uprobe *uprobe;
unsigned long bp_vaddr;
- int uninitialized_var(is_swbp);
+ int is_swbp;
bp_vaddr = uprobe_get_swbp_addr(regs);
if (bp_vaddr == get_trampoline_vaddr())
diff --git a/kernel/exit.c b/kernel/exit.c
index 727150f28103..733e80f334e7 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -93,7 +93,7 @@ static void __exit_signal(struct task_struct *tsk)
struct signal_struct *sig = tsk->signal;
bool group_dead = thread_group_leader(tsk);
struct sighand_struct *sighand;
- struct tty_struct *uninitialized_var(tty);
+ struct tty_struct *tty;
u64 utime, stime;
sighand = rcu_dereference_check(tsk->sighand,
@@ -217,6 +217,7 @@ repeat:
}
write_unlock_irq(&tasklist_lock);
+ seccomp_filter_release(p);
proc_flush_pid(thread_pid);
put_pid(thread_pid);
release_thread(p);
@@ -731,7 +732,7 @@ void __noreturn do_exit(long code)
* mm_release()->clear_child_tid() from writing to a user-controlled
* kernel address.
*/
- set_fs(USER_DS);
+ force_uaccess_begin();
if (unlikely(in_atomic())) {
pr_info("note: %s[%d] exited with preempt_count %d\n",
@@ -804,7 +805,6 @@ void __noreturn do_exit(long code)
exit_task_namespaces(tsk);
exit_task_work(tsk);
exit_thread(tsk);
- exit_umh(tsk);
/*
* Flush inherited counters to the parent - before the parent
@@ -1626,6 +1626,22 @@ long kernel_wait4(pid_t upid, int __user *stat_addr, int options,
return ret;
}
+int kernel_wait(pid_t pid, int *stat)
+{
+ struct wait_opts wo = {
+ .wo_type = PIDTYPE_PID,
+ .wo_pid = find_get_pid(pid),
+ .wo_flags = WEXITED,
+ };
+ int ret;
+
+ ret = do_wait(&wo);
+ if (ret > 0 && wo.wo_stat)
+ *stat = wo.wo_stat;
+ put_pid(wo.wo_pid);
+ return ret;
+}
+
SYSCALL_DEFINE4(wait4, pid_t, upid, int __user *, stat_addr,
int, options, struct rusage __user *, ru)
{
@@ -1711,6 +1727,30 @@ Efault:
}
#endif
+/**
+ * thread_group_exited - check that a thread group has exited
+ * @pid: tgid of thread group to be checked.
+ *
+ * Test if the thread group represented by tgid has exited (all
+ * threads are zombies, dead or completely gone).
+ *
+ * Return: true if the thread group has exited. false otherwise.
+ */
+bool thread_group_exited(struct pid *pid)
+{
+ struct task_struct *task;
+ bool exited;
+
+ rcu_read_lock();
+ task = pid_task(pid, PIDTYPE_PID);
+ exited = !task ||
+ (READ_ONCE(task->exit_state) && thread_group_empty(task));
+ rcu_read_unlock();
+
+ return exited;
+}
+EXPORT_SYMBOL(thread_group_exited);
+
__weak void abort(void)
{
BUG();
diff --git a/kernel/fork.c b/kernel/fork.c
index 2a8e7287a558..4d32190861bd 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -261,7 +261,7 @@ static unsigned long *alloc_thread_stack_node(struct task_struct *tsk, int node)
THREAD_SIZE_ORDER);
if (likely(page)) {
- tsk->stack = page_address(page);
+ tsk->stack = kasan_reset_tag(page_address(page));
return tsk->stack;
}
return NULL;
@@ -276,13 +276,8 @@ static inline void free_thread_stack(struct task_struct *tsk)
if (vm) {
int i;
- for (i = 0; i < THREAD_SIZE / PAGE_SIZE; i++) {
- mod_memcg_page_state(vm->pages[i],
- MEMCG_KERNEL_STACK_KB,
- -(int)(PAGE_SIZE / 1024));
-
+ for (i = 0; i < THREAD_SIZE / PAGE_SIZE; i++)
memcg_kmem_uncharge_page(vm->pages[i], 0);
- }
for (i = 0; i < NR_CACHED_STACKS; i++) {
if (this_cpu_cmpxchg(cached_stacks[i],
@@ -307,6 +302,7 @@ static unsigned long *alloc_thread_stack_node(struct task_struct *tsk,
{
unsigned long *stack;
stack = kmem_cache_alloc_node(thread_stack_cache, THREADINFO_GFP, node);
+ stack = kasan_reset_tag(stack);
tsk->stack = stack;
return stack;
}
@@ -382,31 +378,14 @@ static void account_kernel_stack(struct task_struct *tsk, int account)
void *stack = task_stack_page(tsk);
struct vm_struct *vm = task_stack_vm_area(tsk);
- BUILD_BUG_ON(IS_ENABLED(CONFIG_VMAP_STACK) && PAGE_SIZE % 1024 != 0);
-
- if (vm) {
- int i;
-
- BUG_ON(vm->nr_pages != THREAD_SIZE / PAGE_SIZE);
-
- for (i = 0; i < THREAD_SIZE / PAGE_SIZE; i++) {
- mod_zone_page_state(page_zone(vm->pages[i]),
- NR_KERNEL_STACK_KB,
- PAGE_SIZE / 1024 * account);
- }
- } else {
- /*
- * All stack pages are in the same zone and belong to the
- * same memcg.
- */
- struct page *first_page = virt_to_page(stack);
- mod_zone_page_state(page_zone(first_page), NR_KERNEL_STACK_KB,
- THREAD_SIZE / 1024 * account);
-
- mod_memcg_obj_state(stack, MEMCG_KERNEL_STACK_KB,
- account * (THREAD_SIZE / 1024));
- }
+ /* All stack pages are in the same node. */
+ if (vm)
+ mod_lruvec_page_state(vm->pages[0], NR_KERNEL_STACK_KB,
+ account * (THREAD_SIZE / 1024));
+ else
+ mod_lruvec_slab_state(stack, NR_KERNEL_STACK_KB,
+ account * (THREAD_SIZE / 1024));
}
static int memcg_charge_kernel_stack(struct task_struct *tsk)
@@ -415,24 +394,23 @@ static int memcg_charge_kernel_stack(struct task_struct *tsk)
struct vm_struct *vm = task_stack_vm_area(tsk);
int ret;
+ BUILD_BUG_ON(IS_ENABLED(CONFIG_VMAP_STACK) && PAGE_SIZE % 1024 != 0);
+
if (vm) {
int i;
+ BUG_ON(vm->nr_pages != THREAD_SIZE / PAGE_SIZE);
+
for (i = 0; i < THREAD_SIZE / PAGE_SIZE; i++) {
/*
* If memcg_kmem_charge_page() fails, page->mem_cgroup
- * pointer is NULL, and both memcg_kmem_uncharge_page()
- * and mod_memcg_page_state() in free_thread_stack()
- * will ignore this page. So it's safe.
+ * pointer is NULL, and memcg_kmem_uncharge_page() in
+ * free_thread_stack() will ignore this page.
*/
ret = memcg_kmem_charge_page(vm->pages[i], GFP_KERNEL,
0);
if (ret)
return ret;
-
- mod_memcg_page_state(vm->pages[i],
- MEMCG_KERNEL_STACK_KB,
- PAGE_SIZE / 1024);
}
}
#endif
@@ -479,7 +457,6 @@ void free_task(struct task_struct *tsk)
#endif
rt_mutex_debug_task_free(tsk);
ftrace_graph_exit_task(tsk);
- put_seccomp_filter(tsk);
arch_release_task_struct(tsk);
if (tsk->flags & PF_KTHREAD)
free_kthread_struct(tsk);
@@ -1480,7 +1457,7 @@ static int copy_files(unsigned long clone_flags, struct task_struct *tsk)
goto out;
}
- newf = dup_fd(oldf, &error);
+ newf = dup_fd(oldf, NR_OPEN_MAX, &error);
if (!newf)
goto out;
@@ -1793,22 +1770,18 @@ static void pidfd_show_fdinfo(struct seq_file *m, struct file *f)
*/
static __poll_t pidfd_poll(struct file *file, struct poll_table_struct *pts)
{
- struct task_struct *task;
struct pid *pid = file->private_data;
__poll_t poll_flags = 0;
poll_wait(file, &pid->wait_pidfd, pts);
- rcu_read_lock();
- task = pid_task(pid, PIDTYPE_PID);
/*
* Inform pollers only when the whole thread group exits.
* If the thread group leader exits before all other threads in the
* group, then poll(2) should block, similar to the wait(2) family.
*/
- if (!task || (task->exit_state && thread_group_empty(task)))
+ if (thread_group_exited(pid))
poll_flags = EPOLLIN | EPOLLRDNORM;
- rcu_read_unlock();
return poll_flags;
}
@@ -2038,7 +2011,7 @@ static __latent_entropy struct task_struct *copy_process(
#ifdef CONFIG_CPUSETS
p->cpuset_mem_spread_rotor = NUMA_NO_NODE;
p->cpuset_slab_spread_rotor = NUMA_NO_NODE;
- seqcount_init(&p->mems_allowed_seq);
+ seqcount_spinlock_init(&p->mems_allowed_seq, &p->alloc_lock);
#endif
#ifdef CONFIG_TRACE_IRQFLAGS
memset(&p->irqtrace, 0, sizeof(p->irqtrace));
@@ -2102,8 +2075,7 @@ static __latent_entropy struct task_struct *copy_process(
retval = copy_io(clone_flags, p);
if (retval)
goto bad_fork_cleanup_namespaces;
- retval = copy_thread_tls(clone_flags, args->stack, args->stack_size, p,
- args->tls);
+ retval = copy_thread(clone_flags, args->stack, args->stack_size, p, args->tls);
if (retval)
goto bad_fork_cleanup_io;
@@ -2422,6 +2394,20 @@ long _do_fork(struct kernel_clone_args *args)
long nr;
/*
+ * For legacy clone() calls, CLONE_PIDFD uses the parent_tid argument
+ * to return the pidfd. Hence, CLONE_PIDFD and CLONE_PARENT_SETTID are
+ * mutually exclusive. With clone3() CLONE_PIDFD has grown a separate
+ * field in struct clone_args and it still doesn't make sense to have
+ * them both point at the same memory location. Performing this check
+ * here has the advantage that we don't need to have a separate helper
+ * to check for legacy clone().
+ */
+ if ((args->flags & CLONE_PIDFD) &&
+ (args->flags & CLONE_PARENT_SETTID) &&
+ (args->pidfd == args->parent_tid))
+ return -EINVAL;
+
+ /*
* Determine whether and which event to report to ptracer. When
* called from kernel_thread or CLONE_UNTRACED is explicitly
* requested, no event is reported; otherwise, report if the event
@@ -2478,42 +2464,6 @@ long _do_fork(struct kernel_clone_args *args)
return nr;
}
-bool legacy_clone_args_valid(const struct kernel_clone_args *kargs)
-{
- /* clone(CLONE_PIDFD) uses parent_tidptr to return a pidfd */
- if ((kargs->flags & CLONE_PIDFD) &&
- (kargs->flags & CLONE_PARENT_SETTID))
- return false;
-
- return true;
-}
-
-#ifndef CONFIG_HAVE_COPY_THREAD_TLS
-/* For compatibility with architectures that call do_fork directly rather than
- * using the syscall entry points below. */
-long do_fork(unsigned long clone_flags,
- unsigned long stack_start,
- unsigned long stack_size,
- int __user *parent_tidptr,
- int __user *child_tidptr)
-{
- struct kernel_clone_args args = {
- .flags = (lower_32_bits(clone_flags) & ~CSIGNAL),
- .pidfd = parent_tidptr,
- .child_tid = child_tidptr,
- .parent_tid = parent_tidptr,
- .exit_signal = (lower_32_bits(clone_flags) & CSIGNAL),
- .stack = stack_start,
- .stack_size = stack_size,
- };
-
- if (!legacy_clone_args_valid(&args))
- return -EINVAL;
-
- return _do_fork(&args);
-}
-#endif
-
/*
* Create a kernel thread.
*/
@@ -2592,24 +2542,12 @@ SYSCALL_DEFINE5(clone, unsigned long, clone_flags, unsigned long, newsp,
.tls = tls,
};
- if (!legacy_clone_args_valid(&args))
- return -EINVAL;
-
return _do_fork(&args);
}
#endif
#ifdef __ARCH_WANT_SYS_CLONE3
-/*
- * copy_thread implementations handle CLONE_SETTLS by reading the TLS value from
- * the registers containing the syscall arguments for clone. This doesn't work
- * with clone3 since the TLS value is passed in clone_args instead.
- */
-#ifndef CONFIG_HAVE_COPY_THREAD_TLS
-#error clone3 requires copy_thread_tls support in arch
-#endif
-
noinline static int copy_clone_args_from_user(struct kernel_clone_args *kargs,
struct clone_args __user *uargs,
size_t usize)
@@ -2906,14 +2844,15 @@ static int unshare_fs(unsigned long unshare_flags, struct fs_struct **new_fsp)
/*
* Unshare file descriptor table if it is being shared
*/
-static int unshare_fd(unsigned long unshare_flags, struct files_struct **new_fdp)
+int unshare_fd(unsigned long unshare_flags, unsigned int max_fds,
+ struct files_struct **new_fdp)
{
struct files_struct *fd = current->files;
int error = 0;
if ((unshare_flags & CLONE_FILES) &&
(fd && atomic_read(&fd->count) > 1)) {
- *new_fdp = dup_fd(fd, &error);
+ *new_fdp = dup_fd(fd, max_fds, &error);
if (!*new_fdp)
return error;
}
@@ -2924,7 +2863,7 @@ static int unshare_fd(unsigned long unshare_flags, struct files_struct **new_fdp
/*
* unshare allows a process to 'unshare' part of the process
* context which was originally shared using clone. copy_*
- * functions used by do_fork() cannot be used here directly
+ * functions used by _do_fork() cannot be used here directly
* because they modify an inactive task_struct that is being
* constructed. Here we are modifying the current, active,
* task_struct.
@@ -2973,7 +2912,7 @@ int ksys_unshare(unsigned long unshare_flags)
err = unshare_fs(unshare_flags, &new_fs);
if (err)
goto bad_unshare_out;
- err = unshare_fd(unshare_flags, &new_fd);
+ err = unshare_fd(unshare_flags, NR_OPEN_MAX, &new_fd);
if (err)
goto bad_unshare_cleanup_fs;
err = unshare_userns(unshare_flags, &new_cred);
@@ -3062,7 +3001,7 @@ int unshare_files(struct files_struct **displaced)
struct files_struct *copy = NULL;
int error;
- error = unshare_fd(CLONE_FILES, &copy);
+ error = unshare_fd(CLONE_FILES, NR_OPEN_MAX, &copy);
if (error || !copy) {
*displaced = NULL;
return error;
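
With the relaxed check above, CLONE_PIDFD and CLONE_PARENT_SETTID can be combined through clone3() as long as the two return slots do not alias. A minimal userspace sketch, assuming a uapi header that provides struct clone_args and a raw __NR_clone3 syscall:

#define _GNU_SOURCE
#include <linux/sched.h>        /* struct clone_args, CLONE_* */
#include <linux/types.h>
#include <signal.h>
#include <stdint.h>
#include <string.h>
#include <sys/syscall.h>
#include <unistd.h>

#ifndef __NR_clone3
# define __NR_clone3 435
#endif

static pid_t fork_with_pidfd(int *pidfd_out, pid_t *parent_tid_out)
{
        struct clone_args args;

        memset(&args, 0, sizeof(args));
        args.flags = CLONE_PIDFD | CLONE_PARENT_SETTID;
        args.pidfd = (__u64)(uintptr_t)pidfd_out;            /* distinct ... */
        args.parent_tid = (__u64)(uintptr_t)parent_tid_out;  /* ... locations */
        args.exit_signal = SIGCHLD;

        /* Returns the child's pid in the parent and 0 in the child;
         * aliasing pidfd and parent_tid would fail with EINVAL. */
        return syscall(__NR_clone3, &args, sizeof(args));
}
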
diff --git a/kernel/futex.c b/kernel/futex.c
index 4616d4ad609d..a5876694a60e 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -678,7 +678,7 @@ static int fault_in_user_writeable(u32 __user *uaddr)
int ret;
mmap_read_lock(mm);
- ret = fixup_user_fault(current, mm, (unsigned long)uaddr,
+ ret = fixup_user_fault(mm, (unsigned long)uaddr,
FAULT_FLAG_WRITE, NULL);
mmap_read_unlock(mm);
@@ -1305,7 +1305,7 @@ static int lookup_pi_state(u32 __user *uaddr, u32 uval,
static int lock_pi_update_atomic(u32 __user *uaddr, u32 uval, u32 newval)
{
int err;
- u32 uninitialized_var(curval);
+ u32 curval;
if (unlikely(should_fail_futex(true)))
return -EFAULT;
@@ -1475,7 +1475,7 @@ static void mark_wake_futex(struct wake_q_head *wake_q, struct futex_q *q)
*/
static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_pi_state *pi_state)
{
- u32 uninitialized_var(curval), newval;
+ u32 curval, newval;
struct task_struct *new_owner;
bool postunlock = false;
DEFINE_WAKE_Q(wake_q);
@@ -2325,7 +2325,7 @@ static int fixup_pi_state_owner(u32 __user *uaddr, struct futex_q *q,
struct task_struct *argowner)
{
struct futex_pi_state *pi_state = q->pi_state;
- u32 uval, uninitialized_var(curval), newval;
+ u32 uval, curval, newval;
struct task_struct *oldowner, *newowner;
u32 newtid;
int ret, err = 0;
@@ -2942,7 +2942,7 @@ uaddr_faulted:
*/
static int futex_unlock_pi(u32 __user *uaddr, unsigned int flags)
{
- u32 uninitialized_var(curval), uval, vpid = task_pid_vnr(current);
+ u32 curval, uval, vpid = task_pid_vnr(current);
union futex_key key = FUTEX_KEY_INIT;
struct futex_hash_bucket *hb;
struct futex_q *top_waiter;
@@ -3417,7 +3417,7 @@ err_unlock:
static int handle_futex_death(u32 __user *uaddr, struct task_struct *curr,
bool pi, bool pending_op)
{
- u32 uval, uninitialized_var(nval), mval;
+ u32 uval, nval, mval;
int err;
/* Futex address must be 32bit aligned */
@@ -3547,7 +3547,7 @@ static void exit_robust_list(struct task_struct *curr)
struct robust_list_head __user *head = curr->robust_list;
struct robust_list __user *entry, *next_entry, *pending;
unsigned int limit = ROBUST_LIST_LIMIT, pi, pip;
- unsigned int uninitialized_var(next_pi);
+ unsigned int next_pi;
unsigned long futex_offset;
int rc;
@@ -3744,12 +3744,12 @@ long do_futex(u32 __user *uaddr, int op, u32 val, ktime_t *timeout,
switch (cmd) {
case FUTEX_WAIT:
val3 = FUTEX_BITSET_MATCH_ANY;
- /* fall through */
+ fallthrough;
case FUTEX_WAIT_BITSET:
return futex_wait(uaddr, flags, val, timeout, val3);
case FUTEX_WAKE:
val3 = FUTEX_BITSET_MATCH_ANY;
- /* fall through */
+ fallthrough;
case FUTEX_WAKE_BITSET:
return futex_wake(uaddr, flags, val, val3);
case FUTEX_REQUEUE:
@@ -3847,7 +3847,7 @@ static void compat_exit_robust_list(struct task_struct *curr)
struct compat_robust_list_head __user *head = curr->compat_robust_list;
struct robust_list __user *entry, *next_entry, *pending;
unsigned int limit = ROBUST_LIST_LIMIT, pi, pip;
- unsigned int uninitialized_var(next_pi);
+ unsigned int next_pi;
compat_uptr_t uentry, next_uentry, upending;
compat_long_t futex_offset;
int rc;
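
The fallthrough pseudo-keyword used above is provided by the kernel's compiler attribute headers; outside the kernel the same pattern looks roughly like this (a sketch, not the kernel's exact definition):

#ifndef __has_attribute
# define __has_attribute(x) 0
#endif
#if __has_attribute(__fallthrough__)
# define fallthrough __attribute__((__fallthrough__))
#else
# define fallthrough do {} while (0)
#endif

int classify(int op, int val)
{
        switch (op) {
        case 0:
                val = -val;
                fallthrough;    /* intentional: case 1 finishes the job */
        case 1:
                return val;
        default:
                return 0;
        }
}
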
diff --git a/kernel/irq/Kconfig b/kernel/irq/Kconfig
index 20512252ecc9..10a5aff4eecc 100644
--- a/kernel/irq/Kconfig
+++ b/kernel/irq/Kconfig
@@ -51,10 +51,6 @@ config GENERIC_IRQ_INJECTION
config HARDIRQS_SW_RESEND
bool
-# Preflow handler support for fasteoi (sparc64)
-config IRQ_PREFLOW_FASTEOI
- bool
-
# Edge style eoi based handler (cell)
config IRQ_EDGE_EOI_HANDLER
bool
diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c
index 41e7e37a0928..857f5f4c8098 100644
--- a/kernel/irq/chip.c
+++ b/kernel/irq/chip.c
@@ -656,16 +656,6 @@ out_unlock:
}
EXPORT_SYMBOL_GPL(handle_level_irq);
-#ifdef CONFIG_IRQ_PREFLOW_FASTEOI
-static inline void preflow_handler(struct irq_desc *desc)
-{
- if (desc->preflow_handler)
- desc->preflow_handler(&desc->irq_data);
-}
-#else
-static inline void preflow_handler(struct irq_desc *desc) { }
-#endif
-
static void cond_unmask_eoi_irq(struct irq_desc *desc, struct irq_chip *chip)
{
if (!(desc->istate & IRQS_ONESHOT)) {
@@ -721,7 +711,6 @@ void handle_fasteoi_irq(struct irq_desc *desc)
if (desc->istate & IRQS_ONESHOT)
mask_irq(desc);
- preflow_handler(desc);
handle_irq_event(desc);
cond_unmask_eoi_irq(desc, chip);
@@ -1231,7 +1220,6 @@ void handle_fasteoi_ack_irq(struct irq_desc *desc)
/* Start handling the irq */
desc->irq_data.chip->irq_ack(&desc->irq_data);
- preflow_handler(desc);
handle_irq_event(desc);
cond_unmask_eoi_irq(desc, chip);
@@ -1281,7 +1269,6 @@ void handle_fasteoi_mask_irq(struct irq_desc *desc)
if (desc->istate & IRQS_ONESHOT)
mask_irq(desc);
- preflow_handler(desc);
handle_irq_event(desc);
cond_unmask_eoi_irq(desc, chip);
@@ -1478,6 +1465,7 @@ int irq_chip_retrigger_hierarchy(struct irq_data *data)
return 0;
}
+EXPORT_SYMBOL_GPL(irq_chip_retrigger_hierarchy);
/**
* irq_chip_set_vcpu_affinity_parent - Set vcpu affinity on the parent interrupt
@@ -1492,7 +1480,7 @@ int irq_chip_set_vcpu_affinity_parent(struct irq_data *data, void *vcpu_info)
return -ENOSYS;
}
-
+EXPORT_SYMBOL_GPL(irq_chip_set_vcpu_affinity_parent);
/**
* irq_chip_set_wake_parent - Set/reset wake-up on the parent interrupt
* @data: Pointer to interrupt specific data
diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c
index a4c2c915511d..76cd7ebd1178 100644
--- a/kernel/irq/irqdomain.c
+++ b/kernel/irq/irqdomain.c
@@ -142,7 +142,7 @@ struct irq_domain *__irq_domain_add(struct fwnode_handle *fwnode, int size,
if (!domain)
return NULL;
- if (fwnode && is_fwnode_irqchip(fwnode)) {
+ if (is_fwnode_irqchip(fwnode)) {
fwid = container_of(fwnode, struct irqchip_fwid, fwnode);
switch (fwid->type) {
@@ -281,6 +281,7 @@ void irq_domain_update_bus_token(struct irq_domain *domain,
mutex_unlock(&irq_domain_mutex);
}
+EXPORT_SYMBOL_GPL(irq_domain_update_bus_token);
/**
* irq_domain_add_simple() - Register an irq_domain and optionally map a range of irqs
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index 48c38e09c673..52ac5391dcc6 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -1308,9 +1308,6 @@ static int
setup_irq_thread(struct irqaction *new, unsigned int irq, bool secondary)
{
struct task_struct *t;
- struct sched_param param = {
- .sched_priority = MAX_USER_RT_PRIO/2,
- };
if (!secondary) {
t = kthread_create(irq_thread, new, "irq/%d-%s", irq,
@@ -1318,13 +1315,12 @@ setup_irq_thread(struct irqaction *new, unsigned int irq, bool secondary)
} else {
t = kthread_create(irq_thread, new, "irq/%d-s-%s", irq,
new->name);
- param.sched_priority -= 1;
}
if (IS_ERR(t))
return PTR_ERR(t);
- sched_setscheduler_nocheck(t, SCHED_FIFO, &param);
+ sched_set_fifo(t);
/*
* We keep the reference to the task struct even if
@@ -2735,8 +2731,10 @@ int irq_set_irqchip_state(unsigned int irq, enum irqchip_irq_state which,
do {
chip = irq_data_get_irq_chip(data);
- if (WARN_ON_ONCE(!chip))
- return -ENODEV;
+ if (WARN_ON_ONCE(!chip)) {
+ err = -ENODEV;
+ goto out_unlock;
+ }
if (chip->irq_set_irqchip_state)
break;
#ifdef CONFIG_IRQ_DOMAIN_HIERARCHY
@@ -2749,6 +2747,7 @@ int irq_set_irqchip_state(unsigned int irq, enum irqchip_irq_state which,
if (data)
err = chip->irq_set_irqchip_state(data, which, val);
+out_unlock:
irq_put_desc_busunlock(desc, flags);
return err;
}
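
sched_set_fifo() lets the scheduler core pick the actual RT priority instead of every caller hard-coding one. A hedged kernel-side sketch of the same pattern in a driver that spawns its own helper thread ("my-rt-helper" and the thread function are hypothetical):

#include <linux/err.h>
#include <linux/kthread.h>
#include <linux/sched.h>

static int my_thread_fn(void *data)
{
        while (!kthread_should_stop()) {
                set_current_state(TASK_INTERRUPTIBLE);
                schedule();
        }
        return 0;
}

static struct task_struct *start_rt_helper(void *data)
{
        struct task_struct *t = kthread_create(my_thread_fn, data, "my-rt-helper");

        if (IS_ERR(t))
                return t;
        sched_set_fifo(t);      /* core picks the priority, no sched_param needed */
        wake_up_process(t);
        return t;
}
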
diff --git a/kernel/irq/pm.c b/kernel/irq/pm.c
index 8f557fa1f4fe..c6c7e187ae74 100644
--- a/kernel/irq/pm.c
+++ b/kernel/irq/pm.c
@@ -185,14 +185,18 @@ void rearm_wake_irq(unsigned int irq)
unsigned long flags;
struct irq_desc *desc = irq_get_desc_buslock(irq, &flags, IRQ_GET_DESC_CHECK_GLOBAL);
- if (!desc || !(desc->istate & IRQS_SUSPENDED) ||
- !irqd_is_wakeup_set(&desc->irq_data))
+ if (!desc)
return;
+ if (!(desc->istate & IRQS_SUSPENDED) ||
+ !irqd_is_wakeup_set(&desc->irq_data))
+ goto unlock;
+
desc->istate &= ~IRQS_SUSPENDED;
irqd_set(&desc->irq_data, IRQD_WAKEUP_ARMED);
__enable_irq(desc);
+unlock:
irq_put_desc_busunlock(desc, flags);
}
diff --git a/kernel/irq/resend.c b/kernel/irq/resend.c
index 27634f4022d0..c48ce19a257f 100644
--- a/kernel/irq/resend.c
+++ b/kernel/irq/resend.c
@@ -45,7 +45,7 @@ static void resend_irqs(unsigned long arg)
}
/* Tasklet to handle resend: */
-static DECLARE_TASKLET(resend_tasklet, resend_irqs, 0);
+static DECLARE_TASKLET_OLD(resend_tasklet, resend_irqs);
static int irq_sw_resend(struct irq_desc *desc)
{
diff --git a/kernel/kcov.c b/kernel/kcov.c
index 6afae0bcbac4..6b8368be89c8 100644
--- a/kernel/kcov.c
+++ b/kernel/kcov.c
@@ -96,7 +96,7 @@ struct kcov_percpu_data {
int saved_sequence;
};
-DEFINE_PER_CPU(struct kcov_percpu_data, kcov_percpu_data);
+static DEFINE_PER_CPU(struct kcov_percpu_data, kcov_percpu_data);
/* Must be called with kcov_remote_lock locked. */
static struct kcov_remote *kcov_remote_find(u64 handle)
@@ -775,7 +775,7 @@ static inline bool kcov_mode_enabled(unsigned int mode)
return (mode & ~KCOV_IN_CTXSW) != KCOV_MODE_DISABLED;
}
-void kcov_remote_softirq_start(struct task_struct *t)
+static void kcov_remote_softirq_start(struct task_struct *t)
{
struct kcov_percpu_data *data = this_cpu_ptr(&kcov_percpu_data);
unsigned int mode;
@@ -792,7 +792,7 @@ void kcov_remote_softirq_start(struct task_struct *t)
}
}
-void kcov_remote_softirq_stop(struct task_struct *t)
+static void kcov_remote_softirq_stop(struct task_struct *t)
{
struct kcov_percpu_data *data = this_cpu_ptr(&kcov_percpu_data);
diff --git a/kernel/kexec_file.c b/kernel/kexec_file.c
index 09cc78df53c6..ca40bef75a61 100644
--- a/kernel/kexec_file.c
+++ b/kernel/kexec_file.c
@@ -265,7 +265,7 @@ kimage_file_prepare_segments(struct kimage *image, int kernel_fd, int initrd_fd,
goto out;
}
- ima_kexec_cmdline(image->cmdline_buf,
+ ima_kexec_cmdline(kernel_fd, image->cmdline_buf,
image->cmdline_buf_len - 1);
}
@@ -636,6 +636,19 @@ int kexec_locate_mem_hole(struct kexec_buf *kbuf)
}
/**
+ * arch_kexec_locate_mem_hole - Find free memory to place the segments.
+ * @kbuf: Parameters for the memory search.
+ *
+ * On success, kbuf->mem will have the start address of the memory region found.
+ *
+ * Return: 0 on success, negative errno on error.
+ */
+int __weak arch_kexec_locate_mem_hole(struct kexec_buf *kbuf)
+{
+ return kexec_locate_mem_hole(kbuf);
+}
+
+/**
* kexec_add_buffer - place a buffer in a kexec segment
* @kbuf: Buffer contents and memory parameters.
*
@@ -647,7 +660,6 @@ int kexec_locate_mem_hole(struct kexec_buf *kbuf)
*/
int kexec_add_buffer(struct kexec_buf *kbuf)
{
-
struct kexec_segment *ksegment;
int ret;
@@ -675,7 +687,7 @@ int kexec_add_buffer(struct kexec_buf *kbuf)
kbuf->buf_align = max(kbuf->buf_align, PAGE_SIZE);
/* Walk the RAM ranges and allocate a suitable range for the buffer */
- ret = kexec_locate_mem_hole(kbuf);
+ ret = arch_kexec_locate_mem_hole(kbuf);
if (ret)
return ret;
@@ -1157,24 +1169,26 @@ int crash_exclude_mem_range(struct crash_mem *mem,
unsigned long long mstart, unsigned long long mend)
{
int i, j;
- unsigned long long start, end;
+ unsigned long long start, end, p_start, p_end;
struct crash_mem_range temp_range = {0, 0};
for (i = 0; i < mem->nr_ranges; i++) {
start = mem->ranges[i].start;
end = mem->ranges[i].end;
+ p_start = mstart;
+ p_end = mend;
if (mstart > end || mend < start)
continue;
/* Truncate any area outside of range */
if (mstart < start)
- mstart = start;
+ p_start = start;
if (mend > end)
- mend = end;
+ p_end = end;
/* Found completely overlapping range */
- if (mstart == start && mend == end) {
+ if (p_start == start && p_end == end) {
mem->ranges[i].start = 0;
mem->ranges[i].end = 0;
if (i < mem->nr_ranges - 1) {
@@ -1185,20 +1199,29 @@ int crash_exclude_mem_range(struct crash_mem *mem,
mem->ranges[j].end =
mem->ranges[j+1].end;
}
+
+ /*
+ * Continue checking from the current position for any other
+ * overlapping ranges, since the remaining entries were
+ * shifted down above.
+ */
+ i--;
+ mem->nr_ranges--;
+ continue;
}
mem->nr_ranges--;
return 0;
}
- if (mstart > start && mend < end) {
+ if (p_start > start && p_end < end) {
/* Split original range */
- mem->ranges[i].end = mstart - 1;
- temp_range.start = mend + 1;
+ mem->ranges[i].end = p_start - 1;
+ temp_range.start = p_end + 1;
temp_range.end = end;
- } else if (mstart != start)
- mem->ranges[i].end = mstart - 1;
+ } else if (p_start != start)
+ mem->ranges[i].end = p_start - 1;
else
- mem->ranges[i].start = mend + 1;
+ mem->ranges[i].start = p_end + 1;
break;
}
@@ -1235,7 +1258,7 @@ int crash_prepare_elf64_headers(struct crash_mem *mem, int kernel_map,
unsigned long long notes_addr;
unsigned long mstart, mend;
- /* extra phdr for vmcoreinfo elf note */
+ /* extra phdr for vmcoreinfo ELF note */
nr_phdr = nr_cpus + 1;
nr_phdr += mem->nr_ranges;
@@ -1243,7 +1266,7 @@ int crash_prepare_elf64_headers(struct crash_mem *mem, int kernel_map,
* kexec-tools creates an extra PT_LOAD phdr for kernel text mapping
* area (for example, ffffffff80000000 - ffffffffa0000000 on x86_64).
* I think this is required by tools like gdb. So same physical
- * memory will be mapped in two elf headers. One will contain kernel
+ * memory will be mapped in two ELF headers. One will contain kernel
* text virtual addresses and other will have __va(physical) addresses.
*/
@@ -1270,7 +1293,7 @@ int crash_prepare_elf64_headers(struct crash_mem *mem, int kernel_map,
ehdr->e_ehsize = sizeof(Elf64_Ehdr);
ehdr->e_phentsize = sizeof(Elf64_Phdr);
- /* Prepare one phdr of type PT_NOTE for each present cpu */
+ /* Prepare one phdr of type PT_NOTE for each present CPU */
for_each_present_cpu(cpu) {
phdr->p_type = PT_NOTE;
notes_addr = per_cpu_ptr_to_phys(per_cpu_ptr(crash_notes, cpu));
@@ -1312,10 +1335,10 @@ int crash_prepare_elf64_headers(struct crash_mem *mem, int kernel_map,
phdr->p_filesz = phdr->p_memsz = mend - mstart + 1;
phdr->p_align = 0;
ehdr->e_phnum++;
- phdr++;
- pr_debug("Crash PT_LOAD elf header. phdr=%p vaddr=0x%llx, paddr=0x%llx, sz=0x%llx e_phnum=%d p_offset=0x%llx\n",
+ pr_debug("Crash PT_LOAD ELF header. phdr=%p vaddr=0x%llx, paddr=0x%llx, sz=0x%llx e_phnum=%d p_offset=0x%llx\n",
phdr, phdr->p_vaddr, phdr->p_paddr, phdr->p_filesz,
ehdr->e_phnum, phdr->p_offset);
+ phdr++;
}
*addr = buf;
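
The p_start/p_end rework above lets one exclusion request clip several overlapping entries, because the loop re-examines the current index after compacting the array. A standalone userspace sketch of the same algorithm (the split case is left out for brevity):

struct range { unsigned long long start, end; };

static int exclude_range(struct range *r, int *nr,
                         unsigned long long mstart, unsigned long long mend)
{
        int i, j;

        for (i = 0; i < *nr; i++) {
                unsigned long long start = r[i].start, end = r[i].end;
                unsigned long long p_start = mstart, p_end = mend;

                if (mstart > end || mend < start)
                        continue;
                if (p_start < start)            /* clip the request to this entry */
                        p_start = start;
                if (p_end > end)
                        p_end = end;

                if (p_start == start && p_end == end) {
                        /* Entry fully covered: drop it, shift the rest down and
                         * re-examine index i, as another entry may overlap too. */
                        for (j = i; j < *nr - 1; j++)
                                r[j] = r[j + 1];
                        (*nr)--;
                        i--;
                        continue;
                }
                if (p_start > start && p_end < end)
                        return -1;              /* split case, handled in the kernel */
                if (p_start != start)
                        r[i].end = p_start - 1; /* trim the tail */
                else
                        r[i].start = p_end + 1; /* trim the head */
                break;
        }
        return 0;
}
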
diff --git a/kernel/kmod.c b/kernel/kmod.c
index 37c3c4b97b8e..3cd075ce2a1e 100644
--- a/kernel/kmod.c
+++ b/kernel/kmod.c
@@ -36,9 +36,8 @@
*
* If you need less than 50 threads would mean we're dealing with systems
* smaller than 3200 pages. This assumes you are capable of having ~13M memory,
- * and this would only be an be an upper limit, after which the OOM killer
- * would take effect. Systems like these are very unlikely if modules are
- * enabled.
+ * and this would only be an upper limit, after which the OOM killer would take
+ * effect. Systems like these are very unlikely if modules are enabled.
*/
#define MAX_KMOD_CONCURRENT 50
static atomic_t kmod_concurrent_max = ATOMIC_INIT(MAX_KMOD_CONCURRENT);
diff --git a/kernel/kprobes.c b/kernel/kprobes.c
index e87679a48ba2..287b263c9cb9 100644
--- a/kernel/kprobes.c
+++ b/kernel/kprobes.c
@@ -1111,9 +1111,20 @@ static int disarm_kprobe_ftrace(struct kprobe *p)
ipmodify ? &kprobe_ipmodify_enabled : &kprobe_ftrace_enabled);
}
#else /* !CONFIG_KPROBES_ON_FTRACE */
-#define prepare_kprobe(p) arch_prepare_kprobe(p)
-#define arm_kprobe_ftrace(p) (-ENODEV)
-#define disarm_kprobe_ftrace(p) (-ENODEV)
+static inline int prepare_kprobe(struct kprobe *p)
+{
+ return arch_prepare_kprobe(p);
+}
+
+static inline int arm_kprobe_ftrace(struct kprobe *p)
+{
+ return -ENODEV;
+}
+
+static inline int disarm_kprobe_ftrace(struct kprobe *p)
+{
+ return -ENODEV;
+}
#endif
/* Arm a kprobe with text_mutex */
@@ -2145,6 +2156,13 @@ static void kill_kprobe(struct kprobe *p)
* the original probed function (which will be freed soon) any more.
*/
arch_remove_kprobe(p);
+
+ /*
+ * The module is going away. We should disarm the kprobe which
+ * is using ftrace.
+ */
+ if (kprobe_ftrace(p))
+ disarm_kprobe_ftrace(p);
}
/* Disable one kprobe */
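
Turning the !CONFIG_KPROBES_ON_FTRACE stubs into static inline functions keeps argument type checking and avoids the unused-argument style warnings that macro stubs can cause. A tiny sketch of the difference, outside the kernel:

#include <errno.h>

struct kprobe;                                  /* opaque here */

#define arm_stub_macro(p)       (-ENODEV)       /* old style: 'p' never checked */

static inline int arm_stub_inline(struct kprobe *p)     /* new style */
{
        return -ENODEV;                         /* 'p' is type-checked, then the
                                                 * call is optimized away anyway */
}
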
diff --git a/kernel/kthread.c b/kernel/kthread.c
index 1d9e2fdfd67a..3edaa380dc7b 100644
--- a/kernel/kthread.c
+++ b/kernel/kthread.c
@@ -480,7 +480,6 @@ EXPORT_SYMBOL(kthread_bind);
* to "name.*%u". Code fills in cpu number.
*
* Description: This helper function creates and names a kernel thread
- * The thread will be woken and put into park mode.
*/
struct task_struct *kthread_create_on_cpu(int (*threadfn)(void *data),
void *data, unsigned int cpu,
@@ -1241,13 +1240,16 @@ void kthread_use_mm(struct mm_struct *mm)
WARN_ON_ONCE(tsk->mm);
task_lock(tsk);
+ /* Hold off tlb flush IPIs while switching mm's */
+ local_irq_disable();
active_mm = tsk->active_mm;
if (active_mm != mm) {
mmgrab(mm);
tsk->active_mm = mm;
}
tsk->mm = mm;
- switch_mm(active_mm, mm, tsk);
+ switch_mm_irqs_off(active_mm, mm, tsk);
+ local_irq_enable();
task_unlock(tsk);
#ifdef finish_arch_post_lock_switch
finish_arch_post_lock_switch();
@@ -1256,8 +1258,7 @@ void kthread_use_mm(struct mm_struct *mm)
if (active_mm != mm)
mmdrop(active_mm);
- to_kthread(tsk)->oldfs = get_fs();
- set_fs(USER_DS);
+ to_kthread(tsk)->oldfs = force_uaccess_begin();
}
EXPORT_SYMBOL_GPL(kthread_use_mm);
@@ -1272,13 +1273,15 @@ void kthread_unuse_mm(struct mm_struct *mm)
WARN_ON_ONCE(!(tsk->flags & PF_KTHREAD));
WARN_ON_ONCE(!tsk->mm);
- set_fs(to_kthread(tsk)->oldfs);
+ force_uaccess_end(to_kthread(tsk)->oldfs);
task_lock(tsk);
sync_mm_rss(mm);
+ local_irq_disable();
tsk->mm = NULL;
/* active_mm is still 'mm' */
enter_lazy_tlb(mm, tsk);
+ local_irq_enable();
task_unlock(tsk);
}
EXPORT_SYMBOL_GPL(kthread_unuse_mm);
diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c
index f361d75a75ac..2fad21d345b0 100644
--- a/kernel/locking/lockdep.c
+++ b/kernel/locking/lockdep.c
@@ -1723,7 +1723,7 @@ static int noop_count(struct lock_list *entry, void *data)
static unsigned long __lockdep_count_forward_deps(struct lock_list *this)
{
unsigned long count = 0;
- struct lock_list *uninitialized_var(target_entry);
+ struct lock_list *target_entry;
__bfs_forwards(this, (void *)&count, noop_count, &target_entry);
@@ -1749,7 +1749,7 @@ unsigned long lockdep_count_forward_deps(struct lock_class *class)
static unsigned long __lockdep_count_backward_deps(struct lock_list *this)
{
unsigned long count = 0;
- struct lock_list *uninitialized_var(target_entry);
+ struct lock_list *target_entry;
__bfs_backwards(this, (void *)&count, noop_count, &target_entry);
@@ -1804,7 +1804,7 @@ check_noncircular(struct held_lock *src, struct held_lock *target,
struct lock_trace **const trace)
{
int ret;
- struct lock_list *uninitialized_var(target_entry);
+ struct lock_list *target_entry;
struct lock_list src_entry = {
.class = hlock_class(src),
.parent = NULL,
@@ -1842,7 +1842,7 @@ static noinline int
check_redundant(struct held_lock *src, struct held_lock *target)
{
int ret;
- struct lock_list *uninitialized_var(target_entry);
+ struct lock_list *target_entry;
struct lock_list src_entry = {
.class = hlock_class(src),
.parent = NULL,
@@ -2244,8 +2244,8 @@ static int check_irq_usage(struct task_struct *curr, struct held_lock *prev,
{
unsigned long usage_mask = 0, forward_mask, backward_mask;
enum lock_usage_bit forward_bit = 0, backward_bit = 0;
- struct lock_list *uninitialized_var(target_entry1);
- struct lock_list *uninitialized_var(target_entry);
+ struct lock_list *target_entry1;
+ struct lock_list *target_entry;
struct lock_list this, that;
int ret;
@@ -3438,7 +3438,7 @@ check_usage_forwards(struct task_struct *curr, struct held_lock *this,
{
int ret;
struct lock_list root;
- struct lock_list *uninitialized_var(target_entry);
+ struct lock_list *target_entry;
root.parent = NULL;
root.class = hlock_class(this);
@@ -3465,7 +3465,7 @@ check_usage_backwards(struct task_struct *curr, struct held_lock *this,
{
int ret;
struct lock_list root;
- struct lock_list *uninitialized_var(target_entry);
+ struct lock_list *target_entry;
root.parent = NULL;
root.class = hlock_class(this);
diff --git a/kernel/locking/lockdep_proc.c b/kernel/locking/lockdep_proc.c
index 5525cd3ba0c8..02ef87f50df2 100644
--- a/kernel/locking/lockdep_proc.c
+++ b/kernel/locking/lockdep_proc.c
@@ -423,7 +423,7 @@ static void seq_lock_time(struct seq_file *m, struct lock_time *lt)
seq_time(m, lt->min);
seq_time(m, lt->max);
seq_time(m, lt->total);
- seq_time(m, lt->nr ? div_s64(lt->total, lt->nr) : 0);
+ seq_time(m, lt->nr ? div64_u64(lt->total, lt->nr) : 0);
}
static void seq_stats(struct seq_file *m, struct lock_stat_data *data)
diff --git a/kernel/locking/locktorture.c b/kernel/locking/locktorture.c
index 8ff6f50e06a0..9cfa5e89cff7 100644
--- a/kernel/locking/locktorture.c
+++ b/kernel/locking/locktorture.c
@@ -436,8 +436,6 @@ static int torture_rtmutex_lock(void) __acquires(torture_rtmutex)
static void torture_rtmutex_boost(struct torture_random_state *trsp)
{
- int policy;
- struct sched_param param;
const unsigned int factor = 50000; /* yes, quite arbitrary */
if (!rt_task(current)) {
@@ -448,8 +446,7 @@ static void torture_rtmutex_boost(struct torture_random_state *trsp)
*/
if (trsp && !(torture_random(trsp) %
(cxt.nrealwriters_stress * factor))) {
- policy = SCHED_FIFO;
- param.sched_priority = MAX_RT_PRIO - 1;
+ sched_set_fifo(current);
} else /* common case, do nothing */
return;
} else {
@@ -462,13 +459,10 @@ static void torture_rtmutex_boost(struct torture_random_state *trsp)
*/
if (!trsp || !(torture_random(trsp) %
(cxt.nrealwriters_stress * factor * 2))) {
- policy = SCHED_NORMAL;
- param.sched_priority = 0;
+ sched_set_normal(current, 0);
} else /* common case, do nothing */
return;
}
-
- sched_setscheduler_nocheck(current, policy, &param);
}
static void torture_rtmutex_delay(struct torture_random_state *trsp)
diff --git a/kernel/locking/qspinlock.c b/kernel/locking/qspinlock.c
index b9515fcc9b29..cbff6ba53d56 100644
--- a/kernel/locking/qspinlock.c
+++ b/kernel/locking/qspinlock.c
@@ -581,4 +581,11 @@ EXPORT_SYMBOL(queued_spin_lock_slowpath);
#include "qspinlock_paravirt.h"
#include "qspinlock.c"
+bool nopvspin __initdata;
+static __init int parse_nopvspin(char *arg)
+{
+ nopvspin = true;
+ return 0;
+}
+early_param("nopvspin", parse_nopvspin);
#endif
diff --git a/kernel/module.c b/kernel/module.c
index aa183c9ac0a2..1c5cff34d9f2 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -422,7 +422,7 @@ static bool each_symbol_in_section(const struct symsearch *arr,
}
/* Returns true as soon as fn returns true, otherwise false. */
-bool each_symbol_section(bool (*fn)(const struct symsearch *arr,
+static bool each_symbol_section(bool (*fn)(const struct symsearch *arr,
struct module *owner,
void *data),
void *data)
@@ -484,7 +484,6 @@ bool each_symbol_section(bool (*fn)(const struct symsearch *arr,
}
return false;
}
-EXPORT_SYMBOL_GPL(each_symbol_section);
struct find_symbol_arg {
/* Input */
@@ -496,6 +495,7 @@ struct find_symbol_arg {
struct module *owner;
const s32 *crc;
const struct kernel_symbol *sym;
+ enum mod_license license;
};
static bool check_exported_symbol(const struct symsearch *syms,
@@ -505,9 +505,9 @@ static bool check_exported_symbol(const struct symsearch *syms,
struct find_symbol_arg *fsa = data;
if (!fsa->gplok) {
- if (syms->licence == GPL_ONLY)
+ if (syms->license == GPL_ONLY)
return false;
- if (syms->licence == WILL_BE_GPL_ONLY && fsa->warn) {
+ if (syms->license == WILL_BE_GPL_ONLY && fsa->warn) {
pr_warn("Symbol %s is being used by a non-GPL module, "
"which will not be allowed in the future\n",
fsa->name);
@@ -529,6 +529,7 @@ static bool check_exported_symbol(const struct symsearch *syms,
fsa->owner = owner;
fsa->crc = symversion(syms->crcs, symnum);
fsa->sym = &syms->start[symnum];
+ fsa->license = syms->license;
return true;
}
@@ -585,9 +586,10 @@ static bool find_exported_symbol_in_section(const struct symsearch *syms,
/* Find an exported symbol and return it, along with, (optional) crc and
* (optional) module which owns it. Needs preempt disabled or module_mutex. */
-const struct kernel_symbol *find_symbol(const char *name,
+static const struct kernel_symbol *find_symbol(const char *name,
struct module **owner,
const s32 **crc,
+ enum mod_license *license,
bool gplok,
bool warn)
{
@@ -602,13 +604,14 @@ const struct kernel_symbol *find_symbol(const char *name,
*owner = fsa.owner;
if (crc)
*crc = fsa.crc;
+ if (license)
+ *license = fsa.license;
return fsa.sym;
}
pr_debug("Failed to find symbol %s\n", name);
return NULL;
}
-EXPORT_SYMBOL_GPL(find_symbol);
/*
* Search for module by name: must hold module_mutex (or preempt disabled
@@ -869,7 +872,7 @@ static int add_module_usage(struct module *a, struct module *b)
}
/* Module a uses b: caller needs module_mutex() */
-int ref_module(struct module *a, struct module *b)
+static int ref_module(struct module *a, struct module *b)
{
int err;
@@ -888,7 +891,6 @@ int ref_module(struct module *a, struct module *b)
}
return 0;
}
-EXPORT_SYMBOL_GPL(ref_module);
/* Clear the unload stuff of the module. */
static void module_unload_free(struct module *mod)
@@ -1077,7 +1079,7 @@ void __symbol_put(const char *symbol)
struct module *owner;
preempt_disable();
- if (!find_symbol(symbol, &owner, NULL, true, false))
+ if (!find_symbol(symbol, &owner, NULL, NULL, true, false))
BUG();
module_put(owner);
preempt_enable();
@@ -1169,11 +1171,10 @@ static inline void module_unload_free(struct module *mod)
{
}
-int ref_module(struct module *a, struct module *b)
+static int ref_module(struct module *a, struct module *b)
{
return strong_try_module_get(b);
}
-EXPORT_SYMBOL_GPL(ref_module);
static inline int module_unload_init(struct module *mod)
{
@@ -1356,7 +1357,7 @@ static inline int check_modstruct_version(const struct load_info *info,
* locking is necessary -- use preempt_disable() to placate lockdep.
*/
preempt_disable();
- if (!find_symbol("module_layout", NULL, &crc, true, false)) {
+ if (!find_symbol("module_layout", NULL, &crc, NULL, true, false)) {
preempt_enable();
BUG();
}
@@ -1430,6 +1431,24 @@ static int verify_namespace_is_imported(const struct load_info *info,
return 0;
}
+static bool inherit_taint(struct module *mod, struct module *owner)
+{
+ if (!owner || !test_bit(TAINT_PROPRIETARY_MODULE, &owner->taints))
+ return true;
+
+ if (mod->using_gplonly_symbols) {
+ pr_err("%s: module using GPL-only symbols uses symbols from proprietary module %s.\n",
+ mod->name, owner->name);
+ return false;
+ }
+
+ if (!test_bit(TAINT_PROPRIETARY_MODULE, &mod->taints)) {
+ pr_warn("%s: module uses symbols from proprietary module %s, inheriting taint.\n",
+ mod->name, owner->name);
+ set_bit(TAINT_PROPRIETARY_MODULE, &mod->taints);
+ }
+ return true;
+}
/* Resolve a symbol for this module. I.e. if we find one, record usage. */
static const struct kernel_symbol *resolve_symbol(struct module *mod,
@@ -1440,6 +1459,7 @@ static const struct kernel_symbol *resolve_symbol(struct module *mod,
struct module *owner;
const struct kernel_symbol *sym;
const s32 *crc;
+ enum mod_license license;
int err;
/*
@@ -1449,11 +1469,19 @@ static const struct kernel_symbol *resolve_symbol(struct module *mod,
*/
sched_annotate_sleep();
mutex_lock(&module_mutex);
- sym = find_symbol(name, &owner, &crc,
+ sym = find_symbol(name, &owner, &crc, &license,
!(mod->taints & (1 << TAINT_PROPRIETARY_MODULE)), true);
if (!sym)
goto unlock;
+ if (license == GPL_ONLY)
+ mod->using_gplonly_symbols = true;
+
+ if (!inherit_taint(mod, owner)) {
+ sym = NULL;
+ goto getname;
+ }
+
if (!check_version(info, name, mod, crc)) {
sym = ERR_PTR(-EINVAL);
goto getname;
@@ -1520,18 +1548,34 @@ struct module_sect_attrs {
struct module_sect_attr attrs[];
};
+#define MODULE_SECT_READ_SIZE (3 /* "0x", "\n" */ + (BITS_PER_LONG / 4))
static ssize_t module_sect_read(struct file *file, struct kobject *kobj,
struct bin_attribute *battr,
char *buf, loff_t pos, size_t count)
{
struct module_sect_attr *sattr =
container_of(battr, struct module_sect_attr, battr);
+ char bounce[MODULE_SECT_READ_SIZE + 1];
+ size_t wrote;
if (pos != 0)
return -EINVAL;
- return sprintf(buf, "0x%px\n",
- kallsyms_show_value(file->f_cred) ? (void *)sattr->address : NULL);
+ /*
+ * Since we're a binary read handler, we must account for the
+ * trailing NUL byte that sprintf will write: if "buf" is
+ * too small to hold the NUL, or the NUL is exactly the last
+ * byte, the read will look like it got truncated by one byte.
+ * Since there is no way to ask sprintf nicely to not write
+ * the NUL, we have to use a bounce buffer.
+ */
+ wrote = scnprintf(bounce, sizeof(bounce), "0x%px\n",
+ kallsyms_show_value(file->f_cred)
+ ? (void *)sattr->address : NULL);
+ count = min(count, wrote);
+ memcpy(buf, bounce, count);
+
+ return count;
}
static void free_sect_attrs(struct module_sect_attrs *sect_attrs)
@@ -1580,7 +1624,7 @@ static void add_sect_attrs(struct module *mod, const struct load_info *info)
goto out;
sect_attrs->nsections++;
sattr->battr.read = module_sect_read;
- sattr->battr.size = 3 /* "0x", "\n" */ + (BITS_PER_LONG / 4);
+ sattr->battr.size = MODULE_SECT_READ_SIZE;
sattr->battr.attr.mode = 0400;
*(gattr++) = &(sattr++)->battr;
}
@@ -2220,7 +2264,7 @@ void *__symbol_get(const char *symbol)
const struct kernel_symbol *sym;
preempt_disable();
- sym = find_symbol(symbol, &owner, NULL, true, true);
+ sym = find_symbol(symbol, &owner, NULL, NULL, true, true);
if (sym && strong_try_module_get(owner))
sym = NULL;
preempt_enable();
@@ -2256,7 +2300,7 @@ static int verify_exported_symbols(struct module *mod)
for (i = 0; i < ARRAY_SIZE(arr); i++) {
for (s = arr[i].sym; s < arr[i].sym + arr[i].num; s++) {
if (find_symbol(kernel_symbol_name(s), &owner, NULL,
- true, false)) {
+ NULL, true, false)) {
pr_err("%s: exports duplicate symbol %s"
" (owned by %s)\n",
mod->name, kernel_symbol_name(s),
@@ -3237,7 +3281,7 @@ static int find_module_sections(struct module *mod, struct load_info *info)
if (section_addr(info, "__obsparm"))
pr_warn("%s: Ignoring obsolete parameters\n", mod->name);
- info->debug = section_objs(info, "__verbose",
+ info->debug = section_objs(info, "__dyndbg",
sizeof(*info->debug), &info->num_debug);
return 0;
@@ -4473,7 +4517,6 @@ struct module *__module_address(unsigned long addr)
}
return mod;
}
-EXPORT_SYMBOL_GPL(__module_address);
/*
* is_module_text_address - is this address inside module code?
@@ -4512,7 +4555,6 @@ struct module *__module_text_address(unsigned long addr)
}
return mod;
}
-EXPORT_SYMBOL_GPL(__module_text_address);
/* Don't grab lock, we're oopsing. */
void print_modules(void)
diff --git a/kernel/nsproxy.c b/kernel/nsproxy.c
index cd356630a311..12dd41b39a7f 100644
--- a/kernel/nsproxy.c
+++ b/kernel/nsproxy.c
@@ -262,8 +262,8 @@ void exit_task_namespaces(struct task_struct *p)
static int check_setns_flags(unsigned long flags)
{
if (!flags || (flags & ~(CLONE_NEWNS | CLONE_NEWUTS | CLONE_NEWIPC |
- CLONE_NEWNET | CLONE_NEWUSER | CLONE_NEWPID |
- CLONE_NEWCGROUP)))
+ CLONE_NEWNET | CLONE_NEWTIME | CLONE_NEWUSER |
+ CLONE_NEWPID | CLONE_NEWCGROUP)))
return -EINVAL;
#ifndef CONFIG_USER_NS
@@ -290,6 +290,10 @@ static int check_setns_flags(unsigned long flags)
if (flags & CLONE_NEWNET)
return -EINVAL;
#endif
+#ifndef CONFIG_TIME_NS
+ if (flags & CLONE_NEWTIME)
+ return -EINVAL;
+#endif
return 0;
}
@@ -464,6 +468,14 @@ static int validate_nsset(struct nsset *nsset, struct pid *pid)
}
#endif
+#ifdef CONFIG_TIME_NS
+ if (flags & CLONE_NEWTIME) {
+ ret = validate_ns(nsset, &nsp->time_ns->ns);
+ if (ret)
+ goto out;
+ }
+#endif
+
out:
if (pid_ns)
put_pid_ns(pid_ns);
@@ -507,6 +519,11 @@ static void commit_nsset(struct nsset *nsset)
exit_sem(me);
#endif
+#ifdef CONFIG_TIME_NS
+ if (flags & CLONE_NEWTIME)
+ timens_commit(me, nsset->nsproxy->time_ns);
+#endif
+
/* transfer ownership */
switch_task_namespaces(me, nsset->nsproxy);
nsset->nsproxy = NULL;
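
With CLONE_NEWTIME accepted by check_setns_flags(), a process can now join another task's time namespace via setns(2). A hedged userspace sketch, assuming a target pid and CAP_SYS_ADMIN over the namespace's owning user namespace:

#define _GNU_SOURCE
#include <fcntl.h>
#include <sched.h>
#include <stdio.h>
#include <unistd.h>

#ifndef CLONE_NEWTIME
# define CLONE_NEWTIME 0x00000080       /* from linux/sched.h */
#endif

static int join_time_ns(pid_t pid)
{
        char path[64];
        int fd, ret;

        snprintf(path, sizeof(path), "/proc/%d/ns/time", (int)pid);
        fd = open(path, O_RDONLY);
        if (fd < 0)
                return -1;
        ret = setns(fd, CLONE_NEWTIME); /* was unconditionally EINVAL before */
        close(fd);
        return ret;
}
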
diff --git a/kernel/panic.c b/kernel/panic.c
index e2157ca387c8..aef8872ba843 100644
--- a/kernel/panic.c
+++ b/kernel/panic.c
@@ -505,7 +505,7 @@ static void do_oops_enter_exit(void)
* Return true if the calling CPU is allowed to print oops-related info.
* This is a bit racy..
*/
-int oops_may_print(void)
+bool oops_may_print(void)
{
return pause_on_oops_flag == 0;
}
@@ -551,7 +551,7 @@ static int init_oops_id(void)
}
late_initcall(init_oops_id);
-void print_oops_end_marker(void)
+static void print_oops_end_marker(void)
{
init_oops_id();
pr_warn("---[ end trace %016llx ]---\n", (unsigned long long)oops_id);
diff --git a/kernel/pid.c b/kernel/pid.c
index f1496b757162..b2562a7ce525 100644
--- a/kernel/pid.c
+++ b/kernel/pid.c
@@ -42,6 +42,7 @@
#include <linux/sched/signal.h>
#include <linux/sched/task.h>
#include <linux/idr.h>
+#include <net/sock.h>
struct pid init_struct_pid = {
.count = REFCOUNT_INIT(1),
@@ -198,7 +199,7 @@ struct pid *alloc_pid(struct pid_namespace *ns, pid_t *set_tid,
if (tid != 1 && !tmp->child_reaper)
goto out_free;
retval = -EPERM;
- if (!ns_capable(tmp->user_ns, CAP_SYS_ADMIN))
+ if (!checkpoint_restore_ns_capable(tmp->user_ns))
goto out_free;
set_tid_size--;
}
@@ -635,17 +636,8 @@ static int pidfd_getfd(struct pid *pid, int fd)
if (IS_ERR(file))
return PTR_ERR(file);
- ret = security_file_receive(file);
- if (ret) {
- fput(file);
- return ret;
- }
-
- ret = get_unused_fd_flags(O_CLOEXEC);
- if (ret < 0)
- fput(file);
- else
- fd_install(ret, file);
+ ret = receive_fd(file, O_CLOEXEC);
+ fput(file);
return ret;
}
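
receive_fd() keeps the security_file_receive() check and the O_CLOEXEC installation that the open-coded version performed, so the user-visible behaviour of pidfd_getfd(2) is unchanged. A minimal userspace sketch of the syscall (the fallback syscall number is an assumption for older headers):

#define _GNU_SOURCE
#include <sys/syscall.h>
#include <unistd.h>

#ifndef __NR_pidfd_getfd
# define __NR_pidfd_getfd 438
#endif

/* Duplicate 'targetfd' from the task behind 'pidfd' into this process;
 * the new descriptor is installed with O_CLOEXEC, as before. */
static int steal_fd(int pidfd, int targetfd)
{
        return syscall(__NR_pidfd_getfd, pidfd, targetfd, 0);
}
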
diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c
index 0e5ac162c3a8..ac135bd600eb 100644
--- a/kernel/pid_namespace.c
+++ b/kernel/pid_namespace.c
@@ -269,7 +269,7 @@ static int pid_ns_ctl_handler(struct ctl_table *table, int write,
struct ctl_table tmp = *table;
int ret, next;
- if (write && !ns_capable(pid_ns->user_ns, CAP_SYS_ADMIN))
+ if (write && !checkpoint_restore_ns_capable(pid_ns->user_ns))
return -EPERM;
/*
diff --git a/kernel/power/energy_model.c b/kernel/power/energy_model.c
index 0a9326f5f421..c1ff7fa030ab 100644
--- a/kernel/power/energy_model.c
+++ b/kernel/power/energy_model.c
@@ -1,9 +1,10 @@
// SPDX-License-Identifier: GPL-2.0
/*
- * Energy Model of CPUs
+ * Energy Model of devices
*
- * Copyright (c) 2018, Arm ltd.
+ * Copyright (c) 2018-2020, Arm ltd.
* Written by: Quentin Perret, Arm ltd.
+ * Improvements provided by: Lukasz Luba, Arm ltd.
*/
#define pr_fmt(fmt) "energy_model: " fmt
@@ -15,30 +16,32 @@
#include <linux/sched/topology.h>
#include <linux/slab.h>
-/* Mapping of each CPU to the performance domain to which it belongs. */
-static DEFINE_PER_CPU(struct em_perf_domain *, em_data);
-
/*
* Mutex serializing the registrations of performance domains and letting
* callbacks defined by drivers sleep.
*/
static DEFINE_MUTEX(em_pd_mutex);
+static bool _is_cpu_device(struct device *dev)
+{
+ return (dev->bus == &cpu_subsys);
+}
+
#ifdef CONFIG_DEBUG_FS
static struct dentry *rootdir;
-static void em_debug_create_cs(struct em_cap_state *cs, struct dentry *pd)
+static void em_debug_create_ps(struct em_perf_state *ps, struct dentry *pd)
{
struct dentry *d;
char name[24];
- snprintf(name, sizeof(name), "cs:%lu", cs->frequency);
+ snprintf(name, sizeof(name), "ps:%lu", ps->frequency);
- /* Create per-cs directory */
+ /* Create per-ps directory */
d = debugfs_create_dir(name, pd);
- debugfs_create_ulong("frequency", 0444, d, &cs->frequency);
- debugfs_create_ulong("power", 0444, d, &cs->power);
- debugfs_create_ulong("cost", 0444, d, &cs->cost);
+ debugfs_create_ulong("frequency", 0444, d, &ps->frequency);
+ debugfs_create_ulong("power", 0444, d, &ps->power);
+ debugfs_create_ulong("cost", 0444, d, &ps->cost);
}
static int em_debug_cpus_show(struct seq_file *s, void *unused)
@@ -49,22 +52,30 @@ static int em_debug_cpus_show(struct seq_file *s, void *unused)
}
DEFINE_SHOW_ATTRIBUTE(em_debug_cpus);
-static void em_debug_create_pd(struct em_perf_domain *pd, int cpu)
+static void em_debug_create_pd(struct device *dev)
{
struct dentry *d;
- char name[8];
int i;
- snprintf(name, sizeof(name), "pd%d", cpu);
-
/* Create the directory of the performance domain */
- d = debugfs_create_dir(name, rootdir);
+ d = debugfs_create_dir(dev_name(dev), rootdir);
- debugfs_create_file("cpus", 0444, d, pd->cpus, &em_debug_cpus_fops);
+ if (_is_cpu_device(dev))
+ debugfs_create_file("cpus", 0444, d, dev->em_pd->cpus,
+ &em_debug_cpus_fops);
+
+ /* Create a sub-directory for each performance state */
+ for (i = 0; i < dev->em_pd->nr_perf_states; i++)
+ em_debug_create_ps(&dev->em_pd->table[i], d);
- /* Create a sub-directory for each capacity state */
- for (i = 0; i < pd->nr_cap_states; i++)
- em_debug_create_cs(&pd->table[i], d);
+}
+
+static void em_debug_remove_pd(struct device *dev)
+{
+ struct dentry *debug_dir;
+
+ debug_dir = debugfs_lookup(dev_name(dev), rootdir);
+ debugfs_remove_recursive(debug_dir);
}
static int __init em_debug_init(void)
@@ -76,58 +87,55 @@ static int __init em_debug_init(void)
}
core_initcall(em_debug_init);
#else /* CONFIG_DEBUG_FS */
-static void em_debug_create_pd(struct em_perf_domain *pd, int cpu) {}
+static void em_debug_create_pd(struct device *dev) {}
+static void em_debug_remove_pd(struct device *dev) {}
#endif
-static struct em_perf_domain *em_create_pd(cpumask_t *span, int nr_states,
- struct em_data_callback *cb)
+
+static int em_create_perf_table(struct device *dev, struct em_perf_domain *pd,
+ int nr_states, struct em_data_callback *cb)
{
unsigned long opp_eff, prev_opp_eff = ULONG_MAX;
unsigned long power, freq, prev_freq = 0;
- int i, ret, cpu = cpumask_first(span);
- struct em_cap_state *table;
- struct em_perf_domain *pd;
+ struct em_perf_state *table;
+ int i, ret;
u64 fmax;
- if (!cb->active_power)
- return NULL;
-
- pd = kzalloc(sizeof(*pd) + cpumask_size(), GFP_KERNEL);
- if (!pd)
- return NULL;
-
table = kcalloc(nr_states, sizeof(*table), GFP_KERNEL);
if (!table)
- goto free_pd;
+ return -ENOMEM;
- /* Build the list of capacity states for this performance domain */
+ /* Build the list of performance states for this performance domain */
for (i = 0, freq = 0; i < nr_states; i++, freq++) {
/*
* active_power() is a driver callback which ceils 'freq' to
- * lowest capacity state of 'cpu' above 'freq' and updates
+ * lowest performance state of 'dev' above 'freq' and updates
* 'power' and 'freq' accordingly.
*/
- ret = cb->active_power(&power, &freq, cpu);
+ ret = cb->active_power(&power, &freq, dev);
if (ret) {
- pr_err("pd%d: invalid cap. state: %d\n", cpu, ret);
- goto free_cs_table;
+ dev_err(dev, "EM: invalid perf. state: %d\n",
+ ret);
+ goto free_ps_table;
}
/*
* We expect the driver callback to increase the frequency for
- * higher capacity states.
+ * higher performance states.
*/
if (freq <= prev_freq) {
- pr_err("pd%d: non-increasing freq: %lu\n", cpu, freq);
- goto free_cs_table;
+ dev_err(dev, "EM: non-increasing freq: %lu\n",
+ freq);
+ goto free_ps_table;
}
/*
* The power returned by active_state() is expected to be
* positive, in milli-watts and to fit into 16 bits.
*/
- if (!power || power > EM_CPU_MAX_POWER) {
- pr_err("pd%d: invalid power: %lu\n", cpu, power);
- goto free_cs_table;
+ if (!power || power > EM_MAX_POWER) {
+ dev_err(dev, "EM: invalid power: %lu\n",
+ power);
+ goto free_ps_table;
}
table[i].power = power;
@@ -141,12 +149,12 @@ static struct em_perf_domain *em_create_pd(cpumask_t *span, int nr_states,
*/
opp_eff = freq / power;
if (opp_eff >= prev_opp_eff)
- pr_warn("pd%d: hertz/watts ratio non-monotonically decreasing: em_cap_state %d >= em_cap_state%d\n",
- cpu, i, i - 1);
+ dev_dbg(dev, "EM: hertz/watts ratio non-monotonically decreasing: em_perf_state %d >= em_perf_state%d\n",
+ i, i - 1);
prev_opp_eff = opp_eff;
}
- /* Compute the cost of each capacity_state. */
+ /* Compute the cost of each performance state. */
fmax = (u64) table[nr_states - 1].frequency;
for (i = 0; i < nr_states; i++) {
table[i].cost = div64_u64(fmax * table[i].power,
@@ -154,39 +162,94 @@ static struct em_perf_domain *em_create_pd(cpumask_t *span, int nr_states,
}
pd->table = table;
- pd->nr_cap_states = nr_states;
- cpumask_copy(to_cpumask(pd->cpus), span);
-
- em_debug_create_pd(pd, cpu);
+ pd->nr_perf_states = nr_states;
- return pd;
+ return 0;
-free_cs_table:
+free_ps_table:
kfree(table);
-free_pd:
- kfree(pd);
+ return -EINVAL;
+}
+
+static int em_create_pd(struct device *dev, int nr_states,
+ struct em_data_callback *cb, cpumask_t *cpus)
+{
+ struct em_perf_domain *pd;
+ struct device *cpu_dev;
+ int cpu, ret;
+
+ if (_is_cpu_device(dev)) {
+ pd = kzalloc(sizeof(*pd) + cpumask_size(), GFP_KERNEL);
+ if (!pd)
+ return -ENOMEM;
+
+ cpumask_copy(em_span_cpus(pd), cpus);
+ } else {
+ pd = kzalloc(sizeof(*pd), GFP_KERNEL);
+ if (!pd)
+ return -ENOMEM;
+ }
+
+ ret = em_create_perf_table(dev, pd, nr_states, cb);
+ if (ret) {
+ kfree(pd);
+ return ret;
+ }
+
+ if (_is_cpu_device(dev))
+ for_each_cpu(cpu, cpus) {
+ cpu_dev = get_cpu_device(cpu);
+ cpu_dev->em_pd = pd;
+ }
+
+ dev->em_pd = pd;
+
+ return 0;
+}
+
+/**
+ * em_pd_get() - Return the performance domain for a device
+ * @dev : Device to find the performance domain for
+ *
+ * Returns the performance domain to which @dev belongs, or NULL if it doesn't
+ * exist.
+ */
+struct em_perf_domain *em_pd_get(struct device *dev)
+{
+ if (IS_ERR_OR_NULL(dev))
+ return NULL;
- return NULL;
+ return dev->em_pd;
}
+EXPORT_SYMBOL_GPL(em_pd_get);
/**
* em_cpu_get() - Return the performance domain for a CPU
* @cpu : CPU to find the performance domain for
*
- * Return: the performance domain to which 'cpu' belongs, or NULL if it doesn't
+ * Returns the performance domain to which @cpu belongs, or NULL if it doesn't
* exist.
*/
struct em_perf_domain *em_cpu_get(int cpu)
{
- return READ_ONCE(per_cpu(em_data, cpu));
+ struct device *cpu_dev;
+
+ cpu_dev = get_cpu_device(cpu);
+ if (!cpu_dev)
+ return NULL;
+
+ return em_pd_get(cpu_dev);
}
EXPORT_SYMBOL_GPL(em_cpu_get);
/**
- * em_register_perf_domain() - Register the Energy Model of a performance domain
- * @span : Mask of CPUs in the performance domain
- * @nr_states : Number of capacity states to register
+ * em_dev_register_perf_domain() - Register the Energy Model (EM) for a device
+ * @dev : Device for which the EM is to be registered
+ * @nr_states : Number of performance states to register
* @cb : Callback functions providing the data of the Energy Model
+ * @cpus : Pointer to cpumask_t, which in case of a CPU device is
+ * obligatory. It can be taken from e.g. 'policy->cpus'. For other
+ * types of devices it should be set to NULL.
*
* Create Energy Model tables for a performance domain using the callbacks
* defined in cb.
@@ -196,14 +259,13 @@ EXPORT_SYMBOL_GPL(em_cpu_get);
*
* Return 0 on success
*/
-int em_register_perf_domain(cpumask_t *span, unsigned int nr_states,
- struct em_data_callback *cb)
+int em_dev_register_perf_domain(struct device *dev, unsigned int nr_states,
+ struct em_data_callback *cb, cpumask_t *cpus)
{
unsigned long cap, prev_cap = 0;
- struct em_perf_domain *pd;
- int cpu, ret = 0;
+ int cpu, ret;
- if (!span || !nr_states || !cb)
+ if (!dev || !nr_states || !cb)
return -EINVAL;
/*
@@ -212,47 +274,79 @@ int em_register_perf_domain(cpumask_t *span, unsigned int nr_states,
*/
mutex_lock(&em_pd_mutex);
- for_each_cpu(cpu, span) {
- /* Make sure we don't register again an existing domain. */
- if (READ_ONCE(per_cpu(em_data, cpu))) {
- ret = -EEXIST;
- goto unlock;
- }
+ if (dev->em_pd) {
+ ret = -EEXIST;
+ goto unlock;
+ }
- /*
- * All CPUs of a domain must have the same micro-architecture
- * since they all share the same table.
- */
- cap = arch_scale_cpu_capacity(cpu);
- if (prev_cap && prev_cap != cap) {
- pr_err("CPUs of %*pbl must have the same capacity\n",
- cpumask_pr_args(span));
+ if (_is_cpu_device(dev)) {
+ if (!cpus) {
+ dev_err(dev, "EM: invalid CPU mask\n");
ret = -EINVAL;
goto unlock;
}
- prev_cap = cap;
+
+ for_each_cpu(cpu, cpus) {
+ if (em_cpu_get(cpu)) {
+ dev_err(dev, "EM: exists for CPU%d\n", cpu);
+ ret = -EEXIST;
+ goto unlock;
+ }
+ /*
+ * All CPUs of a domain must have the same
+ * micro-architecture since they all share the same
+ * table.
+ */
+ cap = arch_scale_cpu_capacity(cpu);
+ if (prev_cap && prev_cap != cap) {
+ dev_err(dev, "EM: CPUs of %*pbl must have the same capacity\n",
+ cpumask_pr_args(cpus));
+
+ ret = -EINVAL;
+ goto unlock;
+ }
+ prev_cap = cap;
+ }
}
- /* Create the performance domain and add it to the Energy Model. */
- pd = em_create_pd(span, nr_states, cb);
- if (!pd) {
- ret = -EINVAL;
+ ret = em_create_pd(dev, nr_states, cb, cpus);
+ if (ret)
goto unlock;
- }
- for_each_cpu(cpu, span) {
- /*
- * The per-cpu array can be read concurrently from em_cpu_get().
- * The barrier enforces the ordering needed to make sure readers
- * can only access well formed em_perf_domain structs.
- */
- smp_store_release(per_cpu_ptr(&em_data, cpu), pd);
- }
+ em_debug_create_pd(dev);
+ dev_info(dev, "EM: created perf domain\n");
- pr_debug("Created perf domain %*pbl\n", cpumask_pr_args(span));
unlock:
mutex_unlock(&em_pd_mutex);
-
return ret;
}
-EXPORT_SYMBOL_GPL(em_register_perf_domain);
+EXPORT_SYMBOL_GPL(em_dev_register_perf_domain);
+
+/**
+ * em_dev_unregister_perf_domain() - Unregister Energy Model (EM) for a device
+ * @dev : Device for which the EM is registered
+ *
+ * Unregister the EM for the specified @dev (but not a CPU device).
+ */
+void em_dev_unregister_perf_domain(struct device *dev)
+{
+ if (IS_ERR_OR_NULL(dev) || !dev->em_pd)
+ return;
+
+ if (_is_cpu_device(dev))
+ return;
+
+ /*
+ * The mutex serializes all register/unregister requests and protects
+ * against potential clean-up/setup issues in the debugfs directories.
+ * The debugfs directory name is the same as device's name.
+ */
+ mutex_lock(&em_pd_mutex);
+ em_debug_remove_pd(dev);
+
+ kfree(dev->em_pd->table);
+ kfree(dev->em_pd);
+ dev->em_pd = NULL;
+ mutex_unlock(&em_pd_mutex);
+}
+EXPORT_SYMBOL_GPL(em_dev_unregister_perf_domain);
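
A hedged sketch of how a hypothetical non-CPU device driver might use the reworked API. Only symbols visible in this patch (em_dev_register_perf_domain(), em_dev_unregister_perf_domain(), struct em_data_callback, EM_MAX_POWER) are relied on; the frequency/power tables are made-up values:

#include <linux/device.h>
#include <linux/energy_model.h>
#include <linux/kernel.h>

static const unsigned long my_freq[]  = { 100000, 200000, 300000, 400000 }; /* assumed kHz */
static const unsigned long my_power[] = { 40, 80, 150, 280 };               /* assumed mW */

static int my_active_power(unsigned long *power, unsigned long *freq,
                           struct device *dev)
{
        int i;

        /* Ceil *freq to the next supported state and report its power
         * (must be non-zero and fit within EM_MAX_POWER). */
        for (i = 0; i < ARRAY_SIZE(my_freq); i++) {
                if (my_freq[i] >= *freq) {
                        *freq = my_freq[i];
                        *power = my_power[i];
                        return 0;
                }
        }
        return -EINVAL;
}

static struct em_data_callback my_em_cb = {
        .active_power = my_active_power,
};

static int my_probe(struct device *dev)
{
        /* Non-CPU device: the cpumask argument is NULL. */
        return em_dev_register_perf_domain(dev, ARRAY_SIZE(my_freq),
                                           &my_em_cb, NULL);
}

static void my_remove(struct device *dev)
{
        em_dev_unregister_perf_domain(dev);
}
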
diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c
index 02ec716a4927..f33769f97aca 100644
--- a/kernel/power/hibernate.c
+++ b/kernel/power/hibernate.c
@@ -795,6 +795,103 @@ int hibernate(void)
return error;
}
+/**
+ * hibernate_quiet_exec - Execute a function with all devices frozen.
+ * @func: Function to execute.
+ * @data: Data pointer to pass to @func.
+ *
+ * Return the @func return value or an error code if it cannot be executed.
+ */
+int hibernate_quiet_exec(int (*func)(void *data), void *data)
+{
+ int error, nr_calls = 0;
+
+ lock_system_sleep();
+
+ if (!hibernate_acquire()) {
+ error = -EBUSY;
+ goto unlock;
+ }
+
+ pm_prepare_console();
+
+ error = __pm_notifier_call_chain(PM_HIBERNATION_PREPARE, -1, &nr_calls);
+ if (error) {
+ nr_calls--;
+ goto exit;
+ }
+
+ error = freeze_processes();
+ if (error)
+ goto exit;
+
+ lock_device_hotplug();
+
+ pm_suspend_clear_flags();
+
+ error = platform_begin(true);
+ if (error)
+ goto thaw;
+
+ error = freeze_kernel_threads();
+ if (error)
+ goto thaw;
+
+ error = dpm_prepare(PMSG_FREEZE);
+ if (error)
+ goto dpm_complete;
+
+ suspend_console();
+
+ error = dpm_suspend(PMSG_FREEZE);
+ if (error)
+ goto dpm_resume;
+
+ error = dpm_suspend_end(PMSG_FREEZE);
+ if (error)
+ goto dpm_resume;
+
+ error = platform_pre_snapshot(true);
+ if (error)
+ goto skip;
+
+ error = func(data);
+
+skip:
+ platform_finish(true);
+
+ dpm_resume_start(PMSG_THAW);
+
+dpm_resume:
+ dpm_resume(PMSG_THAW);
+
+ resume_console();
+
+dpm_complete:
+ dpm_complete(PMSG_THAW);
+
+ thaw_kernel_threads();
+
+thaw:
+ platform_end(true);
+
+ unlock_device_hotplug();
+
+ thaw_processes();
+
+exit:
+ __pm_notifier_call_chain(PM_POST_HIBERNATION, nr_calls, NULL);
+
+ pm_restore_console();
+
+ hibernate_release();
+
+unlock:
+ unlock_system_sleep();
+
+ return error;
+}
+EXPORT_SYMBOL_GPL(hibernate_quiet_exec);
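Editor's sketch (not part of this diff): a minimal caller of the new helper. The callback is purely illustrative; it runs between the freeze and thaw sequence shown above.

	static int example_quiet_work(void *data)
	{
		/* Runs with user space frozen and devices suspended. */
		return 0;
	}

	static int example_run_quietly(void)
	{
		return hibernate_quiet_exec(example_quiet_work, NULL);
	}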
/**
* software_resume - Resume from a saved hibernation image.
@@ -1062,7 +1159,7 @@ power_attr(disk);
static ssize_t resume_show(struct kobject *kobj, struct kobj_attribute *attr,
char *buf)
{
- return sprintf(buf,"%d:%d\n", MAJOR(swsusp_resume_device),
+ return sprintf(buf, "%d:%d\n", MAJOR(swsusp_resume_device),
MINOR(swsusp_resume_device));
}
@@ -1162,7 +1259,7 @@ static ssize_t reserved_size_store(struct kobject *kobj,
power_attr(reserved_size);
-static struct attribute * g[] = {
+static struct attribute *g[] = {
&disk_attr.attr,
&resume_offset_attr.attr,
&resume_attr.attr,
@@ -1190,7 +1287,7 @@ static int __init resume_setup(char *str)
if (noresume)
return 1;
- strncpy( resume_file, str, 255 );
+ strncpy(resume_file, str, 255);
return 1;
}
diff --git a/kernel/power/power.h b/kernel/power/power.h
index ba2094db6294..32fc89ac96c3 100644
--- a/kernel/power/power.h
+++ b/kernel/power/power.h
@@ -32,7 +32,7 @@ static inline int init_header_complete(struct swsusp_info *info)
return arch_hibernation_header_save(info, MAX_ARCH_HEADER_SIZE);
}
-static inline char *check_image_kernel(struct swsusp_info *info)
+static inline const char *check_image_kernel(struct swsusp_info *info)
{
return arch_hibernation_header_restore(info) ?
"architecture specific data" : NULL;
diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c
index 881128b9351e..d25749bce7cf 100644
--- a/kernel/power/snapshot.c
+++ b/kernel/power/snapshot.c
@@ -1663,7 +1663,7 @@ static unsigned long minimum_image_size(unsigned long saveable)
{
unsigned long size;
- size = global_node_page_state(NR_SLAB_RECLAIMABLE)
+ size = global_node_page_state_pages(NR_SLAB_RECLAIMABLE_B)
+ global_node_page_state(NR_ACTIVE_ANON)
+ global_node_page_state(NR_INACTIVE_ANON)
+ global_node_page_state(NR_ACTIVE_FILE)
@@ -2023,7 +2023,7 @@ static int init_header_complete(struct swsusp_info *info)
return 0;
}
-static char *check_image_kernel(struct swsusp_info *info)
+static const char *check_image_kernel(struct swsusp_info *info)
{
if (info->version_code != LINUX_VERSION_CODE)
return "kernel version";
@@ -2176,7 +2176,7 @@ static void mark_unsafe_pages(struct memory_bitmap *bm)
static int check_header(struct swsusp_info *info)
{
- char *reason;
+ const char *reason;
reason = check_image_kernel(info);
if (!reason && info->num_physpages != get_num_physpages())
diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c
index b71eaf5f5a86..9b75f6bfc333 100644
--- a/kernel/printk/printk.c
+++ b/kernel/printk/printk.c
@@ -943,6 +943,14 @@ out:
return ret;
}
+/*
+ * Be careful when modifying this function!!!
+ *
+ * Only a few operations are supported because the device works only with
+ * entire variable-length messages (records). Non-standard values are
+ * returned in the other cases, and it has been this way for quite some time.
+ * User space applications might depend on this behavior.
+ */
static loff_t devkmsg_llseek(struct file *file, loff_t offset, int whence)
{
struct devkmsg_user *user = file->private_data;
@@ -2658,7 +2666,7 @@ early_param("keep_bootcon", keep_bootcon_setup);
static int try_enable_new_console(struct console *newcon, bool user_specified)
{
struct console_cmdline *c;
- int i;
+ int i, err;
for (i = 0, c = console_cmdline;
i < MAX_CMDLINECONSOLES && c->name[0];
@@ -2681,8 +2689,8 @@ static int try_enable_new_console(struct console *newcon, bool user_specified)
return 0;
if (newcon->setup &&
- newcon->setup(newcon, c->options) != 0)
- return -EIO;
+ (err = newcon->setup(newcon, c->options)) != 0)
+ return err;
}
newcon->flags |= CON_ENABLED;
if (i == preferred_console) {
@@ -2695,7 +2703,7 @@ static int try_enable_new_console(struct console *newcon, bool user_specified)
/*
* Some consoles, such as pstore and netconsole, can be enabled even
* without matching. Accept the pre-enabled consoles only when match()
- * and setup() had a change to be called.
+ * and setup() had a chance to be called.
*/
if (newcon->flags & CON_ENABLED && c->user_specified == user_specified)
return 0;
diff --git a/kernel/rcu/rcuperf.c b/kernel/rcu/rcuperf.c
index ec903d781778..21448d3374e2 100644
--- a/kernel/rcu/rcuperf.c
+++ b/kernel/rcu/rcuperf.c
@@ -361,7 +361,6 @@ rcu_perf_writer(void *arg)
int i_max;
long me = (long)arg;
struct rcu_head *rhp = NULL;
- struct sched_param sp;
bool started = false, done = false, alldone = false;
u64 t;
u64 *wdp;
@@ -370,8 +369,7 @@ rcu_perf_writer(void *arg)
VERBOSE_PERFOUT_STRING("rcu_perf_writer task started");
WARN_ON(!wdpp);
set_cpus_allowed_ptr(current, cpumask_of(me % nr_cpu_ids));
- sp.sched_priority = 1;
- sched_setscheduler_nocheck(current, SCHED_FIFO, &sp);
+ sched_set_fifo_low(current);
if (holdoff)
schedule_timeout_uninterruptible(holdoff * HZ);
@@ -427,9 +425,7 @@ retry:
started = true;
if (!done && i >= MIN_MEAS) {
done = true;
- sp.sched_priority = 0;
- sched_setscheduler_nocheck(current,
- SCHED_NORMAL, &sp);
+ sched_set_normal(current, 0);
pr_alert("%s%s rcu_perf_writer %ld has %d measurements\n",
perf_type, PERF_FLAG, me, MIN_MEAS);
if (atomic_inc_return(&n_rcu_perf_writer_finished) >=
diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c
index d0d265304d14..f453bf8d2f1e 100644
--- a/kernel/rcu/rcutorture.c
+++ b/kernel/rcu/rcutorture.c
@@ -895,16 +895,11 @@ static int rcu_torture_boost(void *arg)
unsigned long endtime;
unsigned long oldstarttime;
struct rcu_boost_inflight rbi = { .inflight = 0 };
- struct sched_param sp;
VERBOSE_TOROUT_STRING("rcu_torture_boost started");
/* Set real-time priority. */
- sp.sched_priority = 1;
- if (sched_setscheduler(current, SCHED_FIFO, &sp) < 0) {
- VERBOSE_TOROUT_STRING("rcu_torture_boost RT prio failed!");
- n_rcu_torture_boost_rterror++;
- }
+ sched_set_fifo_low(current);
init_rcu_head_on_stack(&rbi.rcu);
/* Each pass through the following loop does one boost-test cycle. */
diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index ac7198ed3197..8ce77d9ac716 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -59,6 +59,7 @@
#include <linux/sched/clock.h>
#include <linux/vmalloc.h>
#include <linux/mm.h>
+#include <linux/kasan.h>
#include "../time/tick-internal.h"
#include "tree.h"
@@ -2890,6 +2891,7 @@ __call_rcu(struct rcu_head *head, rcu_callback_t func)
head->func = func;
head->next = NULL;
local_irq_save(flags);
+ kasan_record_aux_stack(head);
rdp = this_cpu_ptr(&rcu_data);
/* Add the callback to our list. */
diff --git a/kernel/regset.c b/kernel/regset.c
new file mode 100644
index 000000000000..586823786f39
--- /dev/null
+++ b/kernel/regset.c
@@ -0,0 +1,76 @@
+// SPDX-License-Identifier: GPL-2.0-only
+#include <linux/export.h>
+#include <linux/slab.h>
+#include <linux/regset.h>
+
+static int __regset_get(struct task_struct *target,
+ const struct user_regset *regset,
+ unsigned int size,
+ void **data)
+{
+ void *p = *data, *to_free = NULL;
+ int res;
+
+ if (!regset->regset_get)
+ return -EOPNOTSUPP;
+ if (size > regset->n * regset->size)
+ size = regset->n * regset->size;
+ if (!p) {
+ to_free = p = kzalloc(size, GFP_KERNEL);
+ if (!p)
+ return -ENOMEM;
+ }
+ res = regset->regset_get(target, regset,
+ (struct membuf){.p = p, .left = size});
+ if (res < 0) {
+ kfree(to_free);
+ return res;
+ }
+ *data = p;
+ return size - res;
+}
+
+int regset_get(struct task_struct *target,
+ const struct user_regset *regset,
+ unsigned int size,
+ void *data)
+{
+ return __regset_get(target, regset, size, &data);
+}
+EXPORT_SYMBOL(regset_get);
+
+int regset_get_alloc(struct task_struct *target,
+ const struct user_regset *regset,
+ unsigned int size,
+ void **data)
+{
+ *data = NULL;
+ return __regset_get(target, regset, size, data);
+}
+EXPORT_SYMBOL(regset_get_alloc);
+
+/**
+ * copy_regset_to_user - fetch a thread's user_regset data into user memory
+ * @target: thread to be examined
+ * @view: &struct user_regset_view describing user thread machine state
+ * @setno: index in @view->regsets
+ * @offset: offset into the regset data, in bytes
+ * @size: amount of data to copy, in bytes
+ * @data: user-mode pointer to copy into
+ */
+int copy_regset_to_user(struct task_struct *target,
+ const struct user_regset_view *view,
+ unsigned int setno,
+ unsigned int offset, unsigned int size,
+ void __user *data)
+{
+ const struct user_regset *regset = &view->regsets[setno];
+ void *buf;
+ int ret;
+
+ ret = regset_get_alloc(target, regset, size, &buf);
+ if (ret > 0)
+ ret = copy_to_user(data, buf, ret) ? -EFAULT : 0;
+ kfree(buf);
+ return ret;
+}
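Editor's sketch (not part of this diff): the shape of an architecture-side ->regset_get() callback that feeds the struct membuf consumed above. It assumes the membuf_write()/membuf_zero() helpers introduced alongside this file; the register layout and function name are illustrative.

	static int example_gpr_get(struct task_struct *target,
				   const struct user_regset *regset,
				   struct membuf to)
	{
		struct pt_regs *regs = task_pt_regs(target);

		/* Copy the live register block, then zero whatever space is left. */
		membuf_write(&to, regs, sizeof(*regs));
		return membuf_zero(&to, to.left);
	}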
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 4a0e7b449b88..8471a0f7eb32 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -5496,6 +5496,8 @@ static int _sched_setscheduler(struct task_struct *p, int policy,
* @policy: new policy.
* @param: structure containing the new RT priority.
*
+ * Use sched_set_fifo(), read its comment.
+ *
* Return: 0 on success. An error code otherwise.
*
* NOTE that the task may be already dead.
@@ -5505,13 +5507,11 @@ int sched_setscheduler(struct task_struct *p, int policy,
{
return _sched_setscheduler(p, policy, param, true);
}
-EXPORT_SYMBOL_GPL(sched_setscheduler);
int sched_setattr(struct task_struct *p, const struct sched_attr *attr)
{
return __sched_setscheduler(p, attr, true, true);
}
-EXPORT_SYMBOL_GPL(sched_setattr);
int sched_setattr_nocheck(struct task_struct *p, const struct sched_attr *attr)
{
@@ -5536,7 +5536,51 @@ int sched_setscheduler_nocheck(struct task_struct *p, int policy,
{
return _sched_setscheduler(p, policy, param, false);
}
-EXPORT_SYMBOL_GPL(sched_setscheduler_nocheck);
+
+/*
+ * SCHED_FIFO is a broken scheduler model; that is, it is fundamentally
+ * incapable of resource management, which is the one thing an OS really should
+ * be doing.
+ *
+ * This is of course the reason it is limited to privileged users only.
+ *
+ * Worse still, it is fundamentally impossible to compose static priority
+ * workloads. You cannot take two correctly working static prio workloads
+ * and smash them together and still expect them to work.
+ *
+ * For this reason 'all' FIFO tasks the kernel creates are basically at:
+ *
+ * MAX_RT_PRIO / 2
+ *
+ * The administrator _MUST_ configure the system; the kernel simply doesn't
+ * have enough information to make a sensible choice.
+ */
+void sched_set_fifo(struct task_struct *p)
+{
+ struct sched_param sp = { .sched_priority = MAX_RT_PRIO / 2 };
+ WARN_ON_ONCE(sched_setscheduler_nocheck(p, SCHED_FIFO, &sp) != 0);
+}
+EXPORT_SYMBOL_GPL(sched_set_fifo);
+
+/*
+ * For when you don't much care about FIFO, but want to be above SCHED_NORMAL.
+ */
+void sched_set_fifo_low(struct task_struct *p)
+{
+ struct sched_param sp = { .sched_priority = 1 };
+ WARN_ON_ONCE(sched_setscheduler_nocheck(p, SCHED_FIFO, &sp) != 0);
+}
+EXPORT_SYMBOL_GPL(sched_set_fifo_low);
+
+void sched_set_normal(struct task_struct *p, int nice)
+{
+ struct sched_attr attr = {
+ .sched_policy = SCHED_NORMAL,
+ .sched_nice = nice,
+ };
+ WARN_ON_ONCE(sched_setattr_nocheck(p, &attr) != 0);
+}
+EXPORT_SYMBOL_GPL(sched_set_normal);
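Editor's sketch (not part of this diff): the conversion pattern that the rcuperf, rcutorture and psi hunks elsewhere in this series follow, applied to a hypothetical kernel thread; example_rt_kthread is not a real function.

	static int example_rt_kthread(void *data)
	{
		/* Was: sched_setscheduler_nocheck(current, SCHED_FIFO, &sp). */
		sched_set_fifo_low(current);

		while (!kthread_should_stop())
			schedule_timeout_interruptible(HZ);

		/* Drop back to SCHED_NORMAL, nice 0, before exiting. */
		sched_set_normal(current, 0);
		return 0;
	}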
static int
do_sched_setscheduler(pid_t pid, int policy, struct sched_param __user *param)
@@ -6387,10 +6431,10 @@ void sched_show_task(struct task_struct *p)
if (!try_get_task_stack(p))
return;
- printk(KERN_INFO "%-15.15s %c", p->comm, task_state_to_char(p));
+ pr_info("task:%-15.15s state:%c", p->comm, task_state_to_char(p));
if (p->state == TASK_RUNNING)
- printk(KERN_CONT " running task ");
+ pr_cont(" running task ");
#ifdef CONFIG_DEBUG_STACK_USAGE
free = stack_not_used(p);
#endif
@@ -6399,8 +6443,8 @@ void sched_show_task(struct task_struct *p)
if (pid_alive(p))
ppid = task_pid_nr(rcu_dereference(p->real_parent));
rcu_read_unlock();
- printk(KERN_CONT "%5lu %5d %6d 0x%08lx\n", free,
- task_pid_nr(p), ppid,
+ pr_cont(" stack:%5lu pid:%5d ppid:%6d flags:0x%08lx\n",
+ free, task_pid_nr(p), ppid,
(unsigned long)task_thread_info(p)->flags);
print_worker_info(KERN_INFO, p);
@@ -6435,13 +6479,6 @@ void show_state_filter(unsigned long state_filter)
{
struct task_struct *g, *p;
-#if BITS_PER_LONG == 32
- printk(KERN_INFO
- " task PC stack pid father\n");
-#else
- printk(KERN_INFO
- " task PC stack pid father\n");
-#endif
rcu_read_lock();
for_each_process_thread(g, p) {
/*
diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c
index dc6835bc6490..e39008242cf4 100644
--- a/kernel/sched/cpufreq_schedutil.c
+++ b/kernel/sched/cpufreq_schedutil.c
@@ -909,11 +909,7 @@ struct cpufreq_governor *cpufreq_default_governor(void)
}
#endif
-static int __init sugov_register(void)
-{
- return cpufreq_register_governor(&schedutil_gov);
-}
-core_initcall(sugov_register);
+cpufreq_governor_init(schedutil_gov);
#ifdef CONFIG_ENERGY_MODEL
extern bool sched_energy_update;
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 2ba8f230feb9..1a68a0536add 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -6509,7 +6509,7 @@ compute_energy(struct task_struct *p, int dst_cpu, struct perf_domain *pd)
max_util = max(max_util, cpu_util);
}
- return em_pd_energy(pd->em_pd, max_util, sum_util);
+ return em_cpu_energy(pd->em_pd, max_util, sum_util);
}
/*
diff --git a/kernel/sched/psi.c b/kernel/sched/psi.c
index e53b711bd643..967732c0766c 100644
--- a/kernel/sched/psi.c
+++ b/kernel/sched/psi.c
@@ -616,11 +616,8 @@ out:
static int psi_poll_worker(void *data)
{
struct psi_group *group = (struct psi_group *)data;
- struct sched_param param = {
- .sched_priority = 1,
- };
- sched_setscheduler_nocheck(current, SCHED_FIFO, &param);
+ sched_set_fifo_low(current);
while (true) {
wait_event_interruptible(group->poll_wait,
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 3fd283892761..28709f6b0975 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -1999,7 +1999,7 @@ static inline void sub_nr_running(struct rq *rq, unsigned count)
{
rq->nr_running -= count;
if (trace_sched_update_nr_running_tp_enabled()) {
- call_trace_sched_update_nr_running(rq, count);
+ call_trace_sched_update_nr_running(rq, -count);
}
/* Check if we still need preemption */
diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c
index 9079d865a935..007b0a6b0152 100644
--- a/kernel/sched/topology.c
+++ b/kernel/sched/topology.c
@@ -272,10 +272,10 @@ static void perf_domain_debug(const struct cpumask *cpu_map,
printk(KERN_DEBUG "root_domain %*pbl:", cpumask_pr_args(cpu_map));
while (pd) {
- printk(KERN_CONT " pd%d:{ cpus=%*pbl nr_cstate=%d }",
+ printk(KERN_CONT " pd%d:{ cpus=%*pbl nr_pstate=%d }",
cpumask_first(perf_domain_span(pd)),
cpumask_pr_args(perf_domain_span(pd)),
- em_pd_nr_cap_states(pd->em_pd));
+ em_pd_nr_perf_states(pd->em_pd));
pd = pd->next;
}
@@ -313,26 +313,26 @@ static void sched_energy_set(bool has_eas)
*
* The complexity of the Energy Model is defined as:
*
- * C = nr_pd * (nr_cpus + nr_cs)
+ * C = nr_pd * (nr_cpus + nr_ps)
*
* with parameters defined as:
* - nr_pd: the number of performance domains
* - nr_cpus: the number of CPUs
- * - nr_cs: the sum of the number of capacity states of all performance
+ * - nr_ps: the sum of the number of performance states of all performance
* domains (for example, on a system with 2 performance domains,
- * with 10 capacity states each, nr_cs = 2 * 10 = 20).
+ * with 10 performance states each, nr_ps = 2 * 10 = 20).
*
* It is generally not a good idea to use such a model in the wake-up path on
* very complex platforms because of the associated scheduling overheads. The
* arbitrary constraint below prevents that. It makes EAS usable up to 16 CPUs
- * with per-CPU DVFS and less than 8 capacity states each, for example.
+ * with per-CPU DVFS and less than 8 performance states each, for example.
*/
#define EM_MAX_COMPLEXITY 2048
extern struct cpufreq_governor schedutil_gov;
static bool build_perf_domains(const struct cpumask *cpu_map)
{
- int i, nr_pd = 0, nr_cs = 0, nr_cpus = cpumask_weight(cpu_map);
+ int i, nr_pd = 0, nr_ps = 0, nr_cpus = cpumask_weight(cpu_map);
struct perf_domain *pd = NULL, *tmp;
int cpu = cpumask_first(cpu_map);
struct root_domain *rd = cpu_rq(cpu)->rd;
@@ -384,15 +384,15 @@ static bool build_perf_domains(const struct cpumask *cpu_map)
pd = tmp;
/*
- * Count performance domains and capacity states for the
+ * Count performance domains and performance states for the
* complexity check.
*/
nr_pd++;
- nr_cs += em_pd_nr_cap_states(pd->em_pd);
+ nr_ps += em_pd_nr_perf_states(pd->em_pd);
}
/* Bail out if the Energy Model complexity is too high. */
- if (nr_pd * (nr_cs + nr_cpus) > EM_MAX_COMPLEXITY) {
+ if (nr_pd * (nr_ps + nr_cpus) > EM_MAX_COMPLEXITY) {
WARN(1, "rd %*pbl: Failed to start EAS, EM complexity is too high\n",
cpumask_pr_args(cpu_map));
goto free;
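Editor's note (illustrative arithmetic, not part of this diff), making the renamed complexity bound concrete for a hypothetical platform with per-CPU DVFS on 16 CPUs (nr_pd = 16, nr_cpus = 16):

	8 performance states per domain: nr_ps = 16 * 8 = 128, C = 16 * (128 + 16) = 2304 > 2048, so EAS is rejected
	7 performance states per domain: nr_ps = 16 * 7 = 112, C = 16 * (112 + 16) = 2048, within EM_MAX_COMPLEXITY

which matches the "less than 8 performance states each" wording in the comment above.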
diff --git a/kernel/scs.c b/kernel/scs.c
index 5d4d9bbdec36..4ff4a7ba0094 100644
--- a/kernel/scs.c
+++ b/kernel/scs.c
@@ -17,7 +17,7 @@ static void __scs_account(void *s, int account)
{
struct page *scs_page = virt_to_page(s);
- mod_zone_page_state(page_zone(scs_page), NR_KERNEL_SCS_KB,
+ mod_node_page_state(page_pgdat(scs_page), NR_KERNEL_SCS_KB,
account * (SCS_SIZE / SZ_1K));
}
diff --git a/kernel/seccomp.c b/kernel/seccomp.c
index d653d8426de9..3ee59ce0a323 100644
--- a/kernel/seccomp.c
+++ b/kernel/seccomp.c
@@ -13,6 +13,7 @@
* Mode 2 allows user-defined system call filters in the form
* of Berkeley Packet Filters/Linux Socket Filters.
*/
+#define pr_fmt(fmt) "seccomp: " fmt
#include <linux/refcount.h>
#include <linux/audit.h>
@@ -41,6 +42,15 @@
#include <linux/tracehook.h>
#include <linux/uaccess.h>
#include <linux/anon_inodes.h>
+#include <linux/lockdep.h>
+
+/*
+ * When SECCOMP_IOCTL_NOTIF_ID_VALID was first introduced, it had the
+ * wrong direction flag in the ioctl number. This is the broken one,
+ * which the kernel needs to keep supporting until all userspaces stop
+ * using the wrong command number.
+ */
+#define SECCOMP_IOCTL_NOTIF_ID_VALID_WRONG_DIR SECCOMP_IOR(2, __u64)
enum notify_state {
SECCOMP_NOTIFY_INIT,
@@ -77,10 +87,42 @@ struct seccomp_knotif {
long val;
u32 flags;
- /* Signals when this has entered SECCOMP_NOTIFY_REPLIED */
+ /*
+ * Signals when this has changed states, such as the listener
+ * dying, a new seccomp addfd message, or changing to REPLIED
+ */
struct completion ready;
struct list_head list;
+
+ /* outstanding addfd requests */
+ struct list_head addfd;
+};
+
+/**
+ * struct seccomp_kaddfd - container for seccomp_addfd ioctl messages
+ *
+ * @file: A reference to the file to install in the other task
+ * @fd: The fd number to install it at. If the fd number is -1, it means the
+ * installing process should allocate the fd as normal.
+ * @flags: The flags for the new file descriptor. At the moment, only O_CLOEXEC
+ * is allowed.
+ * @ret: The return value of the installing process. It is set to the fd num
+ * upon success (>= 0).
+ * @completion: Indicates that the installing process has completed fd
+ * installation, or gone away (either due to successful
+ * reply, or signal)
+ *
+ */
+struct seccomp_kaddfd {
+ struct file *file;
+ int fd;
+ unsigned int flags;
+
+ /* To only be set on reply */
+ int ret;
+ struct completion completion;
+ struct list_head list;
};
/**
@@ -94,27 +136,35 @@ struct seccomp_knotif {
* filter->notify_lock.
* @next_id: The id of the next request.
* @notifications: A list of struct seccomp_knotif elements.
- * @wqh: A wait queue for poll.
*/
struct notification {
struct semaphore request;
u64 next_id;
struct list_head notifications;
- wait_queue_head_t wqh;
};
/**
* struct seccomp_filter - container for seccomp BPF programs
*
- * @usage: reference count to manage the object lifetime.
- * get/put helpers should be used when accessing an instance
- * outside of a lifetime-guarded section. In general, this
- * is only needed for handling filters shared across tasks.
+ * @refs: Reference count to manage the object lifetime.
+ * A filter's reference count is incremented for each directly
+ * attached task, once for the dependent filter, and if
+ * requested for the user notifier. When @refs reaches zero,
+ * the filter can be freed.
+ * @users: A filter's @users count is incremented for each directly
+ * attached task (filter installation, fork(), thread_sync),
+ * and once for the dependent filter (tracked in filter->prev).
+ * When it reaches zero it indicates that no direct or indirect
+ * users of that filter exist. No new tasks can get associated with
+ * this filter after reaching 0. The @users count is always smaller
+ * than or equal to @refs. Hence, reaching 0 for @users does not mean
+ * the filter can be freed.
* @log: true if all actions except for SECCOMP_RET_ALLOW should be logged
* @prev: points to a previously installed, or inherited, filter
* @prog: the BPF program to evaluate
* @notif: the struct that holds all notification related information
* @notify_lock: A lock for all notification-related accesses.
+ * @wqh: A wait queue for poll if a notifier is in use.
*
* seccomp_filter objects are organized in a tree linked via the @prev
* pointer. For any task, it appears to be a singly-linked list starting
@@ -124,15 +174,17 @@ struct notification {
* how namespaces work.
*
* seccomp_filter objects should never be modified after being attached
- * to a task_struct (other than @usage).
+ * to a task_struct (other than @refs).
*/
struct seccomp_filter {
- refcount_t usage;
+ refcount_t refs;
+ refcount_t users;
bool log;
struct seccomp_filter *prev;
struct bpf_prog *prog;
struct notification *notif;
struct mutex notify_lock;
+ wait_queue_head_t wqh;
};
/* Limit any path through the tree to 256KB worth of instructions. */
@@ -366,6 +418,59 @@ static inline pid_t seccomp_can_sync_threads(void)
return 0;
}
+static inline void seccomp_filter_free(struct seccomp_filter *filter)
+{
+ if (filter) {
+ bpf_prog_destroy(filter->prog);
+ kfree(filter);
+ }
+}
+
+static void __seccomp_filter_orphan(struct seccomp_filter *orig)
+{
+ while (orig && refcount_dec_and_test(&orig->users)) {
+ if (waitqueue_active(&orig->wqh))
+ wake_up_poll(&orig->wqh, EPOLLHUP);
+ orig = orig->prev;
+ }
+}
+
+static void __put_seccomp_filter(struct seccomp_filter *orig)
+{
+ /* Clean up single-reference branches iteratively. */
+ while (orig && refcount_dec_and_test(&orig->refs)) {
+ struct seccomp_filter *freeme = orig;
+ orig = orig->prev;
+ seccomp_filter_free(freeme);
+ }
+}
+
+static void __seccomp_filter_release(struct seccomp_filter *orig)
+{
+ /* Notify about any unused filters in the task's former filter tree. */
+ __seccomp_filter_orphan(orig);
+ /* Finally drop all references to the task's former tree. */
+ __put_seccomp_filter(orig);
+}
+
+/**
+ * seccomp_filter_release - Detach the task from its filter tree,
+ * drop its reference count, and notify
+ * about unused filters
+ *
+ * This function should only be called when the task is exiting, as
+ * it detaches the task from its filter tree. As such, the READ_ONCE()
+ * and barriers that would normally be needed are not required here.
+ */
+void seccomp_filter_release(struct task_struct *tsk)
+{
+ struct seccomp_filter *orig = tsk->seccomp.filter;
+
+ /* Detach task from its filter tree. */
+ tsk->seccomp.filter = NULL;
+ __seccomp_filter_release(orig);
+}
+
/**
* seccomp_sync_threads: sets all threads to use current's filter
*
@@ -390,14 +495,19 @@ static inline void seccomp_sync_threads(unsigned long flags)
/* Get a task reference for the new leaf node. */
get_seccomp_filter(caller);
+
/*
* Drop the task reference to the shared ancestor since
* current's path will hold a reference. (This also
* allows a put before the assignment.)
*/
- put_seccomp_filter(thread);
+ __seccomp_filter_release(thread->seccomp.filter);
+
+ /* Make our new filter tree visible. */
smp_store_release(&thread->seccomp.filter,
caller->seccomp.filter);
+ atomic_set(&thread->seccomp.filter_count,
+ atomic_read(&thread->seccomp.filter_count));
/*
* Don't let an unprivileged task work around
@@ -461,7 +571,9 @@ static struct seccomp_filter *seccomp_prepare_filter(struct sock_fprog *fprog)
return ERR_PTR(ret);
}
- refcount_set(&sfilter->usage, 1);
+ refcount_set(&sfilter->refs, 1);
+ refcount_set(&sfilter->users, 1);
+ init_waitqueue_head(&sfilter->wqh);
return sfilter;
}
@@ -544,6 +656,7 @@ static long seccomp_attach_filter(unsigned int flags,
*/
filter->prev = current->seccomp.filter;
current->seccomp.filter = filter;
+ atomic_inc(&current->seccomp.filter_count);
/* Now that the new filter is in place, synchronize to all threads. */
if (flags & SECCOMP_FILTER_FLAG_TSYNC)
@@ -554,7 +667,7 @@ static long seccomp_attach_filter(unsigned int flags,
static void __get_seccomp_filter(struct seccomp_filter *filter)
{
- refcount_inc(&filter->usage);
+ refcount_inc(&filter->refs);
}
/* get_seccomp_filter - increments the reference count of the filter on @tsk */
@@ -564,30 +677,7 @@ void get_seccomp_filter(struct task_struct *tsk)
if (!orig)
return;
__get_seccomp_filter(orig);
-}
-
-static inline void seccomp_filter_free(struct seccomp_filter *filter)
-{
- if (filter) {
- bpf_prog_destroy(filter->prog);
- kfree(filter);
- }
-}
-
-static void __put_seccomp_filter(struct seccomp_filter *orig)
-{
- /* Clean up single-reference branches iteratively. */
- while (orig && refcount_dec_and_test(&orig->usage)) {
- struct seccomp_filter *freeme = orig;
- orig = orig->prev;
- seccomp_filter_free(freeme);
- }
-}
-
-/* put_seccomp_filter - decrements the ref count of tsk->seccomp.filter */
-void put_seccomp_filter(struct task_struct *tsk)
-{
- __put_seccomp_filter(tsk->seccomp.filter);
+ refcount_inc(&orig->users);
}
static void seccomp_init_siginfo(kernel_siginfo_t *info, int syscall, int reason)
@@ -684,20 +774,20 @@ static inline void seccomp_log(unsigned long syscall, long signr, u32 action,
*/
static const int mode1_syscalls[] = {
__NR_seccomp_read, __NR_seccomp_write, __NR_seccomp_exit, __NR_seccomp_sigreturn,
- 0, /* null terminated */
+ -1, /* negative terminated */
};
static void __secure_computing_strict(int this_syscall)
{
- const int *syscall_whitelist = mode1_syscalls;
+ const int *allowed_syscalls = mode1_syscalls;
#ifdef CONFIG_COMPAT
if (in_compat_syscall())
- syscall_whitelist = get_compat_mode1_syscalls();
+ allowed_syscalls = get_compat_mode1_syscalls();
#endif
do {
- if (*syscall_whitelist == this_syscall)
+ if (*allowed_syscalls == this_syscall)
return;
- } while (*++syscall_whitelist);
+ } while (*++allowed_syscalls != -1);
#ifdef SECCOMP_DEBUG
dump_stack();
@@ -735,6 +825,17 @@ static u64 seccomp_next_notify_id(struct seccomp_filter *filter)
return filter->notif->next_id++;
}
+static void seccomp_handle_addfd(struct seccomp_kaddfd *addfd)
+{
+ /*
+ * Remove the notification, and reset the list pointers, indicating
+ * that it has been handled.
+ */
+ list_del_init(&addfd->list);
+ addfd->ret = receive_fd_replace(addfd->fd, addfd->file, addfd->flags);
+ complete(&addfd->completion);
+}
+
static int seccomp_do_user_notification(int this_syscall,
struct seccomp_filter *match,
const struct seccomp_data *sd)
@@ -743,6 +844,7 @@ static int seccomp_do_user_notification(int this_syscall,
u32 flags = 0;
long ret = 0;
struct seccomp_knotif n = {};
+ struct seccomp_kaddfd *addfd, *tmp;
mutex_lock(&match->notify_lock);
err = -ENOSYS;
@@ -755,25 +857,43 @@ static int seccomp_do_user_notification(int this_syscall,
n.id = seccomp_next_notify_id(match);
init_completion(&n.ready);
list_add(&n.list, &match->notif->notifications);
+ INIT_LIST_HEAD(&n.addfd);
up(&match->notif->request);
- wake_up_poll(&match->notif->wqh, EPOLLIN | EPOLLRDNORM);
+ wake_up_poll(&match->wqh, EPOLLIN | EPOLLRDNORM);
mutex_unlock(&match->notify_lock);
/*
* This is where we wait for a reply from userspace.
*/
+wait:
err = wait_for_completion_interruptible(&n.ready);
mutex_lock(&match->notify_lock);
if (err == 0) {
+ /* Check if we were woken up by a addfd message */
+ addfd = list_first_entry_or_null(&n.addfd,
+ struct seccomp_kaddfd, list);
+ if (addfd && n.state != SECCOMP_NOTIFY_REPLIED) {
+ seccomp_handle_addfd(addfd);
+ mutex_unlock(&match->notify_lock);
+ goto wait;
+ }
ret = n.val;
err = n.error;
flags = n.flags;
}
+ /* If there were any pending addfd calls, clear them out */
+ list_for_each_entry_safe(addfd, tmp, &n.addfd, list) {
+ /* The process went away before we got a chance to handle it */
+ addfd->ret = -ESRCH;
+ list_del_init(&addfd->list);
+ complete(&addfd->completion);
+ }
+
/*
* Note that it's possible the listener died in between the time when
- * we were notified of a respons (or a signal) and when we were able to
+ * we were notified of a response (or a signal) and when we were able to
* re-acquire the lock, so only delete from the list if the
* notification actually exists.
*
@@ -1011,6 +1131,11 @@ static int seccomp_notify_release(struct inode *inode, struct file *file)
knotif->error = -ENOSYS;
knotif->val = 0;
+ /*
+ * We do not need to wake up any pending addfd messages, as
+ * the notifier will do that for us; this just looks
+ * like a standard reply.
+ */
complete(&knotif->ready);
}
@@ -1021,6 +1146,23 @@ static int seccomp_notify_release(struct inode *inode, struct file *file)
return 0;
}
+/* must be called with notif_lock held */
+static inline struct seccomp_knotif *
+find_notification(struct seccomp_filter *filter, u64 id)
+{
+ struct seccomp_knotif *cur;
+
+ lockdep_assert_held(&filter->notify_lock);
+
+ list_for_each_entry(cur, &filter->notif->notifications, list) {
+ if (cur->id == id)
+ return cur;
+ }
+
+ return NULL;
+}
+
+
static long seccomp_notify_recv(struct seccomp_filter *filter,
void __user *buf)
{
@@ -1064,7 +1206,7 @@ static long seccomp_notify_recv(struct seccomp_filter *filter,
unotif.data = *(knotif->data);
knotif->state = SECCOMP_NOTIFY_SENT;
- wake_up_poll(&filter->notif->wqh, EPOLLOUT | EPOLLWRNORM);
+ wake_up_poll(&filter->wqh, EPOLLOUT | EPOLLWRNORM);
ret = 0;
out:
mutex_unlock(&filter->notify_lock);
@@ -1078,15 +1220,8 @@ out:
* may have died when we released the lock, so we need to make
* sure it's still around.
*/
- knotif = NULL;
mutex_lock(&filter->notify_lock);
- list_for_each_entry(cur, &filter->notif->notifications, list) {
- if (cur->id == unotif.id) {
- knotif = cur;
- break;
- }
- }
-
+ knotif = find_notification(filter, unotif.id);
if (knotif) {
knotif->state = SECCOMP_NOTIFY_INIT;
up(&filter->notif->request);
@@ -1101,7 +1236,7 @@ static long seccomp_notify_send(struct seccomp_filter *filter,
void __user *buf)
{
struct seccomp_notif_resp resp = {};
- struct seccomp_knotif *knotif = NULL, *cur;
+ struct seccomp_knotif *knotif;
long ret;
if (copy_from_user(&resp, buf, sizeof(resp)))
@@ -1118,13 +1253,7 @@ static long seccomp_notify_send(struct seccomp_filter *filter,
if (ret < 0)
return ret;
- list_for_each_entry(cur, &filter->notif->notifications, list) {
- if (cur->id == resp.id) {
- knotif = cur;
- break;
- }
- }
-
+ knotif = find_notification(filter, resp.id);
if (!knotif) {
ret = -ENOENT;
goto out;
@@ -1150,7 +1279,7 @@ out:
static long seccomp_notify_id_valid(struct seccomp_filter *filter,
void __user *buf)
{
- struct seccomp_knotif *knotif = NULL;
+ struct seccomp_knotif *knotif;
u64 id;
long ret;
@@ -1161,17 +1290,109 @@ static long seccomp_notify_id_valid(struct seccomp_filter *filter,
if (ret < 0)
return ret;
- ret = -ENOENT;
- list_for_each_entry(knotif, &filter->notif->notifications, list) {
- if (knotif->id == id) {
- if (knotif->state == SECCOMP_NOTIFY_SENT)
- ret = 0;
- goto out;
- }
+ knotif = find_notification(filter, id);
+ if (knotif && knotif->state == SECCOMP_NOTIFY_SENT)
+ ret = 0;
+ else
+ ret = -ENOENT;
+
+ mutex_unlock(&filter->notify_lock);
+ return ret;
+}
+
+static long seccomp_notify_addfd(struct seccomp_filter *filter,
+ struct seccomp_notif_addfd __user *uaddfd,
+ unsigned int size)
+{
+ struct seccomp_notif_addfd addfd;
+ struct seccomp_knotif *knotif;
+ struct seccomp_kaddfd kaddfd;
+ int ret;
+
+ BUILD_BUG_ON(sizeof(addfd) < SECCOMP_NOTIFY_ADDFD_SIZE_VER0);
+ BUILD_BUG_ON(sizeof(addfd) != SECCOMP_NOTIFY_ADDFD_SIZE_LATEST);
+
+ if (size < SECCOMP_NOTIFY_ADDFD_SIZE_VER0 || size >= PAGE_SIZE)
+ return -EINVAL;
+
+ ret = copy_struct_from_user(&addfd, sizeof(addfd), uaddfd, size);
+ if (ret)
+ return ret;
+
+ if (addfd.newfd_flags & ~O_CLOEXEC)
+ return -EINVAL;
+
+ if (addfd.flags & ~SECCOMP_ADDFD_FLAG_SETFD)
+ return -EINVAL;
+
+ if (addfd.newfd && !(addfd.flags & SECCOMP_ADDFD_FLAG_SETFD))
+ return -EINVAL;
+
+ kaddfd.file = fget(addfd.srcfd);
+ if (!kaddfd.file)
+ return -EBADF;
+
+ kaddfd.flags = addfd.newfd_flags;
+ kaddfd.fd = (addfd.flags & SECCOMP_ADDFD_FLAG_SETFD) ?
+ addfd.newfd : -1;
+ init_completion(&kaddfd.completion);
+
+ ret = mutex_lock_interruptible(&filter->notify_lock);
+ if (ret < 0)
+ goto out;
+
+ knotif = find_notification(filter, addfd.id);
+ if (!knotif) {
+ ret = -ENOENT;
+ goto out_unlock;
}
-out:
+ /*
+ * We do not want to allow FD injection to occur before the
+ * notification has been picked up by a userspace handler, or after
+ * the notification has been replied to.
+ */
+ if (knotif->state != SECCOMP_NOTIFY_SENT) {
+ ret = -EINPROGRESS;
+ goto out_unlock;
+ }
+
+ list_add(&kaddfd.list, &knotif->addfd);
+ complete(&knotif->ready);
+ mutex_unlock(&filter->notify_lock);
+
+ /* Now we wait for it to be processed or be interrupted */
+ ret = wait_for_completion_interruptible(&kaddfd.completion);
+ if (ret == 0) {
+ /*
+ * We had a successful completion. The other side has already
+ * removed us from the addfd queue, and
+ * wait_for_completion_interruptible has a memory barrier upon
+ * success that lets us read this value directly without
+ * locking.
+ */
+ ret = kaddfd.ret;
+ goto out;
+ }
+
+ mutex_lock(&filter->notify_lock);
+ /*
+ * Even though we were woken up by a signal and not a successful
+ * completion, a completion may have happened in the meantime.
+ *
+ * We need to check again if the addfd request has been handled,
+ * and if not, we will remove it from the queue.
+ */
+ if (list_empty(&kaddfd.list))
+ ret = kaddfd.ret;
+ else
+ list_del(&kaddfd.list);
+
+out_unlock:
mutex_unlock(&filter->notify_lock);
+out:
+ fput(kaddfd.file);
+
return ret;
}
@@ -1181,13 +1402,22 @@ static long seccomp_notify_ioctl(struct file *file, unsigned int cmd,
struct seccomp_filter *filter = file->private_data;
void __user *buf = (void __user *)arg;
+ /* Fixed-size ioctls */
switch (cmd) {
case SECCOMP_IOCTL_NOTIF_RECV:
return seccomp_notify_recv(filter, buf);
case SECCOMP_IOCTL_NOTIF_SEND:
return seccomp_notify_send(filter, buf);
+ case SECCOMP_IOCTL_NOTIF_ID_VALID_WRONG_DIR:
case SECCOMP_IOCTL_NOTIF_ID_VALID:
return seccomp_notify_id_valid(filter, buf);
+ }
+
+ /* Extensible Argument ioctls */
+#define EA_IOCTL(cmd) ((cmd) & ~(IOC_INOUT | IOCSIZE_MASK))
+ switch (EA_IOCTL(cmd)) {
+ case EA_IOCTL(SECCOMP_IOCTL_NOTIF_ADDFD):
+ return seccomp_notify_addfd(filter, buf, _IOC_SIZE(cmd));
default:
return -EINVAL;
}
@@ -1200,7 +1430,7 @@ static __poll_t seccomp_notify_poll(struct file *file,
__poll_t ret = 0;
struct seccomp_knotif *cur;
- poll_wait(file, &filter->notif->wqh, poll_tab);
+ poll_wait(file, &filter->wqh, poll_tab);
if (mutex_lock_interruptible(&filter->notify_lock) < 0)
return EPOLLERR;
@@ -1216,6 +1446,9 @@ static __poll_t seccomp_notify_poll(struct file *file,
mutex_unlock(&filter->notify_lock);
+ if (refcount_read(&filter->users) == 0)
+ ret |= EPOLLHUP;
+
return ret;
}
@@ -1244,7 +1477,6 @@ static struct file *init_listener(struct seccomp_filter *filter)
sema_init(&filter->notif->request, 0);
filter->notif->next_id = get_random_u64();
INIT_LIST_HEAD(&filter->notif->notifications);
- init_waitqueue_head(&filter->notif->wqh);
ret = anon_inode_getfile("seccomp notify", &seccomp_notify_ops,
filter, O_RDWR);
@@ -1822,7 +2054,7 @@ static int __init seccomp_sysctl_init(void)
hdr = register_sysctl_paths(seccomp_sysctl_path, seccomp_sysctl_table);
if (!hdr)
- pr_warn("seccomp: sysctl registration failed\n");
+ pr_warn("sysctl registration failed\n");
else
kmemleak_not_leak(hdr);
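Editor's sketch (not part of this diff): how a user-space supervisor might drive the new addfd ioctl from its notification loop. Error handling is omitted and the wrapper name is illustrative; the struct fields and flags come from the UAPI additions this patch relies on.

	#include <fcntl.h>
	#include <sys/ioctl.h>
	#include <linux/seccomp.h>

	/* Inject 'srcfd' into the task that raised notification 'id'. */
	static int example_inject_fd(int listener, __u64 id, int srcfd)
	{
		struct seccomp_notif_addfd addfd = {
			.id = id,
			.srcfd = srcfd,
			.newfd = 0,			/* 0 without SETFD: kernel picks the number */
			.flags = 0,			/* or SECCOMP_ADDFD_FLAG_SETFD to force .newfd */
			.newfd_flags = O_CLOEXEC,	/* only O_CLOEXEC is accepted */
		};

		/* On success the return value is the fd number in the target. */
		return ioctl(listener, SECCOMP_IOCTL_NOTIF_ADDFD, &addfd);
	}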
diff --git a/kernel/softirq.c b/kernel/softirq.c
index 5e9aaa648a74..bf88d7f62433 100644
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -553,7 +553,10 @@ static void tasklet_action_common(struct softirq_action *a,
if (!test_and_clear_bit(TASKLET_STATE_SCHED,
&t->state))
BUG();
- t->func(t->data);
+ if (t->use_callback)
+ t->callback(t);
+ else
+ t->func(t->data);
tasklet_unlock(t);
continue;
}
@@ -579,6 +582,18 @@ static __latent_entropy void tasklet_hi_action(struct softirq_action *a)
tasklet_action_common(a, this_cpu_ptr(&tasklet_hi_vec), HI_SOFTIRQ);
}
+void tasklet_setup(struct tasklet_struct *t,
+ void (*callback)(struct tasklet_struct *))
+{
+ t->next = NULL;
+ t->state = 0;
+ atomic_set(&t->count, 0);
+ t->callback = callback;
+ t->use_callback = true;
+ t->data = 0;
+}
+EXPORT_SYMBOL(tasklet_setup);
+
void tasklet_init(struct tasklet_struct *t,
void (*func)(unsigned long), unsigned long data)
{
@@ -586,6 +601,7 @@ void tasklet_init(struct tasklet_struct *t,
t->state = 0;
atomic_set(&t->count, 0);
t->func = func;
+ t->use_callback = false;
t->data = data;
}
EXPORT_SYMBOL(tasklet_init);
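Editor's sketch (not part of this diff): what the new callback-style initialization looks like from a driver's perspective; struct example_dev and the function names are hypothetical.

	struct example_dev {
		struct tasklet_struct bh;
		/* ... driver state ... */
	};

	static void example_bh_func(struct tasklet_struct *t)
	{
		struct example_dev *ed = container_of(t, struct example_dev, bh);

		/* Bottom-half work using 'ed' instead of a casted 'data' value. */
	}

	static void example_dev_init(struct example_dev *ed)
	{
		/* Was: tasklet_init(&ed->bh, old_func, (unsigned long)ed). */
		tasklet_setup(&ed->bh, example_bh_func);
	}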
diff --git a/kernel/stackleak.c b/kernel/stackleak.c
index b193a59fc05b..a8fc9ae1d03d 100644
--- a/kernel/stackleak.c
+++ b/kernel/stackleak.c
@@ -104,19 +104,9 @@ asmlinkage void notrace stackleak_erase(void)
}
NOKPROBE_SYMBOL(stackleak_erase);
-void __used notrace stackleak_track_stack(void)
+void __used __no_caller_saved_registers notrace stackleak_track_stack(void)
{
- /*
- * N.B. stackleak_erase() fills the kernel stack with the poison value,
- * which has the register width. That code assumes that the value
- * of 'lowest_stack' is aligned on the register width boundary.
- *
- * That is true for x86 and x86_64 because of the kernel stack
- * alignment on these platforms (for details, see 'cc_stack_align' in
- * arch/x86/Makefile). Take care of that when you port STACKLEAK to
- * new platforms.
- */
- unsigned long sp = (unsigned long)&sp;
+ unsigned long sp = current_stack_pointer;
/*
* Having CONFIG_STACKLEAK_TRACK_MIN_SIZE larger than
@@ -125,6 +115,8 @@ void __used notrace stackleak_track_stack(void)
*/
BUILD_BUG_ON(CONFIG_STACKLEAK_TRACK_MIN_SIZE > STACKLEAK_SEARCH_DEPTH);
+ /* 'lowest_stack' should be aligned on the register width boundary */
+ sp = ALIGN(sp, sizeof(unsigned long));
if (sp < current->lowest_stack &&
sp >= (unsigned long)task_stack_page(current) +
sizeof(unsigned long)) {
diff --git a/kernel/stacktrace.c b/kernel/stacktrace.c
index 2af66e449aa6..946f44a9e86a 100644
--- a/kernel/stacktrace.c
+++ b/kernel/stacktrace.c
@@ -233,10 +233,9 @@ unsigned int stack_trace_save_user(unsigned long *store, unsigned int size)
if (current->flags & PF_KTHREAD)
return 0;
- fs = get_fs();
- set_fs(USER_DS);
+ fs = force_uaccess_begin();
arch_stack_walk_user(consume_entry, &c, task_pt_regs(current));
- set_fs(fs);
+ force_uaccess_end(fs);
return c.len;
}
diff --git a/kernel/sys.c b/kernel/sys.c
index 00a96746e28a..ca11af9d815d 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -2007,12 +2007,15 @@ static int prctl_set_mm_map(int opt, const void __user *addr, unsigned long data
if (prctl_map.exe_fd != (u32)-1) {
/*
- * Make sure the caller has the rights to
- * change /proc/pid/exe link: only local sys admin should
- * be allowed to.
+ * Check if the current user is checkpoint/restore capable.
+ * At the time of this writing, it checks for CAP_SYS_ADMIN
+ * or CAP_CHECKPOINT_RESTORE.
+ * Note that a user with access to ptrace can make an arbitrary
+ * program masquerade as any executable, even setuid ones.
+ * This may have implications in the tomoyo subsystem.
*/
- if (!ns_capable(current_user_ns(), CAP_SYS_ADMIN))
- return -EINVAL;
+ if (!checkpoint_restore_ns_capable(current_user_ns()))
+ return -EPERM;
error = prctl_set_mm_exe_file(mm, prctl_map.exe_fd);
if (error)
diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c
index 3b69a560a7ac..4d59775ea79c 100644
--- a/kernel/sys_ni.c
+++ b/kernel/sys_ni.c
@@ -364,7 +364,6 @@ COND_SYSCALL(socketcall);
COND_SYSCALL_COMPAT(socketcall);
/* compat syscalls for arm64, x86, ... */
-COND_SYSCALL_COMPAT(sysctl);
COND_SYSCALL_COMPAT(fanotify_mark);
/* x86 */
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 1b4d2dc270a5..287862f91717 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -2671,7 +2671,7 @@ static struct ctl_table vm_table[] = {
.data = &sysctl_overcommit_memory,
.maxlen = sizeof(sysctl_overcommit_memory),
.mode = 0644,
- .proc_handler = proc_dointvec_minmax,
+ .proc_handler = overcommit_policy_handler,
.extra1 = SYSCTL_ZERO,
.extra2 = &two,
},
@@ -2852,6 +2852,15 @@ static struct ctl_table vm_table[] = {
.proc_handler = sysctl_compaction_handler,
},
{
+ .procname = "compaction_proactiveness",
+ .data = &sysctl_compaction_proactiveness,
+ .maxlen = sizeof(sysctl_compaction_proactiveness),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = SYSCTL_ZERO,
+ .extra2 = &one_hundred,
+ },
+ {
.procname = "extfrag_threshold",
.data = &sysctl_extfrag_threshold,
.maxlen = sizeof(int),
diff --git a/kernel/sysctl_binary.c b/kernel/sysctl_binary.c
deleted file mode 100644
index 7d550cc76a3b..000000000000
--- a/kernel/sysctl_binary.c
+++ /dev/null
@@ -1,171 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-#include <linux/stat.h>
-#include <linux/sysctl.h>
-#include "../fs/xfs/xfs_sysctl.h"
-#include <linux/sunrpc/debug.h>
-#include <linux/string.h>
-#include <linux/syscalls.h>
-#include <linux/namei.h>
-#include <linux/mount.h>
-#include <linux/fs.h>
-#include <linux/nsproxy.h>
-#include <linux/pid_namespace.h>
-#include <linux/file.h>
-#include <linux/ctype.h>
-#include <linux/netdevice.h>
-#include <linux/kernel.h>
-#include <linux/uuid.h>
-#include <linux/slab.h>
-#include <linux/compat.h>
-
-static ssize_t binary_sysctl(const int *name, int nlen,
- void __user *oldval, size_t oldlen, void __user *newval, size_t newlen)
-{
- return -ENOSYS;
-}
-
-static void deprecated_sysctl_warning(const int *name, int nlen)
-{
- int i;
-
- /*
- * CTL_KERN/KERN_VERSION is used by older glibc and cannot
- * ever go away.
- */
- if (nlen >= 2 && name[0] == CTL_KERN && name[1] == KERN_VERSION)
- return;
-
- if (printk_ratelimit()) {
- printk(KERN_INFO
- "warning: process `%s' used the deprecated sysctl "
- "system call with ", current->comm);
- for (i = 0; i < nlen; i++)
- printk(KERN_CONT "%d.", name[i]);
- printk(KERN_CONT "\n");
- }
- return;
-}
-
-#define WARN_ONCE_HASH_BITS 8
-#define WARN_ONCE_HASH_SIZE (1<<WARN_ONCE_HASH_BITS)
-
-static DECLARE_BITMAP(warn_once_bitmap, WARN_ONCE_HASH_SIZE);
-
-#define FNV32_OFFSET 2166136261U
-#define FNV32_PRIME 0x01000193
-
-/*
- * Print each legacy sysctl (approximately) only once.
- * To avoid making the tables non-const use a external
- * hash-table instead.
- * Worst case hash collision: 6, but very rarely.
- * NOTE! We don't use the SMP-safe bit tests. We simply
- * don't care enough.
- */
-static void warn_on_bintable(const int *name, int nlen)
-{
- int i;
- u32 hash = FNV32_OFFSET;
-
- for (i = 0; i < nlen; i++)
- hash = (hash ^ name[i]) * FNV32_PRIME;
- hash %= WARN_ONCE_HASH_SIZE;
- if (__test_and_set_bit(hash, warn_once_bitmap))
- return;
- deprecated_sysctl_warning(name, nlen);
-}
-
-static ssize_t do_sysctl(int __user *args_name, int nlen,
- void __user *oldval, size_t oldlen, void __user *newval, size_t newlen)
-{
- int name[CTL_MAXNAME];
- int i;
-
- /* Check args->nlen. */
- if (nlen < 0 || nlen > CTL_MAXNAME)
- return -ENOTDIR;
- /* Read in the sysctl name for simplicity */
- for (i = 0; i < nlen; i++)
- if (get_user(name[i], args_name + i))
- return -EFAULT;
-
- warn_on_bintable(name, nlen);
-
- return binary_sysctl(name, nlen, oldval, oldlen, newval, newlen);
-}
-
-SYSCALL_DEFINE1(sysctl, struct __sysctl_args __user *, args)
-{
- struct __sysctl_args tmp;
- size_t oldlen = 0;
- ssize_t result;
-
- if (copy_from_user(&tmp, args, sizeof(tmp)))
- return -EFAULT;
-
- if (tmp.oldval && !tmp.oldlenp)
- return -EFAULT;
-
- if (tmp.oldlenp && get_user(oldlen, tmp.oldlenp))
- return -EFAULT;
-
- result = do_sysctl(tmp.name, tmp.nlen, tmp.oldval, oldlen,
- tmp.newval, tmp.newlen);
-
- if (result >= 0) {
- oldlen = result;
- result = 0;
- }
-
- if (tmp.oldlenp && put_user(oldlen, tmp.oldlenp))
- return -EFAULT;
-
- return result;
-}
-
-
-#ifdef CONFIG_COMPAT
-
-struct compat_sysctl_args {
- compat_uptr_t name;
- int nlen;
- compat_uptr_t oldval;
- compat_uptr_t oldlenp;
- compat_uptr_t newval;
- compat_size_t newlen;
- compat_ulong_t __unused[4];
-};
-
-COMPAT_SYSCALL_DEFINE1(sysctl, struct compat_sysctl_args __user *, args)
-{
- struct compat_sysctl_args tmp;
- compat_size_t __user *compat_oldlenp;
- size_t oldlen = 0;
- ssize_t result;
-
- if (copy_from_user(&tmp, args, sizeof(tmp)))
- return -EFAULT;
-
- if (tmp.oldval && !tmp.oldlenp)
- return -EFAULT;
-
- compat_oldlenp = compat_ptr(tmp.oldlenp);
- if (compat_oldlenp && get_user(oldlen, compat_oldlenp))
- return -EFAULT;
-
- result = do_sysctl(compat_ptr(tmp.name), tmp.nlen,
- compat_ptr(tmp.oldval), oldlen,
- compat_ptr(tmp.newval), tmp.newlen);
-
- if (result >= 0) {
- oldlen = result;
- result = 0;
- }
-
- if (compat_oldlenp && put_user(oldlen, compat_oldlenp))
- return -EFAULT;
-
- return result;
-}
-
-#endif /* CONFIG_COMPAT */
diff --git a/kernel/time/Kconfig b/kernel/time/Kconfig
index fcc42353f125..a09b1d61df6a 100644
--- a/kernel/time/Kconfig
+++ b/kernel/time/Kconfig
@@ -52,6 +52,15 @@ config GENERIC_CLOCKEVENTS_MIN_ADJUST
config GENERIC_CMOS_UPDATE
bool
+# Select to handle posix CPU timers from task_work
+# and not from the timer interrupt context
+config HAVE_POSIX_CPU_TIMERS_TASK_WORK
+ bool
+
+config POSIX_CPU_TIMERS_TASK_WORK
+ bool
+ default y if POSIX_TIMERS && HAVE_POSIX_CPU_TIMERS_TASK_WORK
+
if GENERIC_CLOCKEVENTS
menu "Timers subsystem"
diff --git a/kernel/time/alarmtimer.c b/kernel/time/alarmtimer.c
index 2ffb466af77e..ca223a89530a 100644
--- a/kernel/time/alarmtimer.c
+++ b/kernel/time/alarmtimer.c
@@ -192,7 +192,7 @@ static void alarmtimer_dequeue(struct alarm_base *base, struct alarm *alarm)
* When a alarm timer fires, this runs through the timerqueue to
* see which alarms expired, and runs those. If there are more alarm
* timers queued for the future, we set the hrtimer to fire when
- * when the next future alarm timer expires.
+ * the next future alarm timer expires.
*/
static enum hrtimer_restart alarmtimer_fired(struct hrtimer *timer)
{
diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c
index d89da1c7e005..c4038511d5c9 100644
--- a/kernel/time/hrtimer.c
+++ b/kernel/time/hrtimer.c
@@ -135,7 +135,11 @@ static const int hrtimer_clock_to_base_table[MAX_CLOCKS] = {
* timer->base->cpu_base
*/
static struct hrtimer_cpu_base migration_cpu_base = {
- .clock_base = { { .cpu_base = &migration_cpu_base, }, },
+ .clock_base = { {
+ .cpu_base = &migration_cpu_base,
+ .seq = SEQCNT_RAW_SPINLOCK_ZERO(migration_cpu_base.seq,
+ &migration_cpu_base.lock),
+ }, },
};
#define migration_base migration_cpu_base.clock_base[0]
@@ -1998,8 +2002,11 @@ int hrtimers_prepare_cpu(unsigned int cpu)
int i;
for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++) {
- cpu_base->clock_base[i].cpu_base = cpu_base;
- timerqueue_init_head(&cpu_base->clock_base[i].active);
+ struct hrtimer_clock_base *clock_b = &cpu_base->clock_base[i];
+
+ clock_b->cpu_base = cpu_base;
+ seqcount_raw_spinlock_init(&clock_b->seq, &cpu_base->lock);
+ timerqueue_init_head(&clock_b->active);
}
cpu_base->cpu = cpu;
diff --git a/kernel/time/namespace.c b/kernel/time/namespace.c
index 5d9fc22d836a..afc65e6be33e 100644
--- a/kernel/time/namespace.c
+++ b/kernel/time/namespace.c
@@ -280,11 +280,16 @@ static void timens_put(struct ns_common *ns)
put_time_ns(to_time_ns(ns));
}
+void timens_commit(struct task_struct *tsk, struct time_namespace *ns)
+{
+ timens_set_vvar_page(tsk, ns);
+ vdso_join_timens(tsk, ns);
+}
+
static int timens_install(struct nsset *nsset, struct ns_common *new)
{
struct nsproxy *nsproxy = nsset->nsproxy;
struct time_namespace *ns = to_time_ns(new);
- int err;
if (!current_is_single_threaded())
return -EUSERS;
@@ -293,12 +298,6 @@ static int timens_install(struct nsset *nsset, struct ns_common *new)
!ns_capable(nsset->cred->user_ns, CAP_SYS_ADMIN))
return -EPERM;
- timens_set_vvar_page(current, ns);
-
- err = vdso_join_timens(current, ns);
- if (err)
- return err;
-
get_time_ns(ns);
put_time_ns(nsproxy->time_ns);
nsproxy->time_ns = ns;
@@ -313,22 +312,17 @@ int timens_on_fork(struct nsproxy *nsproxy, struct task_struct *tsk)
{
struct ns_common *nsc = &nsproxy->time_ns_for_children->ns;
struct time_namespace *ns = to_time_ns(nsc);
- int err;
/* create_new_namespaces() already incremented the ref counter */
if (nsproxy->time_ns == nsproxy->time_ns_for_children)
return 0;
- timens_set_vvar_page(tsk, ns);
-
- err = vdso_join_timens(tsk, ns);
- if (err)
- return err;
-
get_time_ns(ns);
put_time_ns(nsproxy->time_ns);
nsproxy->time_ns = ns;
+ timens_commit(tsk, ns);
+
return 0;
}
diff --git a/kernel/time/posix-cpu-timers.c b/kernel/time/posix-cpu-timers.c
index 165117996ea0..a71758e34e45 100644
--- a/kernel/time/posix-cpu-timers.c
+++ b/kernel/time/posix-cpu-timers.c
@@ -377,6 +377,7 @@ static int posix_cpu_clock_get(const clockid_t clock, struct timespec64 *tp)
*/
static int posix_cpu_timer_create(struct k_itimer *new_timer)
{
+ static struct lock_class_key posix_cpu_timers_key;
struct pid *pid;
rcu_read_lock();
@@ -386,6 +387,17 @@ static int posix_cpu_timer_create(struct k_itimer *new_timer)
return -EINVAL;
}
+ /*
+ * If posix timer expiry is handled in task work context then
+ * timer::it_lock can be taken without disabling interrupts as all
+ * other locking happens in task context. This requires a separate
+ * lock class key; otherwise regular posix timer expiry would record
+ * the lock class being taken in interrupt context and generate a
+ * false positive warning.
+ */
+ if (IS_ENABLED(CONFIG_POSIX_CPU_TIMERS_TASK_WORK))
+ lockdep_set_class(&new_timer->it_lock, &posix_cpu_timers_key);
+
new_timer->kclock = &clock_posix_cpu;
timerqueue_init(&new_timer->it.cpu.node);
new_timer->it.cpu.pid = get_pid(pid);
@@ -1080,43 +1092,163 @@ static inline bool fastpath_timer_check(struct task_struct *tsk)
return false;
}
+static void handle_posix_cpu_timers(struct task_struct *tsk);
+
+#ifdef CONFIG_POSIX_CPU_TIMERS_TASK_WORK
+static void posix_cpu_timers_work(struct callback_head *work)
+{
+ handle_posix_cpu_timers(current);
+}
+
/*
- * This is called from the timer interrupt handler. The irq handler has
- * already updated our counts. We need to check if any timers fire now.
- * Interrupts are disabled.
+ * Initialize posix CPU timers task work in init task. Out of line to
+ * keep the callback static and to avoid header recursion hell.
*/
-void run_posix_cpu_timers(void)
+void __init posix_cputimers_init_work(void)
{
- struct task_struct *tsk = current;
- struct k_itimer *timer, *next;
- unsigned long flags;
- LIST_HEAD(firing);
+ init_task_work(&current->posix_cputimers_work.work,
+ posix_cpu_timers_work);
+}
- lockdep_assert_irqs_disabled();
+/*
+ * Note: All operations on tsk->posix_cputimer_work.scheduled happen either
+ * in hard interrupt context or in task context with interrupts
+ * disabled. Aside of that the writer/reader interaction is always in the
+ * context of the current task, which means they are strict per CPU.
+ */
+static inline bool posix_cpu_timers_work_scheduled(struct task_struct *tsk)
+{
+ return tsk->posix_cputimers_work.scheduled;
+}
- /*
- * The fast path checks that there are no expired thread or thread
- * group timers. If that's so, just return.
- */
- if (!fastpath_timer_check(tsk))
+static inline void __run_posix_cpu_timers(struct task_struct *tsk)
+{
+ if (WARN_ON_ONCE(tsk->posix_cputimers_work.scheduled))
return;
- lockdep_posixtimer_enter();
- if (!lock_task_sighand(tsk, &flags)) {
- lockdep_posixtimer_exit();
- return;
+ /* Schedule task work to actually expire the timers */
+ tsk->posix_cputimers_work.scheduled = true;
+ task_work_add(tsk, &tsk->posix_cputimers_work.work, TWA_RESUME);
+}
+
+static inline bool posix_cpu_timers_enable_work(struct task_struct *tsk,
+ unsigned long start)
+{
+ bool ret = true;
+
+ /*
+ * On !RT kernels interrupts are disabled while collecting expired
+ * timers, so no tick can happen and the fast path check can be
+ * reenabled without further checks.
+ */
+ if (!IS_ENABLED(CONFIG_PREEMPT_RT)) {
+ tsk->posix_cputimers_work.scheduled = false;
+ return true;
}
+
/*
- * Here we take off tsk->signal->cpu_timers[N] and
- * tsk->cpu_timers[N] all the timers that are firing, and
- * put them on the firing list.
+ * On RT enabled kernels ticks can happen while the expired timers
+ * are collected under sighand lock. But any tick which observes
+ * the CPUTIMERS_WORK_SCHEDULED bit set does not run the fastpath
+ * checks. So reenabling the tick work has to be done carefully:
+ *
+ * Disable interrupts and run the fast path check if jiffies have
+ * advanced since the collecting of expired timers started. If
+ * jiffies have not advanced or the fast path check did not find
+ * newly expired timers, reenable the fast path check in the timer
+ * interrupt. If there are newly expired timers, return false and
+ * let the collection loop repeat.
*/
- check_thread_timers(tsk, &firing);
+ local_irq_disable();
+ if (start != jiffies && fastpath_timer_check(tsk))
+ ret = false;
+ else
+ tsk->posix_cputimers_work.scheduled = false;
+ local_irq_enable();
+
+ return ret;
+}
+#else /* CONFIG_POSIX_CPU_TIMERS_TASK_WORK */
+static inline void __run_posix_cpu_timers(struct task_struct *tsk)
+{
+ lockdep_posixtimer_enter();
+ handle_posix_cpu_timers(tsk);
+ lockdep_posixtimer_exit();
+}
+
+static inline bool posix_cpu_timers_work_scheduled(struct task_struct *tsk)
+{
+ return false;
+}
+
+static inline bool posix_cpu_timers_enable_work(struct task_struct *tsk,
+ unsigned long start)
+{
+ return true;
+}
+#endif /* CONFIG_POSIX_CPU_TIMERS_TASK_WORK */
+
+static void handle_posix_cpu_timers(struct task_struct *tsk)
+{
+ struct k_itimer *timer, *next;
+ unsigned long flags, start;
+ LIST_HEAD(firing);
+
+ if (!lock_task_sighand(tsk, &flags))
+ return;
- check_process_timers(tsk, &firing);
+ do {
+ /*
+ * On RT locking sighand lock does not disable interrupts,
+ * so this needs to be careful vs. ticks. Store the current
+ * jiffies value.
+ */
+ start = READ_ONCE(jiffies);
+ barrier();
+
+ /*
+ * Here we take off tsk->signal->cpu_timers[N] and
+ * tsk->cpu_timers[N] all the timers that are firing, and
+ * put them on the firing list.
+ */
+ check_thread_timers(tsk, &firing);
+
+ check_process_timers(tsk, &firing);
+
+ /*
+ * The above timer checks have updated the expiry cache and,
+ * because nothing can have queued or modified timers after
+ * sighand lock was taken above, it is guaranteed to be
+ * consistent. So the next timer interrupt fastpath check
+ * will find valid data.
+ *
+ * If timer expiry runs in the timer interrupt context then
+ * the loop is not relevant as timers will be directly
+ * expired in interrupt context. The stub function below
+ * always returns true, which allows the compiler to
+ * optimize the loop out.
+ *
+ * If timer expiry is deferred to task work context then
+ * the following rules apply:
+ *
+ * - On !RT kernels no tick can have happened on this CPU
+ * after sighand lock was acquired because interrupts are
+ * disabled. So reenabling task work before dropping
+ * sighand lock and reenabling interrupts is race free.
+ *
+ * - On RT kernels ticks might have happened but the tick
+ * work ignored posix CPU timer handling because the
+ * CPUTIMERS_WORK_SCHEDULED bit is set. Reenabling work
+ * must be done very carefully including a check whether
+ * ticks have happened since the start of the timer
+ * expiry checks. posix_cpu_timers_enable_work() takes
+ * care of that and eventually lets the expiry checks
+ * run again.
+ */
+ } while (!posix_cpu_timers_enable_work(tsk, start));
/*
- * We must release these locks before taking any timer's lock.
+ * We must release sighand lock before taking any timer's lock.
* There is a potential race with timer deletion here, as the
* siglock now protects our private firing list. We have set
* the firing flag in each timer, so that a deletion attempt
@@ -1134,6 +1266,13 @@ void run_posix_cpu_timers(void)
list_for_each_entry_safe(timer, next, &firing, it.cpu.elist) {
int cpu_firing;
+ /*
+ * spin_lock() is sufficient here even independent of the
+ * expiry context. If expiry happens in hard interrupt
+ * context it's obvious. For task work context it's safe
+ * because all other operations on timer::it_lock happen in
+ * task context (syscall or exit).
+ */
spin_lock(&timer->it_lock);
list_del_init(&timer->it.cpu.elist);
cpu_firing = timer->it.cpu.firing;
@@ -1147,7 +1286,34 @@ void run_posix_cpu_timers(void)
cpu_timer_fire(timer);
spin_unlock(&timer->it_lock);
}
- lockdep_posixtimer_exit();
+}
+
+/*
+ * This is called from the timer interrupt handler. The irq handler has
+ * already updated our counts. We need to check if any timers fire now.
+ * Interrupts are disabled.
+ */
+void run_posix_cpu_timers(void)
+{
+ struct task_struct *tsk = current;
+
+ lockdep_assert_irqs_disabled();
+
+ /*
+ * If the actual expiry is deferred to task work context and the
+ * work is already scheduled, there is no point in doing anything here.
+ */
+ if (posix_cpu_timers_work_scheduled(tsk))
+ return;
+
+ /*
+ * The fast path checks that there are no expired thread or thread
+ * group timers. If that's so, just return.
+ */
+ if (!fastpath_timer_check(tsk))
+ return;
+
+ __run_posix_cpu_timers(tsk);
}
/*
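The deferral above hinges on task_work: the timer interrupt only sets a per-task flag and queues a callback that runs when the task returns to user space. A rough, self-contained sketch of that pattern follows; it is not taken from the patch, and all names below are invented, only init_task_work(), task_work_add() and TWA_RESUME are assumed as used above.

/*
 * Illustrative sketch only: defer work from hard interrupt context to
 * task context via task_work. "deferred_expiry" and friends are made up.
 */
#include <linux/kernel.h>
#include <linux/sched.h>
#include <linux/task_work.h>

struct deferred_expiry {
	struct callback_head	work;
	bool			scheduled;
};

static void deferred_expiry_func(struct callback_head *head)
{
	struct deferred_expiry *de = container_of(head, struct deferred_expiry, work);

	de->scheduled = false;
	/* The heavy lifting happens here, with interrupts enabled. */
}

/* Called from the timer interrupt with interrupts disabled. */
static void defer_expiry(struct deferred_expiry *de)
{
	if (de->scheduled)
		return;

	de->scheduled = true;
	init_task_work(&de->work, deferred_expiry_func);
	/* Error handling (exiting task) omitted for brevity. */
	task_work_add(current, &de->work, TWA_RESUME);
}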
diff --git a/kernel/time/sched_clock.c b/kernel/time/sched_clock.c
index 0deaf4b79fb4..1c03eec6ca9b 100644
--- a/kernel/time/sched_clock.c
+++ b/kernel/time/sched_clock.c
@@ -229,7 +229,7 @@ void __init generic_sched_clock_init(void)
{
/*
* If no sched_clock() function has been provided at that point,
- * make it the final one one.
+ * make it the final one.
*/
if (cd.actual_read_sched_clock == jiffy_sched_clock_read)
sched_clock_register(jiffy_sched_clock_read, BITS_PER_LONG, HZ);
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
index 63a632f9896c..4c47f388a83f 100644
--- a/kernel/time/timekeeping.c
+++ b/kernel/time/timekeeping.c
@@ -39,18 +39,19 @@ enum timekeeping_adv_mode {
TK_ADV_FREQ
};
+DEFINE_RAW_SPINLOCK(timekeeper_lock);
+
/*
* The most important data for readout fits into a single 64 byte
* cache line.
*/
static struct {
- seqcount_t seq;
+ seqcount_raw_spinlock_t seq;
struct timekeeper timekeeper;
} tk_core ____cacheline_aligned = {
- .seq = SEQCNT_ZERO(tk_core.seq),
+ .seq = SEQCNT_RAW_SPINLOCK_ZERO(tk_core.seq, &timekeeper_lock),
};
-static DEFINE_RAW_SPINLOCK(timekeeper_lock);
static struct timekeeper shadow_timekeeper;
/**
@@ -63,7 +64,7 @@ static struct timekeeper shadow_timekeeper;
* See @update_fast_timekeeper() below.
*/
struct tk_fast {
- seqcount_t seq;
+ seqcount_raw_spinlock_t seq;
struct tk_read_base base[2];
};
@@ -80,11 +81,13 @@ static struct clocksource dummy_clock = {
};
static struct tk_fast tk_fast_mono ____cacheline_aligned = {
+ .seq = SEQCNT_RAW_SPINLOCK_ZERO(tk_fast_mono.seq, &timekeeper_lock),
.base[0] = { .clock = &dummy_clock, },
.base[1] = { .clock = &dummy_clock, },
};
static struct tk_fast tk_fast_raw ____cacheline_aligned = {
+ .seq = SEQCNT_RAW_SPINLOCK_ZERO(tk_fast_raw.seq, &timekeeper_lock),
.base[0] = { .clock = &dummy_clock, },
.base[1] = { .clock = &dummy_clock, },
};
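The conversion above ties the timekeeping sequence counts to timekeeper_lock, so lockdep can verify that every writer holds the lock while readers remain lockless. A minimal sketch of the seqcount_raw_spinlock_t pattern, with invented data and function names and assuming the generic seqlock.h accessors work on the lock-associated seqcount type as they do in the hunks above:

#include <linux/seqlock.h>
#include <linux/spinlock.h>
#include <linux/types.h>

static DEFINE_RAW_SPINLOCK(sample_lock);

static struct {
	seqcount_raw_spinlock_t	seq;
	u64			value;
} sample_data = {
	.seq = SEQCNT_RAW_SPINLOCK_ZERO(sample_data.seq, &sample_lock),
};

/* Writer: must hold the associated raw spinlock. */
static void sample_update(u64 val)
{
	unsigned long flags;

	raw_spin_lock_irqsave(&sample_lock, flags);
	write_seqcount_begin(&sample_data.seq);
	sample_data.value = val;
	write_seqcount_end(&sample_data.seq);
	raw_spin_unlock_irqrestore(&sample_lock, flags);
}

/* Reader: lockless, retries if it raced with a writer. */
static u64 sample_read(void)
{
	unsigned int seq;
	u64 val;

	do {
		seq = read_seqcount_begin(&sample_data.seq);
		val = sample_data.value;
	} while (read_seqcount_retry(&sample_data.seq, seq));

	return val;
}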
@@ -157,7 +160,7 @@ static inline void tk_update_sleep_time(struct timekeeper *tk, ktime_t delta)
* tk_clock_read - atomic clocksource read() helper
*
* This helper is necessary to use in the read paths because, while the
- * seqlock ensures we don't return a bad value while structures are updated,
+ * seqcount ensures we don't return a bad value while structures are updated,
* it doesn't protect from potential crashes. There is the possibility that
* the tkr's clocksource may change between the read reference, and the
* clock reference passed to the read function. This can cause crashes if
@@ -222,10 +225,10 @@ static inline u64 timekeeping_get_delta(const struct tk_read_base *tkr)
unsigned int seq;
/*
- * Since we're called holding a seqlock, the data may shift
+ * Since we're called holding a seqcount, the data may shift
* under us while we're doing the calculation. This can cause
* false positives, since we'd note a problem but throw the
- * results away. So nest another seqlock here to atomically
+ * results away. So nest another seqcount here to atomically
* grab the points we are checking with.
*/
do {
@@ -486,7 +489,7 @@ EXPORT_SYMBOL_GPL(ktime_get_raw_fast_ns);
*
* To keep it NMI safe since we're accessing from tracing, we're not using a
* separate timekeeper with updates to monotonic clock and boot offset
- * protected with seqlocks. This has the following minor side effects:
+ * protected with seqcounts. This has the following minor side effects:
*
* (1) Its possible that a timestamp be taken after the boot offset is updated
* but before the timekeeper is updated. If this happens, the new boot offset
@@ -2001,7 +2004,7 @@ static inline unsigned int accumulate_nsecs_to_secs(struct timekeeper *tk)
* logarithmic_accumulation - shifted accumulation of cycles
*
* This functions accumulates a shifted interval of cycles into
- * into a shifted interval nanoseconds. Allows for O(log) accumulation
+ * a shifted interval of nanoseconds. Allows for O(log) accumulation
* loop.
*
* Returns the unconsumed cycles.
diff --git a/kernel/time/timekeeping_internal.h b/kernel/time/timekeeping_internal.h
index bcbb52db2256..4ca2787d1642 100644
--- a/kernel/time/timekeeping_internal.h
+++ b/kernel/time/timekeeping_internal.h
@@ -1,12 +1,14 @@
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _TIMEKEEPING_INTERNAL_H
#define _TIMEKEEPING_INTERNAL_H
-/*
- * timekeeping debug functions
- */
+
#include <linux/clocksource.h>
+#include <linux/spinlock.h>
#include <linux/time.h>
+/*
+ * timekeeping debug functions
+ */
#ifdef CONFIG_DEBUG_FS
extern void tk_debug_account_sleep_time(const struct timespec64 *t);
#else
@@ -31,4 +33,7 @@ static inline u64 clocksource_delta(u64 now, u64 last, u64 mask)
}
#endif
+/* Semi public for serialization of non timekeeper VDSO updates. */
+extern raw_spinlock_t timekeeper_lock;
+
#endif /* _TIMEKEEPING_INTERNAL_H */
diff --git a/kernel/time/timer.c b/kernel/time/timer.c
index 026ac01af9da..a16764b0116e 100644
--- a/kernel/time/timer.c
+++ b/kernel/time/timer.c
@@ -157,7 +157,8 @@ EXPORT_SYMBOL(jiffies_64);
/*
* The time start value for each level to select the bucket at enqueue
- * time.
+ * time. We start from the last possible delta of the previous level
+ * so that we can later add an extra LVL_GRAN(n) to n (see calc_index()).
*/
#define LVL_START(n) ((LVL_SIZE - 1) << (((n) - 1) * LVL_CLK_SHIFT))
@@ -204,8 +205,8 @@ struct timer_base {
unsigned long clk;
unsigned long next_expiry;
unsigned int cpu;
+ bool next_expiry_recalc;
bool is_idle;
- bool must_forward_clk;
DECLARE_BITMAP(pending_map, WHEEL_SIZE);
struct hlist_head vectors[WHEEL_SIZE];
} ____cacheline_aligned;
@@ -488,35 +489,48 @@ static inline void timer_set_idx(struct timer_list *timer, unsigned int idx)
* Helper function to calculate the array index for a given expiry
* time.
*/
-static inline unsigned calc_index(unsigned expires, unsigned lvl)
+static inline unsigned calc_index(unsigned long expires, unsigned lvl,
+ unsigned long *bucket_expiry)
{
+
+ /*
+ * The timer wheel has to guarantee that a timer does not fire
+ * early. Early expiry can happen due to:
+ * - Timer is armed at the edge of a tick
+ * - Truncation of the expiry time in the outer wheel levels
+ *
+ * Round up with level granularity to prevent this.
+ */
expires = (expires + LVL_GRAN(lvl)) >> LVL_SHIFT(lvl);
+ *bucket_expiry = expires << LVL_SHIFT(lvl);
return LVL_OFFS(lvl) + (expires & LVL_MASK);
}
-static int calc_wheel_index(unsigned long expires, unsigned long clk)
+static int calc_wheel_index(unsigned long expires, unsigned long clk,
+ unsigned long *bucket_expiry)
{
unsigned long delta = expires - clk;
unsigned int idx;
if (delta < LVL_START(1)) {
- idx = calc_index(expires, 0);
+ idx = calc_index(expires, 0, bucket_expiry);
} else if (delta < LVL_START(2)) {
- idx = calc_index(expires, 1);
+ idx = calc_index(expires, 1, bucket_expiry);
} else if (delta < LVL_START(3)) {
- idx = calc_index(expires, 2);
+ idx = calc_index(expires, 2, bucket_expiry);
} else if (delta < LVL_START(4)) {
- idx = calc_index(expires, 3);
+ idx = calc_index(expires, 3, bucket_expiry);
} else if (delta < LVL_START(5)) {
- idx = calc_index(expires, 4);
+ idx = calc_index(expires, 4, bucket_expiry);
} else if (delta < LVL_START(6)) {
- idx = calc_index(expires, 5);
+ idx = calc_index(expires, 5, bucket_expiry);
} else if (delta < LVL_START(7)) {
- idx = calc_index(expires, 6);
+ idx = calc_index(expires, 6, bucket_expiry);
} else if (LVL_DEPTH > 8 && delta < LVL_START(8)) {
- idx = calc_index(expires, 7);
+ idx = calc_index(expires, 7, bucket_expiry);
} else if ((long) delta < 0) {
idx = clk & LVL_MASK;
+ *bucket_expiry = clk;
} else {
/*
* Force expire obscene large timeouts to expire at the
@@ -525,34 +539,11 @@ static int calc_wheel_index(unsigned long expires, unsigned long clk)
if (delta >= WHEEL_TIMEOUT_CUTOFF)
expires = clk + WHEEL_TIMEOUT_MAX;
- idx = calc_index(expires, LVL_DEPTH - 1);
+ idx = calc_index(expires, LVL_DEPTH - 1, bucket_expiry);
}
return idx;
}
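To see what the rounding buys, here is a small user-space recreation of calc_index(); illustration only, assuming the wheel constants defined earlier in timer.c (LVL_CLK_SHIFT of 3, 64 buckets per level). It prints the bucket index and bucket_expiry for one expiry value at a few levels, showing that bucket_expiry is always rounded up and never down, which is also why enqueue_timer() below compares bucket_expiry rather than timer->expires against base->next_expiry.

#include <stdio.h>

#define LVL_CLK_SHIFT	3
#define LVL_BITS	6
#define LVL_SIZE	(1UL << LVL_BITS)
#define LVL_MASK	(LVL_SIZE - 1)
#define LVL_SHIFT(n)	((n) * LVL_CLK_SHIFT)
#define LVL_GRAN(n)	(1UL << LVL_SHIFT(n))
#define LVL_OFFS(n)	((n) * LVL_SIZE)

/* Mirror of the kernel's calc_index() rounding, for demonstration. */
static unsigned calc_index(unsigned long expires, unsigned lvl,
			   unsigned long *bucket_expiry)
{
	expires = (expires + LVL_GRAN(lvl)) >> LVL_SHIFT(lvl);
	*bucket_expiry = expires << LVL_SHIFT(lvl);
	return LVL_OFFS(lvl) + (expires & LVL_MASK);
}

int main(void)
{
	unsigned long expires = 1000, bucket_expiry;
	unsigned lvl, idx;

	for (lvl = 0; lvl < 3; lvl++) {
		idx = calc_index(expires, lvl, &bucket_expiry);
		printf("lvl %u: idx %u, bucket_expiry %lu (>= expires %lu)\n",
		       lvl, idx, bucket_expiry, expires);
	}
	return 0;
}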
-/*
- * Enqueue the timer into the hash bucket, mark it pending in
- * the bitmap and store the index in the timer flags.
- */
-static void enqueue_timer(struct timer_base *base, struct timer_list *timer,
- unsigned int idx)
-{
- hlist_add_head(&timer->entry, base->vectors + idx);
- __set_bit(idx, base->pending_map);
- timer_set_idx(timer, idx);
-
- trace_timer_start(timer, timer->expires, timer->flags);
-}
-
-static void
-__internal_add_timer(struct timer_base *base, struct timer_list *timer)
-{
- unsigned int idx;
-
- idx = calc_wheel_index(timer->expires, base->clk);
- enqueue_timer(base, timer, idx);
-}
-
static void
trigger_dyntick_cpu(struct timer_base *base, struct timer_list *timer)
{
@@ -574,34 +565,48 @@ trigger_dyntick_cpu(struct timer_base *base, struct timer_list *timer)
* timer is not deferrable. If the other CPU is on the way to idle
* then it can't set base->is_idle as we hold the base lock:
*/
- if (!base->is_idle)
- return;
+ if (base->is_idle)
+ wake_up_nohz_cpu(base->cpu);
+}
- /* Check whether this is the new first expiring timer: */
- if (time_after_eq(timer->expires, base->next_expiry))
- return;
+/*
+ * Enqueue the timer into the hash bucket, mark it pending in
+ * the bitmap, store the index in the timer flags then wake up
+ * the target CPU if needed.
+ */
+static void enqueue_timer(struct timer_base *base, struct timer_list *timer,
+ unsigned int idx, unsigned long bucket_expiry)
+{
+
+ hlist_add_head(&timer->entry, base->vectors + idx);
+ __set_bit(idx, base->pending_map);
+ timer_set_idx(timer, idx);
+
+ trace_timer_start(timer, timer->expires, timer->flags);
/*
- * Set the next expiry time and kick the CPU so it can reevaluate the
- * wheel:
+ * Check whether this is the new first expiring timer. The
+ * effective expiry time of the timer is required here
+ * (bucket_expiry) instead of timer->expires.
*/
- if (time_before(timer->expires, base->clk)) {
+ if (time_before(bucket_expiry, base->next_expiry)) {
/*
- * Prevent from forward_timer_base() moving the base->clk
- * backward
+ * Set the next expiry time and kick the CPU so it
+ * can reevaluate the wheel:
*/
- base->next_expiry = base->clk;
- } else {
- base->next_expiry = timer->expires;
+ base->next_expiry = bucket_expiry;
+ base->next_expiry_recalc = false;
+ trigger_dyntick_cpu(base, timer);
}
- wake_up_nohz_cpu(base->cpu);
}
-static void
-internal_add_timer(struct timer_base *base, struct timer_list *timer)
+static void internal_add_timer(struct timer_base *base, struct timer_list *timer)
{
- __internal_add_timer(base, timer);
- trigger_dyntick_cpu(base, timer);
+ unsigned long bucket_expiry;
+ unsigned int idx;
+
+ idx = calc_wheel_index(timer->expires, base->clk, &bucket_expiry);
+ enqueue_timer(base, timer, idx, bucket_expiry);
}
#ifdef CONFIG_DEBUG_OBJECTS_TIMERS
@@ -834,8 +839,10 @@ static int detach_if_pending(struct timer_list *timer, struct timer_base *base,
if (!timer_pending(timer))
return 0;
- if (hlist_is_singular_node(&timer->entry, base->vectors + idx))
+ if (hlist_is_singular_node(&timer->entry, base->vectors + idx)) {
__clear_bit(idx, base->pending_map);
+ base->next_expiry_recalc = true;
+ }
detach_timer(timer, clear_pending);
return 1;
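The next_expiry_recalc flag set above is the cheap half of a lazily maintained "earliest deadline" cache: removals only mark the cached value as possibly stale, insertions of an earlier deadline refresh it directly, and the expensive rescan is deferred until get_next_timer_interrupt() or __run_timers() actually needs the value. A generic sketch of the idea, not from the patch and with all names invented:

#include <linux/types.h>

struct deadline_cache {
	unsigned long	next;		/* cached earliest deadline */
	bool		recalc;		/* set when 'next' may be stale */
};

static void cache_insert(struct deadline_cache *c, unsigned long deadline)
{
	/* A strictly earlier deadline makes the cache exact again. */
	if (deadline < c->next) {
		c->next = deadline;
		c->recalc = false;
	}
}

static void cache_remove(struct deadline_cache *c)
{
	/* Recomputing here would be a full scan; just mark it stale. */
	c->recalc = true;
}

static unsigned long cache_next(struct deadline_cache *c,
				unsigned long (*rescan)(void *), void *arg)
{
	if (c->recalc) {
		c->next = rescan(arg);
		c->recalc = false;
	}
	return c->next;
}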
@@ -885,20 +892,14 @@ get_target_base(struct timer_base *base, unsigned tflags)
static inline void forward_timer_base(struct timer_base *base)
{
-#ifdef CONFIG_NO_HZ_COMMON
- unsigned long jnow;
+ unsigned long jnow = READ_ONCE(jiffies);
/*
- * We only forward the base when we are idle or have just come out of
- * idle (must_forward_clk logic), and have a delta between base clock
- * and jiffies. In the common case, run_timers will take care of it.
+ * No need to forward if we are close enough below jiffies.
+ * Also while executing timers, base->clk is 1 offset ahead
+ * of jiffies to avoid endless requeuing to current jiffies.
*/
- if (likely(!base->must_forward_clk))
- return;
-
- jnow = READ_ONCE(jiffies);
- base->must_forward_clk = base->is_idle;
- if ((long)(jnow - base->clk) < 2)
+ if ((long)(jnow - base->clk) < 1)
return;
/*
@@ -912,7 +913,6 @@ static inline void forward_timer_base(struct timer_base *base)
return;
base->clk = base->next_expiry;
}
-#endif
}
@@ -960,9 +960,9 @@ static struct timer_base *lock_timer_base(struct timer_list *timer,
static inline int
__mod_timer(struct timer_list *timer, unsigned long expires, unsigned int options)
{
+ unsigned long clk = 0, flags, bucket_expiry;
struct timer_base *base, *new_base;
unsigned int idx = UINT_MAX;
- unsigned long clk = 0, flags;
int ret = 0;
BUG_ON(!timer->function);
@@ -1001,7 +1001,7 @@ __mod_timer(struct timer_list *timer, unsigned long expires, unsigned int option
}
clk = base->clk;
- idx = calc_wheel_index(expires, clk);
+ idx = calc_wheel_index(expires, clk, &bucket_expiry);
/*
* Retrieve and compare the array index of the pending
@@ -1054,16 +1054,13 @@ __mod_timer(struct timer_list *timer, unsigned long expires, unsigned int option
/*
* If 'idx' was calculated above and the base time did not advance
* between calculating 'idx' and possibly switching the base, only
- * enqueue_timer() and trigger_dyntick_cpu() is required. Otherwise
- * we need to (re)calculate the wheel index via
- * internal_add_timer().
+ * enqueue_timer() is required. Otherwise we need to (re)calculate
+ * the wheel index via internal_add_timer().
*/
- if (idx != UINT_MAX && clk == base->clk) {
- enqueue_timer(base, timer, idx);
- trigger_dyntick_cpu(base, timer);
- } else {
+ if (idx != UINT_MAX && clk == base->clk)
+ enqueue_timer(base, timer, idx, bucket_expiry);
+ else
internal_add_timer(base, timer);
- }
out_unlock:
raw_spin_unlock_irqrestore(&base->lock, flags);
@@ -1466,10 +1463,10 @@ static void expire_timers(struct timer_base *base, struct hlist_head *head)
}
}
-static int __collect_expired_timers(struct timer_base *base,
- struct hlist_head *heads)
+static int collect_expired_timers(struct timer_base *base,
+ struct hlist_head *heads)
{
- unsigned long clk = base->clk;
+ unsigned long clk = base->clk = base->next_expiry;
struct hlist_head *vec;
int i, levels = 0;
unsigned int idx;
@@ -1491,7 +1488,6 @@ static int __collect_expired_timers(struct timer_base *base,
return levels;
}
-#ifdef CONFIG_NO_HZ_COMMON
/*
* Find the next pending bucket of a level. Search from level start (@offset)
* + @clk upwards and if nothing there, search from start of the level
@@ -1524,6 +1520,7 @@ static unsigned long __next_timer_interrupt(struct timer_base *base)
clk = base->clk;
for (lvl = 0; lvl < LVL_DEPTH; lvl++, offset += LVL_SIZE) {
int pos = next_pending_bucket(base, offset, clk & LVL_MASK);
+ unsigned long lvl_clk = clk & LVL_CLK_MASK;
if (pos >= 0) {
unsigned long tmp = clk + (unsigned long) pos;
@@ -1531,6 +1528,13 @@ static unsigned long __next_timer_interrupt(struct timer_base *base)
tmp <<= LVL_SHIFT(lvl);
if (time_before(tmp, next))
next = tmp;
+
+ /*
+ * If the next expiration happens before we reach
+ * the next level, no need to check further.
+ */
+ if (pos <= ((LVL_CLK_DIV - lvl_clk) & LVL_CLK_MASK))
+ break;
}
/*
* Clock for the next level. If the current level clock lower
@@ -1568,13 +1572,17 @@ static unsigned long __next_timer_interrupt(struct timer_base *base)
* So the simple check whether the lower bits of the current
* level are 0 or not is sufficient for all cases.
*/
- adj = clk & LVL_CLK_MASK ? 1 : 0;
+ adj = lvl_clk ? 1 : 0;
clk >>= LVL_CLK_SHIFT;
clk += adj;
}
+
+ base->next_expiry_recalc = false;
+
return next;
}
+#ifdef CONFIG_NO_HZ_COMMON
/*
* Check, if the next hrtimer event is before the next timer wheel
* event:
@@ -1631,9 +1639,11 @@ u64 get_next_timer_interrupt(unsigned long basej, u64 basem)
return expires;
raw_spin_lock(&base->lock);
- nextevt = __next_timer_interrupt(base);
+ if (base->next_expiry_recalc)
+ base->next_expiry = __next_timer_interrupt(base);
+ nextevt = base->next_expiry;
is_max_delta = (nextevt == base->clk + NEXT_TIMER_MAX_DELTA);
- base->next_expiry = nextevt;
+
/*
* We have a fresh next event. Check whether we can forward the
* base. We can only do that when @basej is past base->clk
@@ -1659,10 +1669,8 @@ u64 get_next_timer_interrupt(unsigned long basej, u64 basem)
* logic is only maintained for the BASE_STD base, deferrable
* timers may still see large granularity skew (by design).
*/
- if ((expires - basem) > TICK_NSEC) {
- base->must_forward_clk = true;
+ if ((expires - basem) > TICK_NSEC)
base->is_idle = true;
- }
}
raw_spin_unlock(&base->lock);
@@ -1686,42 +1694,6 @@ void timer_clear_idle(void)
*/
base->is_idle = false;
}
-
-static int collect_expired_timers(struct timer_base *base,
- struct hlist_head *heads)
-{
- unsigned long now = READ_ONCE(jiffies);
-
- /*
- * NOHZ optimization. After a long idle sleep we need to forward the
- * base to current jiffies. Avoid a loop by searching the bitfield for
- * the next expiring timer.
- */
- if ((long)(now - base->clk) > 2) {
- unsigned long next = __next_timer_interrupt(base);
-
- /*
- * If the next timer is ahead of time forward to current
- * jiffies, otherwise forward to the next expiry time:
- */
- if (time_after(next, now)) {
- /*
- * The call site will increment base->clk and then
- * terminate the expiry loop immediately.
- */
- base->clk = now;
- return 0;
- }
- base->clk = next;
- }
- return __collect_expired_timers(base, heads);
-}
-#else
-static inline int collect_expired_timers(struct timer_base *base,
- struct hlist_head *heads)
-{
- return __collect_expired_timers(base, heads);
-}
#endif
/*
@@ -1761,32 +1733,23 @@ static inline void __run_timers(struct timer_base *base)
struct hlist_head heads[LVL_DEPTH];
int levels;
- if (!time_after_eq(jiffies, base->clk))
+ if (time_before(jiffies, base->next_expiry))
return;
timer_base_lock_expiry(base);
raw_spin_lock_irq(&base->lock);
- /*
- * timer_base::must_forward_clk must be cleared before running
- * timers so that any timer functions that call mod_timer() will
- * not try to forward the base. Idle tracking / clock forwarding
- * logic is only used with BASE_STD timers.
- *
- * The must_forward_clk flag is cleared unconditionally also for
- * the deferrable base. The deferrable base is not affected by idle
- * tracking and never forwarded, so clearing the flag is a NOOP.
- *
- * The fact that the deferrable base is never forwarded can cause
- * large variations in granularity for deferrable timers, but they
- * can be deferred for long periods due to idle anyway.
- */
- base->must_forward_clk = false;
-
- while (time_after_eq(jiffies, base->clk)) {
-
+ while (time_after_eq(jiffies, base->clk) &&
+ time_after_eq(jiffies, base->next_expiry)) {
levels = collect_expired_timers(base, heads);
+ /*
+ * The only possible reason for not finding any expired
+ * timer at this clk is that all matching timers have been
+ * dequeued.
+ */
+ WARN_ON_ONCE(!levels && !base->next_expiry_recalc);
base->clk++;
+ base->next_expiry = __next_timer_interrupt(base);
while (levels--)
expire_timers(base, heads + levels);
@@ -1816,12 +1779,12 @@ void run_local_timers(void)
hrtimer_run_queues();
/* Raise the softirq only if required. */
- if (time_before(jiffies, base->clk)) {
+ if (time_before(jiffies, base->next_expiry)) {
if (!IS_ENABLED(CONFIG_NO_HZ_COMMON))
return;
/* CPU is awake, so check the deferrable base. */
base++;
- if (time_before(jiffies, base->clk))
+ if (time_before(jiffies, base->next_expiry))
return;
}
raise_softirq(TIMER_SOFTIRQ);
@@ -1986,7 +1949,6 @@ int timers_prepare_cpu(unsigned int cpu)
base->clk = jiffies;
base->next_expiry = base->clk + NEXT_TIMER_MAX_DELTA;
base->is_idle = false;
- base->must_forward_clk = true;
}
return 0;
}
@@ -2039,6 +2001,7 @@ static void __init init_timer_cpu(int cpu)
base->cpu = cpu;
raw_spin_lock_init(&base->lock);
base->clk = jiffies;
+ base->next_expiry = base->clk + NEXT_TIMER_MAX_DELTA;
timer_base_init_expiry_lock(base);
}
}
@@ -2054,6 +2017,7 @@ static void __init init_timer_cpus(void)
void __init init_timers(void)
{
init_timer_cpus();
+ posix_cputimers_init_work();
open_softirq(TIMER_SOFTIRQ, run_timer_softirq);
}
diff --git a/kernel/time/vsyscall.c b/kernel/time/vsyscall.c
index 54ce6eb2ca36..88e6b8ed6ca5 100644
--- a/kernel/time/vsyscall.c
+++ b/kernel/time/vsyscall.c
@@ -13,6 +13,8 @@
#include <vdso/helpers.h>
#include <vdso/vsyscall.h>
+#include "timekeeping_internal.h"
+
static inline void update_vdso_data(struct vdso_data *vdata,
struct timekeeper *tk)
{
@@ -127,3 +129,42 @@ void update_vsyscall_tz(void)
__arch_sync_vdso_data(vdata);
}
+
+/**
+ * vdso_update_begin - Start of a VDSO update section
+ *
+ * Allows architecture code to safely update the architecture specific VDSO
+ * data. Disables interrupts, acquires timekeeper lock to serialize against
+ * concurrent updates from timekeeping and invalidates the VDSO data
+ * sequence counter to prevent concurrent readers from accessing
+ * inconsistent data.
+ *
+ * Returns: Saved interrupt flags which need to be handed in to
+ * vdso_update_end().
+ */
+unsigned long vdso_update_begin(void)
+{
+ struct vdso_data *vdata = __arch_get_k_vdso_data();
+ unsigned long flags;
+
+ raw_spin_lock_irqsave(&timekeeper_lock, flags);
+ vdso_write_begin(vdata);
+ return flags;
+}
+
+/**
+ * vdso_update_end - End of a VDSO update section
+ * @flags: Interrupt flags as returned from vdso_update_begin()
+ *
+ * Pairs with vdso_update_begin(). Marks vdso data consistent, invokes data
+ * synchronization if the architecture requires it, drops timekeeper lock
+ * and restores interrupt flags.
+ */
+void vdso_update_end(unsigned long flags)
+{
+ struct vdso_data *vdata = __arch_get_k_vdso_data();
+
+ vdso_write_end(vdata);
+ __arch_sync_vdso_data(vdata);
+ raw_spin_unlock_irqrestore(&timekeeper_lock, flags);
+}
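A hypothetical caller of the new helpers could look like the sketch below; the function name and the updated fields are placeholders, and the header placement is an assumption. Only vdso_update_begin(), vdso_update_end() and __arch_get_k_vdso_data() are taken from the code above.

#include <vdso/vsyscall.h>

/* Hypothetical architecture hook updating arch specific vDSO data. */
static void arch_update_vdso_example(void)
{
	struct vdso_data *vdata = __arch_get_k_vdso_data();
	unsigned long flags;

	flags = vdso_update_begin();
	/* ... update architecture specific vdso_data fields here ... */
	(void)vdata;
	vdso_update_end(flags);
}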
diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile
index 6575bb0a0434..e153be351548 100644
--- a/kernel/trace/Makefile
+++ b/kernel/trace/Makefile
@@ -2,9 +2,9 @@
# Do not instrument the tracer itself:
+ccflags-remove-$(CONFIG_FUNCTION_TRACER) += $(CC_FLAGS_FTRACE)
+
ifdef CONFIG_FUNCTION_TRACER
-ORIG_CFLAGS := $(KBUILD_CFLAGS)
-KBUILD_CFLAGS = $(subst $(CC_FLAGS_FTRACE),,$(ORIG_CFLAGS))
# Avoid recursion due to instrumentation.
KCSAN_SANITIZE := n
@@ -31,6 +31,8 @@ ifdef CONFIG_GCOV_PROFILE_FTRACE
GCOV_PROFILE := y
endif
+CFLAGS_bpf_trace.o := -I$(src)
+
CFLAGS_trace_benchmark.o := -I$(src)
CFLAGS_trace_events_filter.o := -I$(src)
diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
index 7bc3d6175868..a8d4f253ed77 100644
--- a/kernel/trace/bpf_trace.c
+++ b/kernel/trace/bpf_trace.c
@@ -11,14 +11,19 @@
#include <linux/uaccess.h>
#include <linux/ctype.h>
#include <linux/kprobes.h>
+#include <linux/spinlock.h>
#include <linux/syscalls.h>
#include <linux/error-injection.h>
+#include <linux/btf_ids.h>
#include <asm/tlb.h>
#include "trace_probe.h"
#include "trace.h"
+#define CREATE_TRACE_POINTS
+#include "bpf_trace.h"
+
#define bpf_event_rcu_dereference(p) \
rcu_dereference_protected(p, lockdep_is_held(&bpf_event_mutex))
@@ -374,9 +379,33 @@ static void bpf_trace_copy_string(char *buf, void *unsafe_ptr, char fmt_ptype,
}
}
+static DEFINE_RAW_SPINLOCK(trace_printk_lock);
+
+#define BPF_TRACE_PRINTK_SIZE 1024
+
+static __printf(1, 0) int bpf_do_trace_printk(const char *fmt, ...)
+{
+ static char buf[BPF_TRACE_PRINTK_SIZE];
+ unsigned long flags;
+ va_list ap;
+ int ret;
+
+ raw_spin_lock_irqsave(&trace_printk_lock, flags);
+ va_start(ap, fmt);
+ ret = vsnprintf(buf, sizeof(buf), fmt, ap);
+ va_end(ap);
+ /* vsnprintf() will not append null for zero-length strings */
+ if (ret == 0)
+ buf[0] = '\0';
+ trace_bpf_trace_printk(buf);
+ raw_spin_unlock_irqrestore(&trace_printk_lock, flags);
+
+ return ret;
+}
+
/*
* Only limited trace_printk() conversion specifiers allowed:
- * %d %i %u %x %ld %li %lu %lx %lld %lli %llu %llx %p %pks %pus %s
+ * %d %i %u %x %ld %li %lu %lx %lld %lli %llu %llx %p %pB %pks %pus %s
*/
BPF_CALL_5(bpf_trace_printk, char *, fmt, u32, fmt_size, u64, arg1,
u64, arg2, u64, arg3)
@@ -420,6 +449,11 @@ BPF_CALL_5(bpf_trace_printk, char *, fmt, u32, fmt_size, u64, arg1,
goto fmt_str;
}
+ if (fmt[i + 1] == 'B') {
+ i++;
+ goto fmt_next;
+ }
+
/* disallow any further format extensions */
if (fmt[i + 1] != 0 &&
!isspace(fmt[i + 1]) &&
@@ -478,8 +512,7 @@ fmt_next:
*/
#define __BPF_TP_EMIT() __BPF_ARG3_TP()
#define __BPF_TP(...) \
- __trace_printk(0 /* Fake ip */, \
- fmt, ##__VA_ARGS__)
+ bpf_do_trace_printk(fmt, ##__VA_ARGS__)
#define __BPF_ARG1_TP(...) \
((mod[0] == 2 || (mod[0] == 1 && __BITS_PER_LONG == 64)) \
@@ -516,10 +549,15 @@ static const struct bpf_func_proto bpf_trace_printk_proto = {
const struct bpf_func_proto *bpf_get_trace_printk_proto(void)
{
/*
- * this program might be calling bpf_trace_printk,
- * so allocate per-cpu printk buffers
+ * This program might be calling bpf_trace_printk,
+ * so enable the associated bpf_trace/bpf_trace_printk event.
+ * Repeat this each time as it is possible a user has
+ * disabled bpf_trace_printk events. By loading a program
+ * calling bpf_trace_printk() however the user has expressed
+ * the intent to see such events.
*/
- trace_printk_init_buffers();
+ if (trace_set_clr_event("bpf_trace", "bpf_trace_printk", 1))
+ pr_warn_ratelimited("could not enable bpf_trace_printk events");
return &bpf_trace_printk_proto;
}
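For context (not part of the patch), a minimal BPF program that exercises this path might look like the sketch below; the attach point and names are arbitrary, and only the bpf_trace_printk() helper plus the usual libbpf SEC()/license conventions are assumed. Once such a program is loaded, its output shows up under the bpf_trace/bpf_trace_printk event enabled above.

#include <linux/bpf.h>
#include <bpf/bpf_helpers.h>

char LICENSE[] SEC("license") = "GPL";

SEC("tracepoint/syscalls/sys_enter_execve")
int report_execve(void *ctx)
{
	char fmt[] = "execve on CPU %d\n";

	/* Output is routed to the bpf_trace_printk trace event. */
	bpf_trace_printk(fmt, sizeof(fmt), bpf_get_smp_processor_id());
	return 0;
}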
@@ -636,7 +674,8 @@ BPF_CALL_5(bpf_seq_printf, struct seq_file *, m, char *, fmt, u32, fmt_size,
if (fmt[i] == 'p') {
if (fmt[i + 1] == 0 ||
fmt[i + 1] == 'K' ||
- fmt[i + 1] == 'x') {
+ fmt[i + 1] == 'x' ||
+ fmt[i + 1] == 'B') {
/* just kernel pointers */
params[fmt_cnt] = args[fmt_cnt];
fmt_cnt++;
@@ -681,7 +720,8 @@ BPF_CALL_5(bpf_seq_printf, struct seq_file *, m, char *, fmt, u32, fmt_size,
}
if (fmt[i] != 'i' && fmt[i] != 'd' &&
- fmt[i] != 'u' && fmt[i] != 'x') {
+ fmt[i] != 'u' && fmt[i] != 'x' &&
+ fmt[i] != 'X') {
err = -EINVAL;
goto out;
}
@@ -703,7 +743,9 @@ out:
return err;
}
-static int bpf_seq_printf_btf_ids[5];
+BTF_ID_LIST(bpf_seq_printf_btf_ids)
+BTF_ID(struct, seq_file)
+
static const struct bpf_func_proto bpf_seq_printf_proto = {
.func = bpf_seq_printf,
.gpl_only = true,
@@ -721,7 +763,9 @@ BPF_CALL_3(bpf_seq_write, struct seq_file *, m, const void *, data, u32, len)
return seq_write(m, data, len) ? -EOVERFLOW : 0;
}
-static int bpf_seq_write_btf_ids[5];
+BTF_ID_LIST(bpf_seq_write_btf_ids)
+BTF_ID(struct, seq_file)
+
static const struct bpf_func_proto bpf_seq_write_proto = {
.func = bpf_seq_write,
.gpl_only = true,
@@ -1134,6 +1178,10 @@ bpf_tracing_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
return &bpf_ringbuf_discard_proto;
case BPF_FUNC_ringbuf_query:
return &bpf_ringbuf_query_proto;
+ case BPF_FUNC_jiffies64:
+ return &bpf_jiffies64_proto;
+ case BPF_FUNC_get_task_stack:
+ return &bpf_get_task_stack_proto;
default:
return NULL;
}
@@ -1363,9 +1411,9 @@ pe_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
case BPF_FUNC_perf_event_output:
return &bpf_perf_event_output_proto_tp;
case BPF_FUNC_get_stackid:
- return &bpf_get_stackid_proto_tp;
+ return &bpf_get_stackid_proto_pe;
case BPF_FUNC_get_stack:
- return &bpf_get_stack_proto_tp;
+ return &bpf_get_stack_proto_pe;
case BPF_FUNC_perf_prog_read_value:
return &bpf_perf_prog_read_value_proto;
case BPF_FUNC_read_branch_records:
@@ -1512,6 +1560,16 @@ tracing_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
return &bpf_skb_output_proto;
case BPF_FUNC_xdp_output:
return &bpf_xdp_output_proto;
+ case BPF_FUNC_skc_to_tcp6_sock:
+ return &bpf_skc_to_tcp6_sock_proto;
+ case BPF_FUNC_skc_to_tcp_sock:
+ return &bpf_skc_to_tcp_sock_proto;
+ case BPF_FUNC_skc_to_tcp_timewait_sock:
+ return &bpf_skc_to_tcp_timewait_sock_proto;
+ case BPF_FUNC_skc_to_tcp_request_sock:
+ return &bpf_skc_to_tcp_request_sock_proto;
+ case BPF_FUNC_skc_to_udp6_sock:
+ return &bpf_skc_to_udp6_sock_proto;
#endif
case BPF_FUNC_seq_printf:
return prog->expected_attach_type == BPF_TRACE_ITER ?
diff --git a/kernel/trace/bpf_trace.h b/kernel/trace/bpf_trace.h
new file mode 100644
index 000000000000..9acbc11ac7bb
--- /dev/null
+++ b/kernel/trace/bpf_trace.h
@@ -0,0 +1,34 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#undef TRACE_SYSTEM
+#define TRACE_SYSTEM bpf_trace
+
+#if !defined(_TRACE_BPF_TRACE_H) || defined(TRACE_HEADER_MULTI_READ)
+
+#define _TRACE_BPF_TRACE_H
+
+#include <linux/tracepoint.h>
+
+TRACE_EVENT(bpf_trace_printk,
+
+ TP_PROTO(const char *bpf_string),
+
+ TP_ARGS(bpf_string),
+
+ TP_STRUCT__entry(
+ __string(bpf_string, bpf_string)
+ ),
+
+ TP_fast_assign(
+ __assign_str(bpf_string, bpf_string);
+ ),
+
+ TP_printk("%s", __get_str(bpf_string))
+);
+
+#endif /* _TRACE_BPF_TRACE_H */
+
+#undef TRACE_INCLUDE_PATH
+#define TRACE_INCLUDE_PATH .
+#define TRACE_INCLUDE_FILE bpf_trace
+
+#include <trace/define_trace.h>
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index 72064541bef2..275441254bb5 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -139,9 +139,6 @@ static inline void ftrace_ops_init(struct ftrace_ops *ops)
#endif
}
-#define FTRACE_PID_IGNORE -1
-#define FTRACE_PID_TRACE -2
-
static void ftrace_pid_func(unsigned long ip, unsigned long parent_ip,
struct ftrace_ops *op, struct pt_regs *regs)
{
@@ -2388,6 +2385,14 @@ struct ftrace_ops direct_ops = {
.flags = FTRACE_OPS_FL_IPMODIFY | FTRACE_OPS_FL_RECURSION_SAFE
| FTRACE_OPS_FL_DIRECT | FTRACE_OPS_FL_SAVE_REGS
| FTRACE_OPS_FL_PERMANENT,
+ /*
+ * By declaring the main trampoline as this ops' trampoline,
+ * it will never have one allocated for it. Allocated
+ * trampolines should not call direct functions.
+ * The direct_ops should only be called by the builtin
+ * ftrace_regs_caller trampoline.
+ */
+ .trampoline = FTRACE_REGS_ADDR,
};
#endif /* CONFIG_DYNAMIC_FTRACE_WITH_DIRECT_CALLS */
@@ -6255,8 +6260,19 @@ static int referenced_filters(struct dyn_ftrace *rec)
int cnt = 0;
for (ops = ftrace_ops_list; ops != &ftrace_list_end; ops = ops->next) {
- if (ops_references_rec(ops, rec))
- cnt++;
+ if (ops_references_rec(ops, rec)) {
+ if (WARN_ON_ONCE(ops->flags & FTRACE_OPS_FL_DIRECT))
+ continue;
+ if (WARN_ON_ONCE(ops->flags & FTRACE_OPS_FL_IPMODIFY))
+ continue;
+ cnt++;
+ if (ops->flags & FTRACE_OPS_FL_SAVE_REGS)
+ rec->flags |= FTRACE_FL_REGS;
+ if (cnt == 1 && ops->trampoline)
+ rec->flags |= FTRACE_FL_TRAMP;
+ else
+ rec->flags &= ~FTRACE_FL_TRAMP;
+ }
}
return cnt;
@@ -6435,8 +6451,8 @@ void ftrace_module_enable(struct module *mod)
if (ftrace_start_up)
cnt += referenced_filters(rec);
- /* This clears FTRACE_FL_DISABLED */
- rec->flags = cnt;
+ rec->flags &= ~FTRACE_FL_DISABLED;
+ rec->flags += cnt;
if (ftrace_start_up && cnt) {
int failed = __ftrace_replace_code(rec, 1);
@@ -7066,12 +7082,12 @@ void ftrace_pid_follow_fork(struct trace_array *tr, bool enable)
if (enable) {
register_trace_sched_process_fork(ftrace_pid_follow_sched_process_fork,
tr);
- register_trace_sched_process_exit(ftrace_pid_follow_sched_process_exit,
+ register_trace_sched_process_free(ftrace_pid_follow_sched_process_exit,
tr);
} else {
unregister_trace_sched_process_fork(ftrace_pid_follow_sched_process_fork,
tr);
- unregister_trace_sched_process_exit(ftrace_pid_follow_sched_process_exit,
+ unregister_trace_sched_process_free(ftrace_pid_follow_sched_process_exit,
tr);
}
}
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index 00867ff82412..93ef0ab6ea20 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -270,6 +270,9 @@ EXPORT_SYMBOL_GPL(ring_buffer_event_data);
#define for_each_buffer_cpu(buffer, cpu) \
for_each_cpu(cpu, buffer->cpumask)
+#define for_each_online_buffer_cpu(buffer, cpu) \
+ for_each_cpu_and(cpu, buffer->cpumask, cpu_online_mask)
+
#define TS_SHIFT 27
#define TS_MASK ((1ULL << TS_SHIFT) - 1)
#define TS_DELTA_TEST (~TS_MASK)
@@ -413,12 +416,27 @@ struct rb_irq_work {
struct rb_event_info {
u64 ts;
u64 delta;
+ u64 before;
+ u64 after;
unsigned long length;
struct buffer_page *tail_page;
int add_timestamp;
};
/*
+ * Values for the add_timestamp field:
+ * NONE - no extra time stamp needed
+ * EXTEND - wants a time extend
+ * ABSOLUTE - the buffer requests all events to have absolute time stamps
+ * FORCE - force a full time stamp.
+ */
+enum {
+ RB_ADD_STAMP_NONE = 0,
+ RB_ADD_STAMP_EXTEND = BIT(1),
+ RB_ADD_STAMP_ABSOLUTE = BIT(2),
+ RB_ADD_STAMP_FORCE = BIT(3)
+};
+/*
* Used for which event context the event is in.
* NMI = 0
* IRQ = 1
@@ -435,6 +453,28 @@ enum {
RB_CTX_MAX
};
+#if BITS_PER_LONG == 32
+#define RB_TIME_32
+#endif
+
+/* To test on 64 bit machines */
+//#define RB_TIME_32
+
+#ifdef RB_TIME_32
+
+struct rb_time_struct {
+ local_t cnt;
+ local_t top;
+ local_t bottom;
+};
+#else
+#include <asm/local64.h>
+struct rb_time_struct {
+ local64_t time;
+};
+#endif
+typedef struct rb_time_struct rb_time_t;
+
/*
* head_page == tail_page && head == tail then buffer is empty.
*/
@@ -470,7 +510,8 @@ struct ring_buffer_per_cpu {
size_t shortest_full;
unsigned long read;
unsigned long read_bytes;
- u64 write_stamp;
+ rb_time_t write_stamp;
+ rb_time_t before_stamp;
u64 read_stamp;
/* ring buffer pages to update, > 0 to add, < 0 to remove */
long nr_pages_to_update;
@@ -513,6 +554,189 @@ struct ring_buffer_iter {
int missed_events;
};
+#ifdef RB_TIME_32
+
+/*
+ * On 32 bit machines, local64_t is very expensive. As the ring
+ * buffer doesn't need all the features of a true 64 bit atomic,
+ * on 32 bit, it uses these functions (64 bit still uses local64_t).
+ *
+ * For the ring buffer, the 64 bit operations required for the
+ * time stamp are the following:
+ *
+ * - Only need 59 bits (uses 60 to make it even).
+ * - A read may fail if it interrupted a modification of the time stamp.
+ * It will succeed if it did not interrupt another write even if
+ * the read itself is interrupted by a write.
+ * It returns whether it was successful or not.
+ *
+ * - Writes always succeed and will overwrite other writes and writes
+ * that were done by events interrupting the current write.
+ *
+ * - A write followed by a read of the same time stamp will always succeed,
+ * but may not contain the same value.
+ *
+ * - A cmpxchg will fail if it interrupted another write or cmpxchg.
+ * Other than that, it acts like a normal cmpxchg.
+ *
+ * The 60 bit time stamp is broken up into 30 bit top and bottom halves
+ * (bottom being the least significant 30 bits of the 60 bit time stamp).
+ *
+ * The two most significant bits of each half hold a 2 bit counter (0-3).
+ * Each update will increment this counter by one.
+ * When reading the top and bottom, if the two counter bits match then the
+ * top and bottom together make a valid 60 bit number.
+ */
+#define RB_TIME_SHIFT 30
+#define RB_TIME_VAL_MASK ((1 << RB_TIME_SHIFT) - 1)
+
+static inline int rb_time_cnt(unsigned long val)
+{
+ return (val >> RB_TIME_SHIFT) & 3;
+}
+
+static inline u64 rb_time_val(unsigned long top, unsigned long bottom)
+{
+ u64 val;
+
+ val = top & RB_TIME_VAL_MASK;
+ val <<= RB_TIME_SHIFT;
+ val |= bottom & RB_TIME_VAL_MASK;
+
+ return val;
+}
+
+static inline bool __rb_time_read(rb_time_t *t, u64 *ret, unsigned long *cnt)
+{
+ unsigned long top, bottom;
+ unsigned long c;
+
+ /*
+ * If the read is interrupted by a write, then the cnt will
+ * be different. Loop until both top and bottom have been read
+ * without interruption.
+ */
+ do {
+ c = local_read(&t->cnt);
+ top = local_read(&t->top);
+ bottom = local_read(&t->bottom);
+ } while (c != local_read(&t->cnt));
+
+ *cnt = rb_time_cnt(top);
+
+ /* If top and bottom counts don't match, this interrupted a write */
+ if (*cnt != rb_time_cnt(bottom))
+ return false;
+
+ *ret = rb_time_val(top, bottom);
+ return true;
+}
+
+static bool rb_time_read(rb_time_t *t, u64 *ret)
+{
+ unsigned long cnt;
+
+ return __rb_time_read(t, ret, &cnt);
+}
+
+static inline unsigned long rb_time_val_cnt(unsigned long val, unsigned long cnt)
+{
+ return (val & RB_TIME_VAL_MASK) | ((cnt & 3) << RB_TIME_SHIFT);
+}
+
+static inline void rb_time_split(u64 val, unsigned long *top, unsigned long *bottom)
+{
+ *top = (unsigned long)((val >> RB_TIME_SHIFT) & RB_TIME_VAL_MASK);
+ *bottom = (unsigned long)(val & RB_TIME_VAL_MASK);
+}
+
+static inline void rb_time_val_set(local_t *t, unsigned long val, unsigned long cnt)
+{
+ val = rb_time_val_cnt(val, cnt);
+ local_set(t, val);
+}
+
+static void rb_time_set(rb_time_t *t, u64 val)
+{
+ unsigned long cnt, top, bottom;
+
+ rb_time_split(val, &top, &bottom);
+
+ /* Writes always succeed with a valid number even if it gets interrupted. */
+ do {
+ cnt = local_inc_return(&t->cnt);
+ rb_time_val_set(&t->top, top, cnt);
+ rb_time_val_set(&t->bottom, bottom, cnt);
+ } while (cnt != local_read(&t->cnt));
+}
+
+static inline bool
+rb_time_read_cmpxchg(local_t *l, unsigned long expect, unsigned long set)
+{
+ unsigned long ret;
+
+ ret = local_cmpxchg(l, expect, set);
+ return ret == expect;
+}
+
+static int rb_time_cmpxchg(rb_time_t *t, u64 expect, u64 set)
+{
+ unsigned long cnt, top, bottom;
+ unsigned long cnt2, top2, bottom2;
+ u64 val;
+
+ /* The cmpxchg always fails if it interrupted an update */
+ if (!__rb_time_read(t, &val, &cnt2))
+ return false;
+
+ if (val != expect)
+ return false;
+
+ cnt = local_read(&t->cnt);
+ if ((cnt & 3) != cnt2)
+ return false;
+
+ cnt2 = cnt + 1;
+
+ rb_time_split(val, &top, &bottom);
+ top = rb_time_val_cnt(top, cnt);
+ bottom = rb_time_val_cnt(bottom, cnt);
+
+ rb_time_split(set, &top2, &bottom2);
+ top2 = rb_time_val_cnt(top2, cnt2);
+ bottom2 = rb_time_val_cnt(bottom2, cnt2);
+
+ if (!rb_time_read_cmpxchg(&t->cnt, cnt, cnt2))
+ return false;
+ if (!rb_time_read_cmpxchg(&t->top, top, top2))
+ return false;
+ if (!rb_time_read_cmpxchg(&t->bottom, bottom, bottom2))
+ return false;
+ return true;
+}
+
+#else /* 64 bits */
+
+/* local64_t always succeeds */
+
+static inline bool rb_time_read(rb_time_t *t, u64 *ret)
+{
+ *ret = local64_read(&t->time);
+ return true;
+}
+static void rb_time_set(rb_time_t *t, u64 val)
+{
+ local64_set(&t->time, val);
+}
+
+static bool rb_time_cmpxchg(rb_time_t *t, u64 expect, u64 set)
+{
+ u64 val;
+ val = local64_cmpxchg(&t->time, expect, set);
+ return val == expect;
+}
+#endif
+
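The arithmetic behind the 30/30 bit split can be played with in user space. The sketch below is an illustration only: it leaves out the local_t fields and the separate cnt word of the real implementation and just shows how a 60 bit value is split into two counter-tagged halves and only reassembled when both counters match.

#include <stdio.h>
#include <stdint.h>
#include <stdbool.h>

#define RB_TIME_SHIFT		30
#define RB_TIME_VAL_MASK	((1UL << RB_TIME_SHIFT) - 1)

/* Tag both 30 bit halves of a 60 bit value with the same 2 bit counter. */
static void split(uint64_t val, unsigned long cnt,
		  unsigned long *top, unsigned long *bottom)
{
	*top = ((val >> RB_TIME_SHIFT) & RB_TIME_VAL_MASK) |
	       ((cnt & 3) << RB_TIME_SHIFT);
	*bottom = (val & RB_TIME_VAL_MASK) | ((cnt & 3) << RB_TIME_SHIFT);
}

/* Reassemble only if the counters match, i.e. no update was interrupted. */
static bool join(unsigned long top, unsigned long bottom, uint64_t *val)
{
	if ((top >> RB_TIME_SHIFT) != (bottom >> RB_TIME_SHIFT))
		return false;

	*val = ((uint64_t)(top & RB_TIME_VAL_MASK) << RB_TIME_SHIFT) |
	       (bottom & RB_TIME_VAL_MASK);
	return true;
}

int main(void)
{
	uint64_t ts = 123456789012345ULL & ((1ULL << 60) - 1), out;
	unsigned long top, bottom;

	split(ts, 2, &top, &bottom);
	if (join(top, bottom, &out))
		printf("reassembled %llu (match: %d)\n",
		       (unsigned long long)out, out == ts);
	return 0;
}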
/**
* ring_buffer_nr_pages - get the number of buffer pages in the ring buffer
* @buffer: The ring_buffer to get the number of pages from
@@ -577,7 +801,7 @@ static void rb_wake_up_waiters(struct irq_work *work)
*/
int ring_buffer_wait(struct trace_buffer *buffer, int cpu, int full)
{
- struct ring_buffer_per_cpu *uninitialized_var(cpu_buffer);
+ struct ring_buffer_per_cpu *cpu_buffer;
DEFINE_WAIT(wait);
struct rb_irq_work *work;
int ret = 0;
@@ -746,8 +970,16 @@ __poll_t ring_buffer_poll_wait(struct trace_buffer *buffer, int cpu,
static inline u64 rb_time_stamp(struct trace_buffer *buffer)
{
+ u64 ts;
+
+ /* Skip retpolines :-( */
+ if (IS_ENABLED(CONFIG_RETPOLINE) && likely(buffer->clock == trace_clock_local))
+ ts = trace_clock_local();
+ else
+ ts = buffer->clock();
+
/* shift to debug/test normalization and TIME_EXTENTS */
- return buffer->clock() << DEBUG_SHIFT;
+ return ts << DEBUG_SHIFT;
}
u64 ring_buffer_time_stamp(struct trace_buffer *buffer, int cpu)
@@ -2372,8 +2604,8 @@ rb_move_tail(struct ring_buffer_per_cpu *cpu_buffer,
return NULL;
}
-/* Slow path, do not inline */
-static noinline struct ring_buffer_event *
+/* Slow path */
+static struct ring_buffer_event *
rb_add_time_stamp(struct ring_buffer_event *event, u64 delta, bool abs)
{
if (abs)
@@ -2397,6 +2629,66 @@ rb_add_time_stamp(struct ring_buffer_event *event, u64 delta, bool abs)
static inline bool rb_event_is_commit(struct ring_buffer_per_cpu *cpu_buffer,
struct ring_buffer_event *event);
+#ifndef CONFIG_HAVE_UNSTABLE_SCHED_CLOCK
+static inline bool sched_clock_stable(void)
+{
+ return true;
+}
+#endif
+
+static void
+rb_check_timestamp(struct ring_buffer_per_cpu *cpu_buffer,
+ struct rb_event_info *info)
+{
+ u64 write_stamp;
+
+ WARN_ONCE(1, "Delta way too big! %llu ts=%llu before=%llu after=%llu write stamp=%llu\n%s",
+ (unsigned long long)info->delta,
+ (unsigned long long)info->ts,
+ (unsigned long long)info->before,
+ (unsigned long long)info->after,
+ (unsigned long long)(rb_time_read(&cpu_buffer->write_stamp, &write_stamp) ? write_stamp : 0),
+ sched_clock_stable() ? "" :
+ "If you just came from a suspend/resume,\n"
+ "please switch to the trace global clock:\n"
+ " echo global > /sys/kernel/debug/tracing/trace_clock\n"
+ "or add trace_clock=global to the kernel command line\n");
+}
+
+static void rb_add_timestamp(struct ring_buffer_per_cpu *cpu_buffer,
+ struct ring_buffer_event **event,
+ struct rb_event_info *info,
+ u64 *delta,
+ unsigned int *length)
+{
+ bool abs = info->add_timestamp &
+ (RB_ADD_STAMP_FORCE | RB_ADD_STAMP_ABSOLUTE);
+
+ if (unlikely(info->delta > (1ULL << 59))) {
+ /* did the clock go backwards */
+ if (info->before == info->after && info->before > info->ts) {
+ /* not interrupted */
+ static int once;
+
+ /*
+ * This is possible with a recalibrating of the TSC.
+ * Do not produce a call stack, but just report it.
+ */
+ if (!once) {
+ once++;
+ pr_warn("Ring buffer clock went backwards: %llu -> %llu\n",
+ info->before, info->ts);
+ }
+ } else
+ rb_check_timestamp(cpu_buffer, info);
+ if (!abs)
+ info->delta = 0;
+ }
+ *event = rb_add_time_stamp(*event, info->delta, abs);
+ *length -= RB_LEN_TIME_EXTEND;
+ *delta = 0;
+}
+
/**
* rb_update_event - update event type and data
* @cpu_buffer: The per cpu buffer of the @event
@@ -2416,21 +2708,12 @@ rb_update_event(struct ring_buffer_per_cpu *cpu_buffer,
unsigned length = info->length;
u64 delta = info->delta;
- /* Only a commit updates the timestamp */
- if (unlikely(!rb_event_is_commit(cpu_buffer, event)))
- delta = 0;
-
/*
* If we need to add a timestamp, then we
* add it to the start of the reserved space.
*/
- if (unlikely(info->add_timestamp)) {
- bool abs = ring_buffer_time_stamp_abs(cpu_buffer->buffer);
-
- event = rb_add_time_stamp(event, abs ? info->delta : delta, abs);
- length -= RB_LEN_TIME_EXTEND;
- delta = 0;
- }
+ if (unlikely(info->add_timestamp))
+ rb_add_timestamp(cpu_buffer, &event, info, &delta, &length);
event->time_delta = delta;
length -= RB_EVNT_HDR_SIZE;
@@ -2473,12 +2756,38 @@ static unsigned rb_calculate_event_length(unsigned length)
return length;
}
-#ifndef CONFIG_HAVE_UNSTABLE_SCHED_CLOCK
-static inline bool sched_clock_stable(void)
+static __always_inline bool
+rb_event_is_commit(struct ring_buffer_per_cpu *cpu_buffer,
+ struct ring_buffer_event *event)
{
- return true;
+ unsigned long addr = (unsigned long)event;
+ unsigned long index;
+
+ index = rb_event_index(event);
+ addr &= PAGE_MASK;
+
+ return cpu_buffer->commit_page->page == (void *)addr &&
+ rb_commit_index(cpu_buffer) == index;
+}
+
+static u64 rb_time_delta(struct ring_buffer_event *event)
+{
+ switch (event->type_len) {
+ case RINGBUF_TYPE_PADDING:
+ return 0;
+
+ case RINGBUF_TYPE_TIME_EXTEND:
+ return ring_buffer_event_time_stamp(event);
+
+ case RINGBUF_TYPE_TIME_STAMP:
+ return 0;
+
+ case RINGBUF_TYPE_DATA:
+ return event->time_delta;
+ default:
+ return 0;
+ }
}
-#endif
static inline int
rb_try_to_discard(struct ring_buffer_per_cpu *cpu_buffer,
@@ -2488,6 +2797,8 @@ rb_try_to_discard(struct ring_buffer_per_cpu *cpu_buffer,
struct buffer_page *bpage;
unsigned long index;
unsigned long addr;
+ u64 write_stamp;
+ u64 delta;
new_index = rb_event_index(event);
old_index = new_index + rb_event_ts_length(event);
@@ -2496,10 +2807,32 @@ rb_try_to_discard(struct ring_buffer_per_cpu *cpu_buffer,
bpage = READ_ONCE(cpu_buffer->tail_page);
+ delta = rb_time_delta(event);
+
+ if (!rb_time_read(&cpu_buffer->write_stamp, &write_stamp))
+ return 0;
+
+ /* Make sure the write stamp is read before testing the location */
+ barrier();
+
if (bpage->page == (void *)addr && rb_page_write(bpage) == old_index) {
unsigned long write_mask =
local_read(&bpage->write) & ~RB_WRITE_MASK;
unsigned long event_length = rb_event_length(event);
+
+ /* Something came in, can't discard */
+ if (!rb_time_cmpxchg(&cpu_buffer->write_stamp,
+ write_stamp, write_stamp - delta))
+ return 0;
+
+ /*
+ * If an event were to come in now, it would see that the
+ * write_stamp and the before_stamp are different, and assume
+ * that this event just added itself before updating
+ * the write stamp. The interrupting event will fix the
+ * write stamp for us, and use the before stamp as its delta.
+ */
+
/*
* This is on the tail page. It is possible that
* a write could come in and move the tail page
@@ -2551,10 +2884,6 @@ rb_set_commit_to_write(struct ring_buffer_per_cpu *cpu_buffer)
local_set(&cpu_buffer->commit_page->page->commit,
rb_page_write(cpu_buffer->commit_page));
rb_inc_page(cpu_buffer, &cpu_buffer->commit_page);
- /* Only update the write stamp if the page has an event */
- if (rb_page_write(cpu_buffer->commit_page))
- cpu_buffer->write_stamp =
- cpu_buffer->commit_page->page->time_stamp;
/* add barrier to keep gcc from optimizing too much */
barrier();
}
@@ -2626,54 +2955,10 @@ static inline void rb_event_discard(struct ring_buffer_event *event)
event->time_delta = 1;
}
-static __always_inline bool
-rb_event_is_commit(struct ring_buffer_per_cpu *cpu_buffer,
- struct ring_buffer_event *event)
-{
- unsigned long addr = (unsigned long)event;
- unsigned long index;
-
- index = rb_event_index(event);
- addr &= PAGE_MASK;
-
- return cpu_buffer->commit_page->page == (void *)addr &&
- rb_commit_index(cpu_buffer) == index;
-}
-
-static __always_inline void
-rb_update_write_stamp(struct ring_buffer_per_cpu *cpu_buffer,
- struct ring_buffer_event *event)
-{
- u64 delta;
-
- /*
- * The event first in the commit queue updates the
- * time stamp.
- */
- if (rb_event_is_commit(cpu_buffer, event)) {
- /*
- * A commit event that is first on a page
- * updates the write timestamp with the page stamp
- */
- if (!rb_event_index(event))
- cpu_buffer->write_stamp =
- cpu_buffer->commit_page->page->time_stamp;
- else if (event->type_len == RINGBUF_TYPE_TIME_EXTEND) {
- delta = ring_buffer_event_time_stamp(event);
- cpu_buffer->write_stamp += delta;
- } else if (event->type_len == RINGBUF_TYPE_TIME_STAMP) {
- delta = ring_buffer_event_time_stamp(event);
- cpu_buffer->write_stamp = delta;
- } else
- cpu_buffer->write_stamp += event->time_delta;
- }
-}
-
static void rb_commit(struct ring_buffer_per_cpu *cpu_buffer,
struct ring_buffer_event *event)
{
local_inc(&cpu_buffer->entries);
- rb_update_write_stamp(cpu_buffer, event);
rb_end_commit(cpu_buffer);
}
@@ -2864,58 +3149,138 @@ int ring_buffer_unlock_commit(struct trace_buffer *buffer,
}
EXPORT_SYMBOL_GPL(ring_buffer_unlock_commit);
-static noinline void
-rb_handle_timestamp(struct ring_buffer_per_cpu *cpu_buffer,
- struct rb_event_info *info)
-{
- WARN_ONCE(info->delta > (1ULL << 59),
- KERN_WARNING "Delta way too big! %llu ts=%llu write stamp = %llu\n%s",
- (unsigned long long)info->delta,
- (unsigned long long)info->ts,
- (unsigned long long)cpu_buffer->write_stamp,
- sched_clock_stable() ? "" :
- "If you just came from a suspend/resume,\n"
- "please switch to the trace global clock:\n"
- " echo global > /sys/kernel/debug/tracing/trace_clock\n"
- "or add trace_clock=global to the kernel command line\n");
- info->add_timestamp = 1;
-}
-
static struct ring_buffer_event *
__rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer,
struct rb_event_info *info)
{
struct ring_buffer_event *event;
struct buffer_page *tail_page;
- unsigned long tail, write;
-
- /*
- * If the time delta since the last event is too big to
- * hold in the time field of the event, then we append a
- * TIME EXTEND event ahead of the data event.
- */
- if (unlikely(info->add_timestamp))
- info->length += RB_LEN_TIME_EXTEND;
+ unsigned long tail, write, w;
+ bool a_ok;
+ bool b_ok;
/* Don't let the compiler play games with cpu_buffer->tail_page */
tail_page = info->tail_page = READ_ONCE(cpu_buffer->tail_page);
- write = local_add_return(info->length, &tail_page->write);
+
+ /*A*/ w = local_read(&tail_page->write) & RB_WRITE_MASK;
+ barrier();
+ b_ok = rb_time_read(&cpu_buffer->before_stamp, &info->before);
+ a_ok = rb_time_read(&cpu_buffer->write_stamp, &info->after);
+ barrier();
+ info->ts = rb_time_stamp(cpu_buffer->buffer);
+
+ if ((info->add_timestamp & RB_ADD_STAMP_ABSOLUTE)) {
+ info->delta = info->ts;
+ } else {
+ /*
+ * If interrupting an event time update, we may need an
+ * absolute timestamp.
+ * Don't bother if this is the start of a new page (w == 0).
+ */
+ if (unlikely(!a_ok || !b_ok || (info->before != info->after && w))) {
+ info->add_timestamp |= RB_ADD_STAMP_FORCE | RB_ADD_STAMP_EXTEND;
+ info->length += RB_LEN_TIME_EXTEND;
+ } else {
+ info->delta = info->ts - info->after;
+ if (unlikely(test_time_stamp(info->delta))) {
+ info->add_timestamp |= RB_ADD_STAMP_EXTEND;
+ info->length += RB_LEN_TIME_EXTEND;
+ }
+ }
+ }
+
+ /*B*/ rb_time_set(&cpu_buffer->before_stamp, info->ts);
+
+ /*C*/ write = local_add_return(info->length, &tail_page->write);
/* set write to only the index of the write */
write &= RB_WRITE_MASK;
+
tail = write - info->length;
+ /* See if we shot past the end of this buffer page */
+ if (unlikely(write > BUF_PAGE_SIZE)) {
+ if (tail != w) {
+ /* before and after may now be different, fix it up */
+ b_ok = rb_time_read(&cpu_buffer->before_stamp, &info->before);
+ a_ok = rb_time_read(&cpu_buffer->write_stamp, &info->after);
+ if (a_ok && b_ok && info->before != info->after)
+ (void)rb_time_cmpxchg(&cpu_buffer->before_stamp,
+ info->before, info->after);
+ }
+ return rb_move_tail(cpu_buffer, tail, info);
+ }
+
+ if (likely(tail == w)) {
+ u64 save_before;
+ bool s_ok;
+
+ /* Nothing interrupted us between A and C */
+ /*D*/ rb_time_set(&cpu_buffer->write_stamp, info->ts);
+ barrier();
+ /*E*/ s_ok = rb_time_read(&cpu_buffer->before_stamp, &save_before);
+ RB_WARN_ON(cpu_buffer, !s_ok);
+ if (likely(!(info->add_timestamp &
+ (RB_ADD_STAMP_FORCE | RB_ADD_STAMP_ABSOLUTE))))
+ /* This did not interrupt any time update */
+ info->delta = info->ts - info->after;
+ else
+ /* Just use full timestamp for interrupting event */
+ info->delta = info->ts;
+ barrier();
+ if (unlikely(info->ts != save_before)) {
+ /* SLOW PATH - Interrupted between C and E */
+
+ a_ok = rb_time_read(&cpu_buffer->write_stamp, &info->after);
+ RB_WARN_ON(cpu_buffer, !a_ok);
+
+ /* Write stamp must only go forward */
+ if (save_before > info->after) {
+ /*
+ * We do not care about the result, only that
+ * it gets updated atomically.
+ */
+ (void)rb_time_cmpxchg(&cpu_buffer->write_stamp,
+ info->after, save_before);
+ }
+ }
+ } else {
+ u64 ts;
+ /* SLOW PATH - Interrupted between A and C */
+ a_ok = rb_time_read(&cpu_buffer->write_stamp, &info->after);
+ /* Was interrupted before here, write_stamp must be valid */
+ RB_WARN_ON(cpu_buffer, !a_ok);
+ ts = rb_time_stamp(cpu_buffer->buffer);
+ barrier();
+ /*E*/ if (write == (local_read(&tail_page->write) & RB_WRITE_MASK) &&
+ info->after < ts) {
+ /* Nothing came after this event between C and E */
+ info->delta = ts - info->after;
+ (void)rb_time_cmpxchg(&cpu_buffer->write_stamp,
+ info->after, info->ts);
+ info->ts = ts;
+ } else {
+ /*
+ * Interrupted between C and E:
+ * Lost the previous event's time stamp. Just set the
+ * delta to zero, and this will be the same time as
+ * the event this event interrupted. And the events that
+ * came after this will still be correct (as they would
+ * have built their delta on the previous event).
+ */
+ info->delta = 0;
+ }
+ info->add_timestamp &= ~RB_ADD_STAMP_FORCE;
+ }
+
/*
* If this is the first commit on the page, then it has the same
* timestamp as the page itself.
*/
- if (!tail && !ring_buffer_time_stamp_abs(cpu_buffer->buffer))
+ if (unlikely(!tail && !(info->add_timestamp &
+ (RB_ADD_STAMP_FORCE | RB_ADD_STAMP_ABSOLUTE))))
info->delta = 0;
- /* See if we shot pass the end of this buffer page */
- if (unlikely(write > BUF_PAGE_SIZE))
- return rb_move_tail(cpu_buffer, tail, info);
-
/* We reserved something on the buffer */
event = __rb_page_index(tail_page, tail);
@@ -2927,7 +3292,7 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer,
* If this is the first commit on the page, then update
* its timestamp.
*/
- if (!tail)
+ if (unlikely(!tail))
tail_page->page->time_stamp = info->ts;
/* account for these added bytes */
@@ -2944,9 +3309,10 @@ rb_reserve_next_event(struct trace_buffer *buffer,
struct ring_buffer_event *event;
struct rb_event_info info;
int nr_loops = 0;
- u64 diff;
+ int add_ts_default;
rb_start_commit(cpu_buffer);
+ /* The commit page can not change after this */
#ifdef CONFIG_RING_BUFFER_ALLOW_SWAP
/*
@@ -2964,8 +3330,16 @@ rb_reserve_next_event(struct trace_buffer *buffer,
#endif
info.length = rb_calculate_event_length(length);
+
+ if (ring_buffer_time_stamp_abs(cpu_buffer->buffer)) {
+ add_ts_default = RB_ADD_STAMP_ABSOLUTE;
+ info.length += RB_LEN_TIME_EXTEND;
+ } else {
+ add_ts_default = RB_ADD_STAMP_NONE;
+ }
+
again:
- info.add_timestamp = 0;
+ info.add_timestamp = add_ts_default;
info.delta = 0;
/*
@@ -2980,35 +3354,16 @@ rb_reserve_next_event(struct trace_buffer *buffer,
if (RB_WARN_ON(cpu_buffer, ++nr_loops > 1000))
goto out_fail;
- info.ts = rb_time_stamp(cpu_buffer->buffer);
- diff = info.ts - cpu_buffer->write_stamp;
-
- /* make sure this diff is calculated here */
- barrier();
-
- if (ring_buffer_time_stamp_abs(buffer)) {
- info.delta = info.ts;
- rb_handle_timestamp(cpu_buffer, &info);
- } else /* Did the write stamp get updated already? */
- if (likely(info.ts >= cpu_buffer->write_stamp)) {
- info.delta = diff;
- if (unlikely(test_time_stamp(info.delta)))
- rb_handle_timestamp(cpu_buffer, &info);
- }
-
event = __rb_reserve_next(cpu_buffer, &info);
if (unlikely(PTR_ERR(event) == -EAGAIN)) {
- if (info.add_timestamp)
+ if (info.add_timestamp & (RB_ADD_STAMP_FORCE | RB_ADD_STAMP_EXTEND))
info.length -= RB_LEN_TIME_EXTEND;
goto again;
}
- if (!event)
- goto out_fail;
-
- return event;
-
+ if (likely(event))
+ return event;
out_fail:
rb_end_commit(cpu_buffer);
return NULL;
@@ -3154,11 +3509,6 @@ void ring_buffer_discard_commit(struct trace_buffer *buffer,
if (rb_try_to_discard(cpu_buffer, event))
goto out;
- /*
- * The commit is still visible by the reader, so we
- * must still update the timestamp.
- */
- rb_update_write_stamp(cpu_buffer, event);
out:
rb_end_commit(cpu_buffer);
@@ -4475,8 +4825,8 @@ rb_reset_cpu(struct ring_buffer_per_cpu *cpu_buffer)
cpu_buffer->read = 0;
cpu_buffer->read_bytes = 0;
- cpu_buffer->write_stamp = 0;
- cpu_buffer->read_stamp = 0;
+ rb_time_set(&cpu_buffer->write_stamp, 0);
+ rb_time_set(&cpu_buffer->before_stamp, 0);
cpu_buffer->lost_events = 0;
cpu_buffer->last_overrun = 0;
@@ -4484,6 +4834,26 @@ rb_reset_cpu(struct ring_buffer_per_cpu *cpu_buffer)
rb_head_page_activate(cpu_buffer);
}
+/* Must have disabled the cpu buffer then done a synchronize_rcu */
+static void reset_disabled_cpu_buffer(struct ring_buffer_per_cpu *cpu_buffer)
+{
+ unsigned long flags;
+
+ raw_spin_lock_irqsave(&cpu_buffer->reader_lock, flags);
+
+ if (RB_WARN_ON(cpu_buffer, local_read(&cpu_buffer->committing)))
+ goto out;
+
+ arch_spin_lock(&cpu_buffer->lock);
+
+ rb_reset_cpu(cpu_buffer);
+
+ arch_spin_unlock(&cpu_buffer->lock);
+
+ out:
+ raw_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags);
+}
+
/**
* ring_buffer_reset_cpu - reset a ring buffer per CPU buffer
* @buffer: The ring buffer to reset a per cpu buffer of
@@ -4492,7 +4862,6 @@ rb_reset_cpu(struct ring_buffer_per_cpu *cpu_buffer)
void ring_buffer_reset_cpu(struct trace_buffer *buffer, int cpu)
{
struct ring_buffer_per_cpu *cpu_buffer = buffer->buffers[cpu];
- unsigned long flags;
if (!cpumask_test_cpu(cpu, buffer->cpumask))
return;
@@ -4503,24 +4872,42 @@ void ring_buffer_reset_cpu(struct trace_buffer *buffer, int cpu)
/* Make sure all commits have finished */
synchronize_rcu();
- raw_spin_lock_irqsave(&cpu_buffer->reader_lock, flags);
+ reset_disabled_cpu_buffer(cpu_buffer);
- if (RB_WARN_ON(cpu_buffer, local_read(&cpu_buffer->committing)))
- goto out;
+ atomic_dec(&cpu_buffer->record_disabled);
+ atomic_dec(&cpu_buffer->resize_disabled);
+}
+EXPORT_SYMBOL_GPL(ring_buffer_reset_cpu);
- arch_spin_lock(&cpu_buffer->lock);
+/**
+ * ring_buffer_reset_online_cpus - reset all online per CPU buffers of a ring buffer
+ * @buffer: The ring buffer whose online per CPU buffers will be reset
+ */
+void ring_buffer_reset_online_cpus(struct trace_buffer *buffer)
+{
+ struct ring_buffer_per_cpu *cpu_buffer;
+ int cpu;
- rb_reset_cpu(cpu_buffer);
+ for_each_online_buffer_cpu(buffer, cpu) {
+ cpu_buffer = buffer->buffers[cpu];
- arch_spin_unlock(&cpu_buffer->lock);
+ atomic_inc(&cpu_buffer->resize_disabled);
+ atomic_inc(&cpu_buffer->record_disabled);
+ }
- out:
- raw_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags);
+ /* Make sure all commits have finished */
+ synchronize_rcu();
- atomic_dec(&cpu_buffer->record_disabled);
- atomic_dec(&cpu_buffer->resize_disabled);
+ for_each_online_buffer_cpu(buffer, cpu) {
+ cpu_buffer = buffer->buffers[cpu];
+
+ reset_disabled_cpu_buffer(cpu_buffer);
+
+ atomic_dec(&cpu_buffer->record_disabled);
+ atomic_dec(&cpu_buffer->resize_disabled);
+ }
}
-EXPORT_SYMBOL_GPL(ring_buffer_reset_cpu);
/**
* ring_buffer_reset - reset a ring buffer
@@ -4528,10 +4915,27 @@ EXPORT_SYMBOL_GPL(ring_buffer_reset_cpu);
*/
void ring_buffer_reset(struct trace_buffer *buffer)
{
+ struct ring_buffer_per_cpu *cpu_buffer;
int cpu;
- for_each_buffer_cpu(buffer, cpu)
- ring_buffer_reset_cpu(buffer, cpu);
+ for_each_buffer_cpu(buffer, cpu) {
+ cpu_buffer = buffer->buffers[cpu];
+
+ atomic_inc(&cpu_buffer->resize_disabled);
+ atomic_inc(&cpu_buffer->record_disabled);
+ }
+
+ /* Make sure all commits have finished */
+ synchronize_rcu();
+
+ for_each_buffer_cpu(buffer, cpu) {
+ cpu_buffer = buffer->buffers[cpu];
+
+ reset_disabled_cpu_buffer(cpu_buffer);
+
+ atomic_dec(&cpu_buffer->record_disabled);
+ atomic_dec(&cpu_buffer->resize_disabled);
+ }
}
EXPORT_SYMBOL_GPL(ring_buffer_reset);
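
For orientation, here is a hedged caller-side sketch of the new bulk reset; the example_clear_all_online() wrapper is hypothetical and simply mirrors how tracing_reset_online_cpus() in trace.c (further down in this diff) is expected to use ring_buffer_reset_online_cpus().

/*
 * Illustrative sketch only, not part of this diff. Assumes "buffer" is
 * a valid struct trace_buffer. The point of the new API is that the
 * disable / synchronize_rcu() / reset sequence runs once for all
 * online CPUs instead of once per CPU.
 */
#include <linux/ring_buffer.h>

static void example_clear_all_online(struct trace_buffer *buffer)
{
	/* Keep new writers out of the buffers while they are wiped */
	ring_buffer_record_disable(buffer);

	ring_buffer_reset_online_cpus(buffer);

	ring_buffer_record_enable(buffer);
}
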
diff --git a/kernel/trace/ring_buffer_benchmark.c b/kernel/trace/ring_buffer_benchmark.c
index 8df0aa810950..78e576575b79 100644
--- a/kernel/trace/ring_buffer_benchmark.c
+++ b/kernel/trace/ring_buffer_benchmark.c
@@ -45,8 +45,8 @@ MODULE_PARM_DESC(write_iteration, "# of writes between timestamp readings");
static int producer_nice = MAX_NICE;
static int consumer_nice = MAX_NICE;
-static int producer_fifo = -1;
-static int consumer_fifo = -1;
+static int producer_fifo;
+static int consumer_fifo;
module_param(producer_nice, int, 0644);
MODULE_PARM_DESC(producer_nice, "nice prio for producer");
@@ -55,10 +55,10 @@ module_param(consumer_nice, int, 0644);
MODULE_PARM_DESC(consumer_nice, "nice prio for consumer");
module_param(producer_fifo, int, 0644);
-MODULE_PARM_DESC(producer_fifo, "fifo prio for producer");
+MODULE_PARM_DESC(producer_fifo, "use fifo for producer: 0 - disabled, 1 - low prio, 2 - fifo");
module_param(consumer_fifo, int, 0644);
-MODULE_PARM_DESC(consumer_fifo, "fifo prio for consumer");
+MODULE_PARM_DESC(consumer_fifo, "use fifo for consumer: 0 - disabled, 1 - low prio, 2 - fifo");
static int read_events;
@@ -303,22 +303,22 @@ static void ring_buffer_producer(void)
trace_printk("ERROR!\n");
if (!disable_reader) {
- if (consumer_fifo < 0)
+ if (consumer_fifo)
+ trace_printk("Running Consumer at SCHED_FIFO %s\n",
+ consumer_fifo == 1 ? "low" : "high");
+ else
trace_printk("Running Consumer at nice: %d\n",
consumer_nice);
- else
- trace_printk("Running Consumer at SCHED_FIFO %d\n",
- consumer_fifo);
}
- if (producer_fifo < 0)
+ if (producer_fifo)
+ trace_printk("Running Producer at SCHED_FIFO %s\n",
+ producer_fifo == 1 ? "low" : "high");
+ else
trace_printk("Running Producer at nice: %d\n",
producer_nice);
- else
- trace_printk("Running Producer at SCHED_FIFO %d\n",
- producer_fifo);
/* Let the user know that the test is running at low priority */
- if (producer_fifo < 0 && consumer_fifo < 0 &&
+ if (!producer_fifo && !consumer_fifo &&
producer_nice == MAX_NICE && consumer_nice == MAX_NICE)
trace_printk("WARNING!!! This test is running at lowest priority.\n");
@@ -455,21 +455,19 @@ static int __init ring_buffer_benchmark_init(void)
* Run them as low-prio background tasks by default:
*/
if (!disable_reader) {
- if (consumer_fifo >= 0) {
- struct sched_param param = {
- .sched_priority = consumer_fifo
- };
- sched_setscheduler(consumer, SCHED_FIFO, &param);
- } else
+ if (consumer_fifo >= 2)
+ sched_set_fifo(consumer);
+ else if (consumer_fifo == 1)
+ sched_set_fifo_low(consumer);
+ else
set_user_nice(consumer, consumer_nice);
}
- if (producer_fifo >= 0) {
- struct sched_param param = {
- .sched_priority = producer_fifo
- };
- sched_setscheduler(producer, SCHED_FIFO, &param);
- } else
+ if (producer_fifo >= 2)
+ sched_set_fifo(producer);
+ else if (producer_fifo == 1)
+ sched_set_fifo_low(producer);
+ else
set_user_nice(producer, producer_nice);
return 0;
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index bb62269724d5..f40d850ebabc 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -1543,8 +1543,7 @@ static void latency_fsnotify_workfn(struct work_struct *work)
{
struct trace_array *tr = container_of(work, struct trace_array,
fsnotify_work);
- fsnotify(tr->d_max_latency->d_inode, FS_MODIFY,
- tr->d_max_latency->d_inode, FSNOTIFY_EVENT_INODE, NULL, 0);
+ fsnotify_inode(tr->d_max_latency->d_inode, FS_MODIFY);
}
static void latency_fsnotify_workfn_irq(struct irq_work *iwork)
@@ -2003,7 +2002,6 @@ static void tracing_reset_cpu(struct array_buffer *buf, int cpu)
void tracing_reset_online_cpus(struct array_buffer *buf)
{
struct trace_buffer *buffer = buf->buffer;
- int cpu;
if (!buffer)
return;
@@ -2015,8 +2013,7 @@ void tracing_reset_online_cpus(struct array_buffer *buf)
buf->time_start = buffer_ftrace_now(buf, buf->cpu);
- for_each_online_cpu(cpu)
- ring_buffer_reset_cpu(buffer, cpu);
+ ring_buffer_reset_online_cpus(buffer);
ring_buffer_record_enable(buffer);
}
@@ -2932,12 +2929,6 @@ static void __ftrace_trace_stack(struct trace_buffer *buffer,
skip++;
#endif
- /*
- * Since events can happen in NMIs there's no safe way to
- * use the per cpu ftrace_stacks. We reserve it and if an interrupt
- * or NMI comes in, it will just have to use the default
- * FTRACE_STACK_SIZE.
- */
preempt_disable_notrace();
stackidx = __this_cpu_inc_return(ftrace_stack_reserve) - 1;
@@ -3137,6 +3128,9 @@ static int alloc_percpu_trace_buffer(void)
{
struct trace_buffer_struct *buffers;
+ if (trace_percpu_buffer)
+ return 0;
+
buffers = alloc_percpu(struct trace_buffer_struct);
if (MEM_FAIL(!buffers, "Could not allocate percpu trace_printk buffer"))
return -ENOMEM;
@@ -3339,6 +3333,26 @@ int trace_array_vprintk(struct trace_array *tr,
return __trace_array_vprintk(tr->array_buffer.buffer, ip, fmt, args);
}
+/**
+ * trace_array_printk - Print a message to a specific instance
+ * @tr: The instance trace_array descriptor
+ * @ip: The instruction pointer that this is called from.
+ * @fmt: The format to print (printf format)
+ *
+ * If a subsystem sets up its own instance, it is free to printk
+ * strings into its tracing instance buffer using this function.
+ * Note, this function will not write into the top level buffer
+ * (use trace_printk() for that), as the top level buffer should
+ * only contain events that can be individually disabled.
+ * trace_printk() is only used for debugging a kernel, and should
+ * never be incorporated into normal use.
+ *
+ * trace_array_printk() can be used, as it will not add noise to the
+ * top level tracing buffer.
+ *
+ * Note, trace_array_init_printk() must be called on @tr before this
+ * can be used.
+ */
__printf(3, 0)
int trace_array_printk(struct trace_array *tr,
unsigned long ip, const char *fmt, ...)
@@ -3346,12 +3360,16 @@ int trace_array_printk(struct trace_array *tr,
int ret;
va_list ap;
- if (!(global_trace.trace_flags & TRACE_ITER_PRINTK))
- return 0;
-
if (!tr)
return -ENOENT;
+ /* This is only allowed for created instances */
+ if (tr == &global_trace)
+ return 0;
+
+ if (!(tr->trace_flags & TRACE_ITER_PRINTK))
+ return 0;
+
va_start(ap, fmt);
ret = trace_array_vprintk(tr, ip, fmt, ap);
va_end(ap);
@@ -3359,6 +3377,27 @@ int trace_array_printk(struct trace_array *tr,
}
EXPORT_SYMBOL_GPL(trace_array_printk);
+/**
+ * trace_array_init_printk - Initialize buffers for trace_array_printk()
+ * @tr: The trace array to initialize the buffers for
+ *
+ * As trace_array_printk() only writes into instances, calls to it are
+ * acceptable in normal kernel code (unlike trace_printk()). This needs
+ * to be called before trace_array_printk() can be used on a trace_array.
+ */
+int trace_array_init_printk(struct trace_array *tr)
+{
+ if (!tr)
+ return -ENOENT;
+
+ /* This is only allowed for created instances */
+ if (tr == &global_trace)
+ return -EINVAL;
+
+ return alloc_percpu_trace_buffer();
+}
+EXPORT_SYMBOL_GPL(trace_array_init_printk);
+
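
As a usage illustration for the two exports above, a hedged sketch of the required call order follows; the my_tr instance and the my_subsys_start_tracing() hook are hypothetical, and the instance is assumed to have been created elsewhere (e.g. via trace_array_get_by_name(), which is not part of this diff).

/*
 * Illustrative sketch only, not part of this diff. "my_tr" stands in
 * for an instance the subsystem owns; obtaining it is outside the
 * scope of these hunks.
 */
static struct trace_array *my_tr;

static int my_subsys_start_tracing(void)
{
	int ret;

	if (!my_tr)
		return -ENOENT;

	/* Allocate the percpu printk buffers for this instance first */
	ret = trace_array_init_printk(my_tr);
	if (ret)
		return ret;

	/* Writes land in the instance buffer, never the top level buffer */
	trace_array_printk(my_tr, _THIS_IP_, "instance tracing ready\n");
	return 0;
}
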
__printf(3, 4)
int trace_array_printk_buf(struct trace_buffer *buffer,
unsigned long ip, const char *fmt, ...)
@@ -5887,7 +5926,7 @@ int tracing_set_tracer(struct trace_array *tr, const char *buf)
}
/* If trace pipe files are being read, we can't change the tracer */
- if (tr->current_trace->ref) {
+ if (tr->trace_ref) {
ret = -EBUSY;
goto out;
}
@@ -6103,7 +6142,7 @@ static int tracing_open_pipe(struct inode *inode, struct file *filp)
nonseekable_open(inode, filp);
- tr->current_trace->ref++;
+ tr->trace_ref++;
out:
mutex_unlock(&trace_types_lock);
return ret;
@@ -6122,7 +6161,7 @@ static int tracing_release_pipe(struct inode *inode, struct file *file)
mutex_lock(&trace_types_lock);
- tr->current_trace->ref--;
+ tr->trace_ref--;
if (iter->trace->pipe_close)
iter->trace->pipe_close(iter);
@@ -7406,7 +7445,7 @@ static int tracing_buffers_open(struct inode *inode, struct file *filp)
if (ret)
return ret;
- info = kzalloc(sizeof(*info), GFP_KERNEL);
+ info = kvzalloc(sizeof(*info), GFP_KERNEL);
if (!info) {
trace_array_put(tr);
return -ENOMEM;
@@ -7424,7 +7463,7 @@ static int tracing_buffers_open(struct inode *inode, struct file *filp)
filp->private_data = info;
- tr->current_trace->ref++;
+ tr->trace_ref++;
mutex_unlock(&trace_types_lock);
@@ -7525,14 +7564,14 @@ static int tracing_buffers_release(struct inode *inode, struct file *file)
mutex_lock(&trace_types_lock);
- iter->tr->current_trace->ref--;
+ iter->tr->trace_ref--;
__trace_array_put(iter->tr);
if (info->spare)
ring_buffer_free_read_page(iter->array_buffer->buffer,
info->spare_cpu, info->spare);
- kfree(info);
+ kvfree(info);
mutex_unlock(&trace_types_lock);
@@ -8733,7 +8772,7 @@ static int __remove_instance(struct trace_array *tr)
int i;
/* Reference counter for a newly created trace array = 1. */
- if (tr->ref > 1 || (tr->current_trace && tr->current_trace->ref))
+ if (tr->ref > 1 || (tr->current_trace && tr->trace_ref))
return -EBUSY;
list_del(&tr->list);
@@ -8945,9 +8984,7 @@ struct dentry *tracing_init_dentry(void)
if (tr->dir)
return NULL;
- if (WARN_ON(!tracefs_initialized()) ||
- (IS_ENABLED(CONFIG_DEBUG_FS) &&
- WARN_ON(!debugfs_initialized())))
+ if (WARN_ON(!tracefs_initialized()))
return ERR_PTR(-ENODEV);
/*
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index 13db4000af3f..610d21355526 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -356,6 +356,7 @@ struct trace_array {
struct trace_event_file *trace_marker_file;
cpumask_var_t tracing_cpumask; /* only trace on set CPUs */
int ref;
+ int trace_ref;
#ifdef CONFIG_FUNCTION_TRACER
struct ftrace_ops *ops;
struct trace_pid_list __rcu *function_pids;
@@ -547,7 +548,6 @@ struct tracer {
struct tracer *next;
struct tracer_flags *flags;
int enabled;
- int ref;
bool print_max;
bool allow_instances;
#ifdef CONFIG_TRACER_MAX_TRACE
@@ -1103,6 +1103,10 @@ print_graph_function_flags(struct trace_iterator *iter, u32 flags)
extern struct list_head ftrace_pids;
#ifdef CONFIG_FUNCTION_TRACER
+
+#define FTRACE_PID_IGNORE -1
+#define FTRACE_PID_TRACE -2
+
struct ftrace_func_command {
struct list_head list;
char *name;
@@ -1114,7 +1118,8 @@ struct ftrace_func_command {
extern bool ftrace_filter_param __initdata;
static inline int ftrace_trace_task(struct trace_array *tr)
{
- return !this_cpu_read(tr->array_buffer.data->ftrace_ignore_pid);
+ return this_cpu_read(tr->array_buffer.data->ftrace_ignore_pid) !=
+ FTRACE_PID_IGNORE;
}
extern int ftrace_is_dead(void);
int ftrace_create_function_files(struct trace_array *tr,
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index f6f55682d3e2..a85effb2373b 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -538,12 +538,12 @@ void trace_event_follow_fork(struct trace_array *tr, bool enable)
if (enable) {
register_trace_prio_sched_process_fork(event_filter_pid_sched_process_fork,
tr, INT_MIN);
- register_trace_prio_sched_process_exit(event_filter_pid_sched_process_exit,
+ register_trace_prio_sched_process_free(event_filter_pid_sched_process_exit,
tr, INT_MAX);
} else {
unregister_trace_sched_process_fork(event_filter_pid_sched_process_fork,
tr);
- unregister_trace_sched_process_exit(event_filter_pid_sched_process_exit,
+ unregister_trace_sched_process_free(event_filter_pid_sched_process_exit,
tr);
}
}
diff --git a/kernel/trace/trace_hwlat.c b/kernel/trace/trace_hwlat.c
index e2be7bb7ef7e..17873e5d0353 100644
--- a/kernel/trace/trace_hwlat.c
+++ b/kernel/trace/trace_hwlat.c
@@ -283,6 +283,7 @@ static bool disable_migrate;
static void move_to_next_cpu(void)
{
struct cpumask *current_mask = &save_cpumask;
+ struct trace_array *tr = hwlat_trace;
int next_cpu;
if (disable_migrate)
@@ -296,7 +297,7 @@ static void move_to_next_cpu(void)
goto disable;
get_online_cpus();
- cpumask_and(current_mask, cpu_online_mask, tracing_buffer_mask);
+ cpumask_and(current_mask, cpu_online_mask, tr->tracing_cpumask);
next_cpu = cpumask_next(smp_processor_id(), current_mask);
put_online_cpus();
@@ -371,9 +372,8 @@ static int start_kthread(struct trace_array *tr)
return 0;
/* Just pick the first CPU on first iteration */
- current_mask = &save_cpumask;
get_online_cpus();
- cpumask_and(current_mask, cpu_online_mask, tracing_buffer_mask);
+ cpumask_and(current_mask, cpu_online_mask, tr->tracing_cpumask);
put_online_cpus();
next_cpu = cpumask_first(current_mask);
diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c
index 73976de7f8cc..4d1893564912 100644
--- a/kernel/trace/trace_output.c
+++ b/kernel/trace/trace_output.c
@@ -20,7 +20,7 @@ DECLARE_RWSEM(trace_event_sem);
static struct hlist_head event_hash[EVENT_HASHSIZE] __read_mostly;
-static int next_event_type = __TRACE_LAST_TYPE + 1;
+static int next_event_type = __TRACE_LAST_TYPE;
enum print_line_t trace_print_bputs_msg_only(struct trace_iterator *iter)
{
@@ -675,11 +675,11 @@ static LIST_HEAD(ftrace_event_list);
static int trace_search_list(struct list_head **list)
{
struct trace_event *e;
- int last = __TRACE_LAST_TYPE;
+ int next = __TRACE_LAST_TYPE;
if (list_empty(&ftrace_event_list)) {
*list = &ftrace_event_list;
- return last + 1;
+ return next;
}
/*
@@ -687,17 +687,17 @@ static int trace_search_list(struct list_head **list)
* lets see if somebody freed one.
*/
list_for_each_entry(e, &ftrace_event_list, list) {
- if (e->type != last + 1)
+ if (e->type != next)
break;
- last++;
+ next++;
}
/* Did we use up all 65 thousand events??? */
- if ((last + 1) > TRACE_EVENT_TYPE_MAX)
+ if (next > TRACE_EVENT_TYPE_MAX)
return 0;
*list = &e->list;
- return last + 1;
+ return next;
}
void trace_event_read_lock(void)
diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c
index fdd47f99b18f..f4286c9bdeb4 100644
--- a/kernel/trace/trace_uprobe.c
+++ b/kernel/trace/trace_uprobe.c
@@ -1456,7 +1456,6 @@ trace_uprobe_register(struct trace_event_call *event, enum trace_reg type,
default:
return 0;
}
- return 0;
}
static int uprobe_dispatcher(struct uprobe_consumer *con, struct pt_regs *regs)
diff --git a/kernel/umh.c b/kernel/umh.c
index 79f139a7ca03..fcf3ee803630 100644
--- a/kernel/umh.c
+++ b/kernel/umh.c
@@ -26,8 +26,6 @@
#include <linux/ptrace.h>
#include <linux/async.h>
#include <linux/uaccess.h>
-#include <linux/shmem_fs.h>
-#include <linux/pipe_fs_i.h>
#include <trace/events/module.h>
@@ -38,8 +36,6 @@ static kernel_cap_t usermodehelper_bset = CAP_FULL_SET;
static kernel_cap_t usermodehelper_inheritable = CAP_FULL_SET;
static DEFINE_SPINLOCK(umh_sysctl_lock);
static DECLARE_RWSEM(umhelper_sem);
-static LIST_HEAD(umh_list);
-static DEFINE_MUTEX(umh_list_lock);
static void call_usermodehelper_freeinfo(struct subprocess_info *info)
{
@@ -102,16 +98,9 @@ static int call_usermodehelper_exec_async(void *data)
commit_creds(new);
- sub_info->pid = task_pid_nr(current);
- if (sub_info->file) {
- retval = do_execve_file(sub_info->file,
- sub_info->argv, sub_info->envp);
- if (!retval)
- current->flags |= PF_UMH;
- } else
- retval = do_execve(getname_kernel(sub_info->path),
- (const char __user *const __user *)sub_info->argv,
- (const char __user *const __user *)sub_info->envp);
+ retval = kernel_execve(sub_info->path,
+ (const char *const *)sub_info->argv,
+ (const char *const *)sub_info->envp);
out:
sub_info->retval = retval;
/*
@@ -130,37 +119,16 @@ static void call_usermodehelper_exec_sync(struct subprocess_info *sub_info)
{
pid_t pid;
- /* If SIGCLD is ignored kernel_wait4 won't populate the status. */
+ /* If SIGCLD is ignored do_wait won't populate the status. */
kernel_sigaction(SIGCHLD, SIG_DFL);
pid = kernel_thread(call_usermodehelper_exec_async, sub_info, SIGCHLD);
- if (pid < 0) {
+ if (pid < 0)
sub_info->retval = pid;
- } else {
- int ret = -ECHILD;
- /*
- * Normally it is bogus to call wait4() from in-kernel because
- * wait4() wants to write the exit code to a userspace address.
- * But call_usermodehelper_exec_sync() always runs as kernel
- * thread (workqueue) and put_user() to a kernel address works
- * OK for kernel threads, due to their having an mm_segment_t
- * which spans the entire address space.
- *
- * Thus the __user pointer cast is valid here.
- */
- kernel_wait4(pid, (int __user *)&ret, 0, NULL);
-
- /*
- * If ret is 0, either call_usermodehelper_exec_async failed and
- * the real error code is already in sub_info->retval or
- * sub_info->retval is 0 anyway, so don't mess with it then.
- */
- if (ret)
- sub_info->retval = ret;
- }
+ else
+ kernel_wait(pid, &sub_info->retval);
/* Restore default kernel sig handler */
kernel_sigaction(SIGCHLD, SIG_IGN);
-
umh_complete(sub_info);
}
@@ -405,140 +373,6 @@ struct subprocess_info *call_usermodehelper_setup(const char *path, char **argv,
}
EXPORT_SYMBOL(call_usermodehelper_setup);
-struct subprocess_info *call_usermodehelper_setup_file(struct file *file,
- int (*init)(struct subprocess_info *info, struct cred *new),
- void (*cleanup)(struct subprocess_info *info), void *data)
-{
- struct subprocess_info *sub_info;
- struct umh_info *info = data;
- const char *cmdline = (info->cmdline) ? info->cmdline : "usermodehelper";
-
- sub_info = kzalloc(sizeof(struct subprocess_info), GFP_KERNEL);
- if (!sub_info)
- return NULL;
-
- sub_info->argv = argv_split(GFP_KERNEL, cmdline, NULL);
- if (!sub_info->argv) {
- kfree(sub_info);
- return NULL;
- }
-
- INIT_WORK(&sub_info->work, call_usermodehelper_exec_work);
- sub_info->path = "none";
- sub_info->file = file;
- sub_info->init = init;
- sub_info->cleanup = cleanup;
- sub_info->data = data;
- return sub_info;
-}
-
-static int umh_pipe_setup(struct subprocess_info *info, struct cred *new)
-{
- struct umh_info *umh_info = info->data;
- struct file *from_umh[2];
- struct file *to_umh[2];
- int err;
-
- /* create pipe to send data to umh */
- err = create_pipe_files(to_umh, 0);
- if (err)
- return err;
- err = replace_fd(0, to_umh[0], 0);
- fput(to_umh[0]);
- if (err < 0) {
- fput(to_umh[1]);
- return err;
- }
-
- /* create pipe to receive data from umh */
- err = create_pipe_files(from_umh, 0);
- if (err) {
- fput(to_umh[1]);
- replace_fd(0, NULL, 0);
- return err;
- }
- err = replace_fd(1, from_umh[1], 0);
- fput(from_umh[1]);
- if (err < 0) {
- fput(to_umh[1]);
- replace_fd(0, NULL, 0);
- fput(from_umh[0]);
- return err;
- }
-
- umh_info->pipe_to_umh = to_umh[1];
- umh_info->pipe_from_umh = from_umh[0];
- return 0;
-}
-
-static void umh_clean_and_save_pid(struct subprocess_info *info)
-{
- struct umh_info *umh_info = info->data;
-
- /* cleanup if umh_pipe_setup() was successful but exec failed */
- if (info->pid && info->retval) {
- fput(umh_info->pipe_to_umh);
- fput(umh_info->pipe_from_umh);
- }
-
- argv_free(info->argv);
- umh_info->pid = info->pid;
-}
-
-/**
- * fork_usermode_blob - fork a blob of bytes as a usermode process
- * @data: a blob of bytes that can be do_execv-ed as a file
- * @len: length of the blob
- * @info: information about usermode process (shouldn't be NULL)
- *
- * If info->cmdline is set it will be used as command line for the
- * user process, else "usermodehelper" is used.
- *
- * Returns either negative error or zero which indicates success
- * in executing a blob of bytes as a usermode process. In such
- * case 'struct umh_info *info' is populated with two pipes
- * and a pid of the process. The caller is responsible for health
- * check of the user process, killing it via pid, and closing the
- * pipes when user process is no longer needed.
- */
-int fork_usermode_blob(void *data, size_t len, struct umh_info *info)
-{
- struct subprocess_info *sub_info;
- struct file *file;
- ssize_t written;
- loff_t pos = 0;
- int err;
-
- file = shmem_kernel_file_setup("", len, 0);
- if (IS_ERR(file))
- return PTR_ERR(file);
-
- written = kernel_write(file, data, len, &pos);
- if (written != len) {
- err = written;
- if (err >= 0)
- err = -ENOMEM;
- goto out;
- }
-
- err = -ENOMEM;
- sub_info = call_usermodehelper_setup_file(file, umh_pipe_setup,
- umh_clean_and_save_pid, info);
- if (!sub_info)
- goto out;
-
- err = call_usermodehelper_exec(sub_info, UMH_WAIT_EXEC);
- if (!err) {
- mutex_lock(&umh_list_lock);
- list_add(&info->list, &umh_list);
- mutex_unlock(&umh_list_lock);
- }
-out:
- fput(file);
- return err;
-}
-EXPORT_SYMBOL_GPL(fork_usermode_blob);
-
/**
* call_usermodehelper_exec - start a usermode application
* @sub_info: information about the subprocess
@@ -700,26 +534,6 @@ static int proc_cap_handler(struct ctl_table *table, int write,
return 0;
}
-void __exit_umh(struct task_struct *tsk)
-{
- struct umh_info *info;
- pid_t pid = tsk->pid;
-
- mutex_lock(&umh_list_lock);
- list_for_each_entry(info, &umh_list, list) {
- if (info->pid == pid) {
- list_del(&info->list);
- mutex_unlock(&umh_list_lock);
- goto out;
- }
- }
- mutex_unlock(&umh_list_lock);
- return;
-out:
- if (info->cleanup)
- info->cleanup(info);
-}
-
struct ctl_table usermodehelper_table[] = {
{
.procname = "bset",
diff --git a/kernel/usermode_driver.c b/kernel/usermode_driver.c
new file mode 100644
index 000000000000..0b35212ffc3d
--- /dev/null
+++ b/kernel/usermode_driver.c
@@ -0,0 +1,182 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * umd - User mode driver support
+ */
+#include <linux/shmem_fs.h>
+#include <linux/pipe_fs_i.h>
+#include <linux/mount.h>
+#include <linux/fs_struct.h>
+#include <linux/task_work.h>
+#include <linux/usermode_driver.h>
+
+static struct vfsmount *blob_to_mnt(const void *data, size_t len, const char *name)
+{
+ struct file_system_type *type;
+ struct vfsmount *mnt;
+ struct file *file;
+ ssize_t written;
+ loff_t pos = 0;
+
+ type = get_fs_type("tmpfs");
+ if (!type)
+ return ERR_PTR(-ENODEV);
+
+ mnt = kern_mount(type);
+ put_filesystem(type);
+ if (IS_ERR(mnt))
+ return mnt;
+
+ file = file_open_root(mnt->mnt_root, mnt, name, O_CREAT | O_WRONLY, 0700);
+ if (IS_ERR(file)) {
+ mntput(mnt);
+ return ERR_CAST(file);
+ }
+
+ written = kernel_write(file, data, len, &pos);
+ if (written != len) {
+ int err = written;
+ if (err >= 0)
+ err = -ENOMEM;
+ filp_close(file, NULL);
+ mntput(mnt);
+ return ERR_PTR(err);
+ }
+
+ fput(file);
+
+ /* Flush delayed fput so exec can open the file read-only */
+ flush_delayed_fput();
+ task_work_run();
+ return mnt;
+}
+
+/**
+ * umd_load_blob - Remember a blob of bytes for fork_usermode_driver
+ * @info: information about usermode driver
+ * @data: a blob of bytes that can be executed as a file
+ * @len: The length of the blob
+ *
+ */
+int umd_load_blob(struct umd_info *info, const void *data, size_t len)
+{
+ struct vfsmount *mnt;
+
+ if (WARN_ON_ONCE(info->wd.dentry || info->wd.mnt))
+ return -EBUSY;
+
+ mnt = blob_to_mnt(data, len, info->driver_name);
+ if (IS_ERR(mnt))
+ return PTR_ERR(mnt);
+
+ info->wd.mnt = mnt;
+ info->wd.dentry = mnt->mnt_root;
+ return 0;
+}
+EXPORT_SYMBOL_GPL(umd_load_blob);
+
+/**
+ * umd_unload_blob - Disassociate @info from a previously loaded blob
+ * @info: information about usermode driver
+ *
+ */
+int umd_unload_blob(struct umd_info *info)
+{
+ if (WARN_ON_ONCE(!info->wd.mnt ||
+ !info->wd.dentry ||
+ info->wd.mnt->mnt_root != info->wd.dentry))
+ return -EINVAL;
+
+ kern_unmount(info->wd.mnt);
+ info->wd.mnt = NULL;
+ info->wd.dentry = NULL;
+ return 0;
+}
+EXPORT_SYMBOL_GPL(umd_unload_blob);
+
+static int umd_setup(struct subprocess_info *info, struct cred *new)
+{
+ struct umd_info *umd_info = info->data;
+ struct file *from_umh[2];
+ struct file *to_umh[2];
+ int err;
+
+ /* create pipe to send data to umh */
+ err = create_pipe_files(to_umh, 0);
+ if (err)
+ return err;
+ err = replace_fd(0, to_umh[0], 0);
+ fput(to_umh[0]);
+ if (err < 0) {
+ fput(to_umh[1]);
+ return err;
+ }
+
+ /* create pipe to receive data from umh */
+ err = create_pipe_files(from_umh, 0);
+ if (err) {
+ fput(to_umh[1]);
+ replace_fd(0, NULL, 0);
+ return err;
+ }
+ err = replace_fd(1, from_umh[1], 0);
+ fput(from_umh[1]);
+ if (err < 0) {
+ fput(to_umh[1]);
+ replace_fd(0, NULL, 0);
+ fput(from_umh[0]);
+ return err;
+ }
+
+ set_fs_pwd(current->fs, &umd_info->wd);
+ umd_info->pipe_to_umh = to_umh[1];
+ umd_info->pipe_from_umh = from_umh[0];
+ umd_info->tgid = get_pid(task_tgid(current));
+ return 0;
+}
+
+static void umd_cleanup(struct subprocess_info *info)
+{
+ struct umd_info *umd_info = info->data;
+
+ /* cleanup if umd_setup() was successful but exec failed */
+ if (info->retval) {
+ fput(umd_info->pipe_to_umh);
+ fput(umd_info->pipe_from_umh);
+ put_pid(umd_info->tgid);
+ umd_info->tgid = NULL;
+ }
+}
+
+/**
+ * fork_usermode_driver - fork a usermode driver
+ * @info: information about usermode driver (shouldn't be NULL)
+ *
+ * Returns zero on success, in which case 'struct umd_info *info' is
+ * populated with two pipes and the tgid of the new process, or a
+ * negative error code. The caller is responsible for health checking
+ * the user process, killing it via the tgid, and closing the pipes
+ * when the user process is no longer needed.
+ */
+int fork_usermode_driver(struct umd_info *info)
+{
+ struct subprocess_info *sub_info;
+ const char *argv[] = { info->driver_name, NULL };
+ int err;
+
+ if (WARN_ON_ONCE(info->tgid))
+ return -EBUSY;
+
+ err = -ENOMEM;
+ sub_info = call_usermodehelper_setup(info->driver_name,
+ (char **)argv, NULL, GFP_KERNEL,
+ umd_setup, umd_cleanup, info);
+ if (!sub_info)
+ goto out;
+
+ err = call_usermodehelper_exec(sub_info, UMH_WAIT_EXEC);
+out:
+ return err;
+}
+EXPORT_SYMBOL_GPL(fork_usermode_driver);
+
+
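
To show how the three exports above fit together, here is a hedged usage sketch; my_umd_start(), the blob pointer, and its length are assumptions, while umd_load_blob(), fork_usermode_driver(), umd_unload_blob() and the struct umd_info fields all come from the file above.

/*
 * Illustrative sketch only, not part of this diff: load an embedded
 * userspace blob and start it as a user mode driver. "my_blob" and
 * "my_blob_len" are assumed to be supplied by the caller.
 */
#include <linux/usermode_driver.h>

static struct umd_info my_umd = {
	.driver_name = "my_umd_example",
};

static int my_umd_start(const void *my_blob, size_t my_blob_len)
{
	int err;

	err = umd_load_blob(&my_umd, my_blob, my_blob_len);
	if (err)
		return err;

	err = fork_usermode_driver(&my_umd);
	if (err) {
		umd_unload_blob(&my_umd);
		return err;
	}

	/*
	 * From here the caller talks to the helper over
	 * my_umd.pipe_to_umh / my_umd.pipe_from_umh and can signal it
	 * via my_umd.tgid once it is no longer needed.
	 */
	return 0;
}
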