summary | refs | log | tree | commit | diff | stats
path: root/kernel
diff options
context:
space:
mode:
Diffstat (limited to 'kernel')
-rw-r--r-- kernel/Kconfig.kexec 150
-rw-r--r-- kernel/acct.c 4
-rw-r--r-- kernel/audit.c 5
-rw-r--r-- kernel/audit.h 2
-rw-r--r-- kernel/auditfilter.c 19
-rw-r--r-- kernel/auditsc.c 10
-rw-r--r-- kernel/bpf/btf.c 2
-rw-r--r-- kernel/capability.c 2
-rw-r--r-- kernel/cgroup/cgroup-v1.c 2
-rw-r--r-- kernel/cgroup/cgroup.c 109
-rw-r--r-- kernel/cgroup/cpuset.c 264
-rw-r--r-- kernel/cgroup/misc.c 55
-rw-r--r-- kernel/cgroup/namespace.c 6
-rw-r--r-- kernel/cgroup/rstat.c 12
-rw-r--r-- kernel/configs/debug.config 2
-rw-r--r-- kernel/configs/kvm_guest.config 1
-rw-r--r-- kernel/configs/nopm.config 2
-rw-r--r-- kernel/configs/rust.config 1
-rw-r--r-- kernel/configs/tiny-base.config 2
-rw-r--r-- kernel/configs/x86_debug.config 1
-rw-r--r-- kernel/configs/xen.config 2
-rw-r--r-- kernel/cpu.c 24
-rw-r--r-- kernel/crash_core.c 395
-rw-r--r-- kernel/cred.c 27
-rw-r--r-- kernel/debug/debug_core.c 2
-rw-r--r-- kernel/debug/kdb/kdb_io.c 2
-rw-r--r-- kernel/dma/Kconfig 26
-rw-r--r-- kernel/dma/contiguous.c 108
-rw-r--r-- kernel/dma/direct.c 2
-rw-r--r-- kernel/dma/mapping.c 6
-rw-r--r-- kernel/dma/swiotlb.c 698
-rw-r--r-- kernel/events/core.c 33
-rw-r--r-- kernel/events/hw_breakpoint.c 28
-rw-r--r-- kernel/events/uprobes.c 2
-rw-r--r-- kernel/fork.c 37
-rw-r--r-- kernel/futex/core.c 3
-rw-r--r-- kernel/gcov/Makefile 2
-rw-r--r-- kernel/iomem.c 13
-rw-r--r-- kernel/kallsyms_selftest.c 22
-rw-r--r-- kernel/kexec.c 5
-rw-r--r-- kernel/kexec_core.c 43
-rw-r--r-- kernel/kexec_file.c 195
-rw-r--r-- kernel/kprobes.c 6
-rw-r--r-- kernel/ksysfs.c 15
-rw-r--r-- kernel/kthread.c 3
-rw-r--r-- kernel/locking/lockdep.c 36
-rw-r--r-- kernel/module/decompress.c 4
-rw-r--r-- kernel/module/main.c 16
-rw-r--r-- kernel/panic.c 2
-rw-r--r-- kernel/params.c 2
-rw-r--r-- kernel/pid.c 3
-rw-r--r-- kernel/pid_namespace.c 6
-rw-r--r-- kernel/pid_sysctl.h 28
-rw-r--r-- kernel/power/poweroff.c 2
-rw-r--r-- kernel/printk/internal.h 2
-rw-r--r-- kernel/printk/printk.c 215
-rw-r--r-- kernel/printk/printk_ringbuffer.c 2
-rw-r--r-- kernel/printk/printk_safe.c 9
-rw-r--r-- kernel/rcu/tree_stall.h 2
-rw-r--r-- kernel/relay.c 2
-rw-r--r-- kernel/sched/fair.c 2
-rw-r--r-- kernel/signal.c 13
-rw-r--r-- kernel/sys_ni.c 1
-rw-r--r-- kernel/time/tick-sched.c 2
-rw-r--r-- kernel/time/time.c 169
-rw-r--r-- kernel/trace/Makefile 1
-rw-r--r-- kernel/trace/ftrace.c 10
-rw-r--r-- kernel/trace/ring_buffer.c 20
-rw-r--r-- kernel/trace/trace.c 123
-rw-r--r-- kernel/trace/trace.h 14
-rw-r--r-- kernel/trace/trace_btf.c 122
-rw-r--r-- kernel/trace/trace_btf.h 11
-rw-r--r-- kernel/trace/trace_entries.h 2
-rw-r--r-- kernel/trace/trace_eprobe.c 22
-rw-r--r-- kernel/trace/trace_events.c 76
-rw-r--r-- kernel/trace/trace_events_filter.c 315
-rw-r--r-- kernel/trace/trace_events_user.c 15
-rw-r--r-- kernel/trace/trace_export.c 9
-rw-r--r-- kernel/trace/trace_fprobe.c 59
-rw-r--r-- kernel/trace/trace_hwlat.c 2
-rw-r--r-- kernel/trace/trace_kprobe.c 1
-rw-r--r-- kernel/trace/trace_probe.c 499
-rw-r--r-- kernel/trace/trace_probe.h 27
-rw-r--r-- kernel/trace/trace_uprobe.c 1
-rw-r--r-- kernel/ucount.c 5
-rw-r--r-- kernel/watchdog.c 11
-rw-r--r-- kernel/workqueue.c 1616
-rw-r--r-- kernel/workqueue_internal.h 2
88 files changed, 3985 insertions, 1846 deletions
diff --git a/kernel/Kconfig.kexec b/kernel/Kconfig.kexec
new file mode 100644
index 000000000000..9bfe68fe9676
--- /dev/null
+++ b/kernel/Kconfig.kexec
@@ -0,0 +1,150 @@
+# SPDX-License-Identifier: GPL-2.0-only
+
+menu "Kexec and crash features"
+
+config CRASH_CORE
+ bool
+
+config KEXEC_CORE
+ select CRASH_CORE
+ bool
+
+config KEXEC_ELF
+ bool
+
+config HAVE_IMA_KEXEC
+ bool
+
+config KEXEC
+ bool "Enable kexec system call"
+ depends on ARCH_SUPPORTS_KEXEC
+ select KEXEC_CORE
+ help
+ kexec is a system call that implements the ability to shut down your
+ current kernel, and to start another kernel. It is like a reboot
+ but it is independent of the system firmware. And like a reboot
+ you can start any kernel with it, not just Linux.
+
+ The name comes from the similarity to the exec system call.
+
+ It is an ongoing process to be certain the hardware in a machine
+ is properly shut down, so do not be surprised if this code does not
+ initially work for you. As of this writing the exact hardware
+ interface is strongly in flux, so no good recommendation can be
+ made.
+
+config KEXEC_FILE
+ bool "Enable kexec file based system call"
+ depends on ARCH_SUPPORTS_KEXEC_FILE
+ select KEXEC_CORE
+ help
+ This is a new version of the kexec system call. This system call is
+ file based and takes file descriptors for the kernel and initramfs
+ as system call arguments, as opposed to the list of segments
+ accepted by the kexec system call.
+
+config KEXEC_SIG
+ bool "Verify kernel signature during kexec_file_load() syscall"
+ depends on ARCH_SUPPORTS_KEXEC_SIG
+ depends on KEXEC_FILE
+ help
+ This option makes the kexec_file_load() syscall check for a valid
+ signature of the kernel image. The image can still be loaded without
+ a valid signature unless you also enable KEXEC_SIG_FORCE, though if
+ there's a signature that we can check, then it must be valid.
+
+ In addition to this option, you need to enable signature
+ verification for the corresponding kernel image type being
+ loaded in order for this to work.
+
+config KEXEC_SIG_FORCE
+ bool "Require a valid signature in kexec_file_load() syscall"
+ depends on ARCH_SUPPORTS_KEXEC_SIG_FORCE
+ depends on KEXEC_SIG
+ help
+ This option makes kernel signature verification mandatory for
+ the kexec_file_load() syscall.
+
+config KEXEC_IMAGE_VERIFY_SIG
+ bool "Enable Image signature verification support (ARM)"
+ default ARCH_DEFAULT_KEXEC_IMAGE_VERIFY_SIG
+ depends on ARCH_SUPPORTS_KEXEC_IMAGE_VERIFY_SIG
+ depends on KEXEC_SIG
+ depends on EFI && SIGNED_PE_FILE_VERIFICATION
+ help
+ Enable Image signature verification support.
+
+config KEXEC_BZIMAGE_VERIFY_SIG
+ bool "Enable bzImage signature verification support"
+ depends on ARCH_SUPPORTS_KEXEC_BZIMAGE_VERIFY_SIG
+ depends on KEXEC_SIG
+ depends on SIGNED_PE_FILE_VERIFICATION
+ select SYSTEM_TRUSTED_KEYRING
+ help
+ Enable bzImage signature verification support.
+
+config KEXEC_JUMP
+ bool "kexec jump"
+ depends on ARCH_SUPPORTS_KEXEC_JUMP
+ depends on KEXEC && HIBERNATION
+ help
+ Jump between the original kernel and the kexeced kernel, and invoke
+ code in physical address mode via KEXEC.
+
+config CRASH_DUMP
+ bool "kernel crash dumps"
+ depends on ARCH_SUPPORTS_CRASH_DUMP
+ depends on ARCH_SUPPORTS_KEXEC
+ select CRASH_CORE
+ select KEXEC_CORE
+ select KEXEC
+ help
+ Generate crash dump after being started by kexec.
+ This should normally be set only in special crash dump kernels
+ which are loaded in the main kernel with kexec-tools into
+ a specially reserved region and then later executed after
+ a crash by kdump/kexec. The crash dump kernel must be compiled
+ to a memory address not used by the main kernel or BIOS using
+ PHYSICAL_START, or it must be built as a relocatable image
+ (CONFIG_RELOCATABLE=y).
+ For more details see Documentation/admin-guide/kdump/kdump.rst
+
+ For s390, this option also enables zfcpdump.
+ See also <file:Documentation/s390/zfcpdump.rst>
+
+config CRASH_HOTPLUG
+ bool "Update the crash elfcorehdr on system configuration changes"
+ default y
+ depends on CRASH_DUMP && (HOTPLUG_CPU || MEMORY_HOTPLUG)
+ depends on ARCH_SUPPORTS_CRASH_HOTPLUG
+ help
+ Enable direct update to the crash elfcorehdr (which contains
+ the list of CPUs and memory regions to be dumped upon a crash)
+ in response to hot plug/unplug or online/offline of CPUs or
+ memory. This is a much more advanced approach than userspace
+ attempting that.
+
+ If unsure, say Y.
+
+config CRASH_MAX_MEMORY_RANGES
+ int "Specify the maximum number of memory regions for the elfcorehdr"
+ default 8192
+ depends on CRASH_HOTPLUG
+ help
+ For the kexec_file_load() syscall path, specify the maximum number of
+ memory regions that the elfcorehdr buffer/segment can accommodate.
+ These regions are obtained via walk_system_ram_res(); e.g. the
+ 'System RAM' entries in /proc/iomem.
+ This value is combined with NR_CPUS_DEFAULT and multiplied by
+ sizeof(Elf64_Phdr) to determine the final elfcorehdr memory buffer/
+ segment size.
+ The value 8192, for example, covers a (sparsely populated) 1TiB system
+ consisting of 128MiB memblocks, while resulting in an elfcorehdr
+ memory buffer/segment size under 1MiB. This represents a sane choice
+ to accommodate both baremetal and virtual machine configurations.
+
+ For the kexec_load() syscall path, CRASH_MAX_MEMORY_RANGES is part of
+ the computation behind the value provided through the
+ /sys/kernel/crash_elfcorehdr_size attribute.
+
+endmenu
diff --git a/kernel/acct.c b/kernel/acct.c
index 010667ce6080..1a9f929fe629 100644
--- a/kernel/acct.c
+++ b/kernel/acct.c
@@ -445,7 +445,7 @@ static void fill_ac(acct_t *ac)
memset(ac, 0, sizeof(acct_t));
ac->ac_version = ACCT_VERSION | ACCT_BYTEORDER;
- strlcpy(ac->ac_comm, current->comm, sizeof(ac->ac_comm));
+ strscpy(ac->ac_comm, current->comm, sizeof(ac->ac_comm));
/* calculate run_time in nsec*/
run_time = ktime_get_ns();
@@ -470,7 +470,7 @@ static void fill_ac(acct_t *ac)
do_div(elapsed, AHZ);
btime = ktime_get_real_seconds() - elapsed;
ac->ac_btime = clamp_t(time64_t, btime, 0, U32_MAX);
-#if ACCT_VERSION==2
+#if ACCT_VERSION == 2
ac->ac_ahz = AHZ;
#endif
diff --git a/kernel/audit.c b/kernel/audit.c
index 9bc0b0301198..16205dd29843 100644
--- a/kernel/audit.c
+++ b/kernel/audit.c
@@ -53,9 +53,7 @@
#include <net/sock.h>
#include <net/netlink.h>
#include <linux/skbuff.h>
-#ifdef CONFIG_SECURITY
#include <linux/security.h>
-#endif
#include <linux/freezer.h>
#include <linux/pid_namespace.h>
#include <net/netns/generic.h>
@@ -323,7 +321,8 @@ static inline int audit_rate_check(void)
unsigned long now;
int retval = 0;
- if (!audit_rate_limit) return 1;
+ if (!audit_rate_limit)
+ return 1;
spin_lock_irqsave(&lock, flags);
if (++messages < audit_rate_limit) {
diff --git a/kernel/audit.h b/kernel/audit.h
index 94738bce40b2..a60d2840559e 100644
--- a/kernel/audit.h
+++ b/kernel/audit.h
@@ -334,7 +334,7 @@ static inline int audit_signal_info_syscall(struct task_struct *t)
return 0;
}
-#define audit_filter_inodes(t, c) AUDIT_STATE_DISABLED
+#define audit_filter_inodes(t, c) do { } while (0)
#endif /* CONFIG_AUDITSYSCALL */
extern char *audit_unpack_string(void **bufp, size_t *remain, size_t len);
diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c
index 42d99896e7a6..8317a37dea0b 100644
--- a/kernel/auditfilter.c
+++ b/kernel/auditfilter.c
@@ -221,7 +221,7 @@ static int audit_match_signal(struct audit_entry *entry)
entry->rule.mask));
}
- switch(audit_classify_arch(arch->val)) {
+ switch (audit_classify_arch(arch->val)) {
case 0: /* native */
return (audit_match_class_bits(AUDIT_CLASS_SIGNAL,
entry->rule.mask));
@@ -243,7 +243,7 @@ static inline struct audit_entry *audit_to_entry_common(struct audit_rule_data *
err = -EINVAL;
listnr = rule->flags & ~AUDIT_FILTER_PREPEND;
- switch(listnr) {
+ switch (listnr) {
default:
goto exit_err;
#ifdef CONFIG_AUDITSYSCALL
@@ -344,7 +344,7 @@ static int audit_field_valid(struct audit_entry *entry, struct audit_field *f)
switch (entry->rule.listnr) {
case AUDIT_FILTER_FS:
- switch(f->type) {
+ switch (f->type) {
case AUDIT_FSTYPE:
case AUDIT_FILTERKEY:
break;
@@ -651,7 +651,7 @@ static struct audit_rule_data *audit_krule_to_data(struct audit_krule *krule)
data->fields[i] = f->type;
data->fieldflags[i] = audit_ops[f->op];
- switch(f->type) {
+ switch (f->type) {
case AUDIT_SUBJ_USER:
case AUDIT_SUBJ_ROLE:
case AUDIT_SUBJ_TYPE:
@@ -694,7 +694,8 @@ static struct audit_rule_data *audit_krule_to_data(struct audit_krule *krule)
data->values[i] = f->val;
}
}
- for (i = 0; i < AUDIT_BITMASK_SIZE; i++) data->mask[i] = krule->mask[i];
+ for (i = 0; i < AUDIT_BITMASK_SIZE; i++)
+ data->mask[i] = krule->mask[i];
return data;
}
@@ -717,7 +718,7 @@ static int audit_compare_rule(struct audit_krule *a, struct audit_krule *b)
a->fields[i].op != b->fields[i].op)
return 1;
- switch(a->fields[i].type) {
+ switch (a->fields[i].type) {
case AUDIT_SUBJ_USER:
case AUDIT_SUBJ_ROLE:
case AUDIT_SUBJ_TYPE:
@@ -946,7 +947,7 @@ static inline int audit_add_rule(struct audit_entry *entry)
int dont_count = 0;
/* If any of these, don't count towards total */
- switch(entry->rule.listnr) {
+ switch (entry->rule.listnr) {
case AUDIT_FILTER_USER:
case AUDIT_FILTER_EXCLUDE:
case AUDIT_FILTER_FS:
@@ -1029,7 +1030,7 @@ int audit_del_rule(struct audit_entry *entry)
int dont_count = 0;
/* If any of these, don't count towards total */
- switch(entry->rule.listnr) {
+ switch (entry->rule.listnr) {
case AUDIT_FILTER_USER:
case AUDIT_FILTER_EXCLUDE:
case AUDIT_FILTER_FS:
@@ -1083,7 +1084,7 @@ static void audit_list_rules(int seq, struct sk_buff_head *q)
/* This is a blocking read, so use audit_filter_mutex instead of rcu
* iterator to sync with list writers. */
- for (i=0; i<AUDIT_NR_FILTERS; i++) {
+ for (i = 0; i < AUDIT_NR_FILTERS; i++) {
list_for_each_entry(r, &audit_rules_list[i], list) {
struct audit_rule_data *data;
diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index fc0c7c03eeab..21d2fa815e78 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
@@ -882,7 +882,8 @@ static void audit_filter_syscall(struct task_struct *tsk,
*/
static int audit_filter_inode_name(struct task_struct *tsk,
struct audit_names *n,
- struct audit_context *ctx) {
+ struct audit_context *ctx)
+{
int h = audit_hash_ino((u32)n->ino);
struct list_head *list = &audit_inode_hash[h];
@@ -1066,7 +1067,8 @@ int audit_alloc(struct task_struct *tsk)
return 0;
}
- if (!(context = audit_alloc_context(state))) {
+ context = audit_alloc_context(state);
+ if (!context) {
kfree(key);
audit_log_lost("out of memory in audit_alloc");
return -ENOMEM;
@@ -2126,7 +2128,7 @@ retry:
d = dentry;
rcu_read_lock();
seq = read_seqbegin(&rename_lock);
- for(;;) {
+ for (;;) {
struct inode *inode = d_backing_inode(d);
if (inode && unlikely(inode->i_fsnotify_marks)) {
@@ -2458,6 +2460,8 @@ void __audit_inode_child(struct inode *parent,
}
}
+ cond_resched();
+
/* is there a matching child entry? */
list_for_each_entry(n, &context->names_list, list) {
/* can only match entries that have a name */
diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c
index 249657c466dd..1095bbe29859 100644
--- a/kernel/bpf/btf.c
+++ b/kernel/bpf/btf.c
@@ -553,7 +553,7 @@ s32 btf_find_by_name_kind(const struct btf *btf, const char *name, u8 kind)
return -ENOENT;
}
-static s32 bpf_find_btf_id(const char *name, u32 kind, struct btf **btf_p)
+s32 bpf_find_btf_id(const char *name, u32 kind, struct btf **btf_p)
{
struct btf *btf;
s32 ret;
diff --git a/kernel/capability.c b/kernel/capability.c
index 1a2795102ae4..dac4df77e376 100644
--- a/kernel/capability.c
+++ b/kernel/capability.c
@@ -112,7 +112,7 @@ static inline int cap_get_target_pid(pid_t pid, kernel_cap_t *pEp,
int ret;
if (pid && (pid != task_pid_vnr(current))) {
- struct task_struct *target;
+ const struct task_struct *target;
rcu_read_lock();
diff --git a/kernel/cgroup/cgroup-v1.c b/kernel/cgroup/cgroup-v1.c
index 83044312bc41..c487ffef6652 100644
--- a/kernel/cgroup/cgroup-v1.c
+++ b/kernel/cgroup/cgroup-v1.c
@@ -431,7 +431,7 @@ static void *cgroup_pidlist_start(struct seq_file *s, loff_t *pos)
if (l->list[mid] == pid) {
index = mid;
break;
- } else if (l->list[mid] <= pid)
+ } else if (l->list[mid] < pid)
index = mid + 1;
else
end = mid;
diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c
index 5fa95f86cb4d..1fb7f562289d 100644
--- a/kernel/cgroup/cgroup.c
+++ b/kernel/cgroup/cgroup.c
@@ -493,28 +493,6 @@ static struct cgroup_subsys_state *cgroup_css(struct cgroup *cgrp,
}
/**
- * cgroup_tryget_css - try to get a cgroup's css for the specified subsystem
- * @cgrp: the cgroup of interest
- * @ss: the subsystem of interest
- *
- * Find and get @cgrp's css associated with @ss. If the css doesn't exist
- * or is offline, %NULL is returned.
- */
-static struct cgroup_subsys_state *cgroup_tryget_css(struct cgroup *cgrp,
- struct cgroup_subsys *ss)
-{
- struct cgroup_subsys_state *css;
-
- rcu_read_lock();
- css = cgroup_css(cgrp, ss);
- if (css && !css_tryget_online(css))
- css = NULL;
- rcu_read_unlock();
-
- return css;
-}
-
-/**
* cgroup_e_css_by_mask - obtain a cgroup's effective css for the specified ss
* @cgrp: the cgroup of interest
* @ss: the subsystem of interest (%NULL returns @cgrp->self)
@@ -679,7 +657,7 @@ EXPORT_SYMBOL_GPL(of_css);
* @ssid: the index of the subsystem, CGROUP_SUBSYS_COUNT after reaching the end
* @cgrp: the target cgroup to iterate css's of
*
- * Should be called under cgroup_[tree_]mutex.
+ * Should be called under cgroup_mutex.
*/
#define for_each_css(css, ssid, cgrp) \
for ((ssid) = 0; (ssid) < CGROUP_SUBSYS_COUNT; (ssid)++) \
@@ -929,7 +907,7 @@ static void css_set_move_task(struct task_struct *task,
#define CSS_SET_HASH_BITS 7
static DEFINE_HASHTABLE(css_set_table, CSS_SET_HASH_BITS);
-static unsigned long css_set_hash(struct cgroup_subsys_state *css[])
+static unsigned long css_set_hash(struct cgroup_subsys_state **css)
{
unsigned long key = 0UL;
struct cgroup_subsys *ss;
@@ -1070,7 +1048,7 @@ static bool compare_css_sets(struct css_set *cset,
*/
static struct css_set *find_existing_css_set(struct css_set *old_cset,
struct cgroup *cgrp,
- struct cgroup_subsys_state *template[])
+ struct cgroup_subsys_state **template)
{
struct cgroup_root *root = cgrp->root;
struct cgroup_subsys *ss;
@@ -1736,7 +1714,7 @@ static int css_populate_dir(struct cgroup_subsys_state *css)
struct cftype *cfts, *failed_cfts;
int ret;
- if ((css->flags & CSS_VISIBLE) || !cgrp->kn)
+ if (css->flags & CSS_VISIBLE)
return 0;
if (!css->ss) {
@@ -2499,7 +2477,7 @@ struct task_struct *cgroup_taskset_next(struct cgroup_taskset *tset,
/*
* This function may be called both before and
- * after cgroup_taskset_migrate(). The two cases
+ * after cgroup_migrate_execute(). The two cases
* can be distinguished by looking at whether @cset
* has its ->mg_dst_cset set.
*/
@@ -3654,9 +3632,32 @@ static int cgroup_stat_show(struct seq_file *seq, void *v)
return 0;
}
-static int __maybe_unused cgroup_extra_stat_show(struct seq_file *seq,
- struct cgroup *cgrp, int ssid)
+#ifdef CONFIG_CGROUP_SCHED
+/**
+ * cgroup_tryget_css - try to get a cgroup's css for the specified subsystem
+ * @cgrp: the cgroup of interest
+ * @ss: the subsystem of interest
+ *
+ * Find and get @cgrp's css associated with @ss. If the css doesn't exist
+ * or is offline, %NULL is returned.
+ */
+static struct cgroup_subsys_state *cgroup_tryget_css(struct cgroup *cgrp,
+ struct cgroup_subsys *ss)
+{
+ struct cgroup_subsys_state *css;
+
+ rcu_read_lock();
+ css = cgroup_css(cgrp, ss);
+ if (css && !css_tryget_online(css))
+ css = NULL;
+ rcu_read_unlock();
+
+ return css;
+}
+
+static int cgroup_extra_stat_show(struct seq_file *seq, int ssid)
{
+ struct cgroup *cgrp = seq_css(seq)->cgroup;
struct cgroup_subsys *ss = cgroup_subsys[ssid];
struct cgroup_subsys_state *css;
int ret;
@@ -3673,20 +3674,8 @@ static int __maybe_unused cgroup_extra_stat_show(struct seq_file *seq,
return ret;
}
-static int cpu_stat_show(struct seq_file *seq, void *v)
-{
- struct cgroup __maybe_unused *cgrp = seq_css(seq)->cgroup;
- int ret = 0;
-
- cgroup_base_stat_cputime_show(seq);
-#ifdef CONFIG_CGROUP_SCHED
- ret = cgroup_extra_stat_show(seq, cgrp, cpu_cgrp_id);
-#endif
- return ret;
-}
-
-static int __maybe_unused cgroup_local_stat_show(struct seq_file *seq,
- struct cgroup *cgrp, int ssid)
+static int cgroup_local_stat_show(struct seq_file *seq,
+ struct cgroup *cgrp, int ssid)
{
struct cgroup_subsys *ss = cgroup_subsys[ssid];
struct cgroup_subsys_state *css;
@@ -3703,6 +3692,18 @@ static int __maybe_unused cgroup_local_stat_show(struct seq_file *seq,
css_put(css);
return ret;
}
+#endif
+
+static int cpu_stat_show(struct seq_file *seq, void *v)
+{
+ int ret = 0;
+
+ cgroup_base_stat_cputime_show(seq);
+#ifdef CONFIG_CGROUP_SCHED
+ ret = cgroup_extra_stat_show(seq, cpu_cgrp_id);
+#endif
+ return ret;
+}
static int cpu_local_stat_show(struct seq_file *seq, void *v)
{
@@ -4350,14 +4351,13 @@ static int cgroup_init_cftypes(struct cgroup_subsys *ss, struct cftype *cfts)
return ret;
}
-static int cgroup_rm_cftypes_locked(struct cftype *cfts)
+static void cgroup_rm_cftypes_locked(struct cftype *cfts)
{
lockdep_assert_held(&cgroup_mutex);
list_del(&cfts->node);
cgroup_apply_cftypes(cfts, false);
cgroup_exit_cftypes(cfts);
- return 0;
}
/**
@@ -4373,8 +4373,6 @@ static int cgroup_rm_cftypes_locked(struct cftype *cfts)
*/
int cgroup_rm_cftypes(struct cftype *cfts)
{
- int ret;
-
if (!cfts || cfts[0].name[0] == '\0')
return 0;
@@ -4382,9 +4380,9 @@ int cgroup_rm_cftypes(struct cftype *cfts)
return -ENOENT;
cgroup_lock();
- ret = cgroup_rm_cftypes_locked(cfts);
+ cgroup_rm_cftypes_locked(cfts);
cgroup_unlock();
- return ret;
+ return 0;
}
/**
@@ -5337,7 +5335,7 @@ static struct cftype cgroup_psi_files[] = {
* RCU callback.
*
* 4. After the grace period, the css can be freed. Implemented in
- * css_free_work_fn().
+ * css_free_rwork_fn().
*
* It is actually hairier because both step 2 and 4 require process context
* and thus involve punting to css->destroy_work adding two additional
@@ -5581,8 +5579,7 @@ err_free_css:
/*
* The returned cgroup is fully initialized including its control mask, but
- * it isn't associated with its kernfs_node and doesn't have the control
- * mask applied.
+ * it doesn't have the control mask applied.
*/
static struct cgroup *cgroup_create(struct cgroup *parent, const char *name,
umode_t mode)
@@ -5908,7 +5905,7 @@ static int cgroup_destroy_locked(struct cgroup *cgrp)
/*
* Mark @cgrp and the associated csets dead. The former prevents
* further task migration and child creation by disabling
- * cgroup_lock_live_group(). The latter makes the csets ignored by
+ * cgroup_kn_lock_live(). The latter makes the csets ignored by
* the migration path.
*/
cgrp->self.flags &= ~CSS_ONLINE;
@@ -5930,7 +5927,7 @@ static int cgroup_destroy_locked(struct cgroup *cgrp)
parent->nr_threaded_children--;
spin_lock_irq(&css_set_lock);
- for (tcgrp = cgroup_parent(cgrp); tcgrp; tcgrp = cgroup_parent(tcgrp)) {
+ for (tcgrp = parent; tcgrp; tcgrp = cgroup_parent(tcgrp)) {
tcgrp->nr_descendants--;
tcgrp->nr_dying_descendants++;
/*
@@ -6123,8 +6120,8 @@ int __init cgroup_init(void)
continue;
if (cgroup1_ssid_disabled(ssid))
- printk(KERN_INFO "Disabling %s control group subsystem in v1 mounts\n",
- ss->name);
+ pr_info("Disabling %s control group subsystem in v1 mounts\n",
+ ss->name);
cgrp_dfl_root.subsys_mask |= 1 << ss->id;
diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c
index 58e6f18f01c1..58ec88efa4f8 100644
--- a/kernel/cgroup/cpuset.c
+++ b/kernel/cgroup/cpuset.c
@@ -1230,7 +1230,7 @@ static void update_tasks_cpumask(struct cpuset *cs, struct cpumask *new_cpus)
/*
* Percpu kthreads in top_cpuset are ignored
*/
- if ((task->flags & PF_KTHREAD) && kthread_is_per_cpu(task))
+ if (kthread_is_per_cpu(task))
continue;
cpumask_andnot(new_cpus, possible_mask, cs->subparts_cpus);
} else {
@@ -1255,7 +1255,7 @@ static void update_tasks_cpumask(struct cpuset *cs, struct cpumask *new_cpus)
static void compute_effective_cpumask(struct cpumask *new_cpus,
struct cpuset *cs, struct cpuset *parent)
{
- if (parent->nr_subparts_cpus) {
+ if (parent->nr_subparts_cpus && is_partition_valid(cs)) {
cpumask_or(new_cpus, parent->effective_cpus,
parent->subparts_cpus);
cpumask_and(new_cpus, new_cpus, cs->cpus_allowed);
@@ -1277,6 +1277,52 @@ enum subparts_cmd {
static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs,
int turning_on);
+static void update_sibling_cpumasks(struct cpuset *parent, struct cpuset *cs,
+ struct tmpmasks *tmp);
+
+/*
+ * Update partition exclusive flag
+ *
+ * Return: 0 if successful, an error code otherwise
+ */
+static int update_partition_exclusive(struct cpuset *cs, int new_prs)
+{
+ bool exclusive = (new_prs > 0);
+
+ if (exclusive && !is_cpu_exclusive(cs)) {
+ if (update_flag(CS_CPU_EXCLUSIVE, cs, 1))
+ return PERR_NOTEXCL;
+ } else if (!exclusive && is_cpu_exclusive(cs)) {
+ /* Turning off CS_CPU_EXCLUSIVE will not return error */
+ update_flag(CS_CPU_EXCLUSIVE, cs, 0);
+ }
+ return 0;
+}
+
+/*
+ * Update partition load balance flag and/or rebuild sched domain
+ *
+ * Changing load balance flag will automatically call
+ * rebuild_sched_domains_locked().
+ */
+static void update_partition_sd_lb(struct cpuset *cs, int old_prs)
+{
+ int new_prs = cs->partition_root_state;
+ bool new_lb = (new_prs != PRS_ISOLATED);
+ bool rebuild_domains = (new_prs > 0) || (old_prs > 0);
+
+ if (new_lb != !!is_sched_load_balance(cs)) {
+ rebuild_domains = true;
+ if (new_lb)
+ set_bit(CS_SCHED_LOAD_BALANCE, &cs->flags);
+ else
+ clear_bit(CS_SCHED_LOAD_BALANCE, &cs->flags);
+ }
+
+ if (rebuild_domains)
+ rebuild_sched_domains_locked();
+}
+
/**
* update_parent_subparts_cpumask - update subparts_cpus mask of parent cpuset
* @cs: The cpuset that requests change in partition root state
@@ -1336,8 +1382,7 @@ static int update_parent_subparts_cpumask(struct cpuset *cs, int cmd,
return is_partition_invalid(parent)
? PERR_INVPARENT : PERR_NOTPART;
}
- if ((newmask && cpumask_empty(newmask)) ||
- (!newmask && cpumask_empty(cs->cpus_allowed)))
+ if (!newmask && cpumask_empty(cs->cpus_allowed))
return PERR_CPUSEMPTY;
/*
@@ -1404,10 +1449,15 @@ static int update_parent_subparts_cpumask(struct cpuset *cs, int cmd,
adding = cpumask_andnot(tmp->addmask, tmp->addmask,
parent->subparts_cpus);
/*
+ * Empty cpumask is not allowed
+ */
+ if (cpumask_empty(newmask)) {
+ part_error = PERR_CPUSEMPTY;
+ /*
* Make partition invalid if parent's effective_cpus could
* become empty and there are tasks in the parent.
*/
- if (adding &&
+ } else if (adding &&
cpumask_subset(parent->effective_cpus, tmp->addmask) &&
!cpumask_intersects(tmp->delmask, cpu_active_mask) &&
partition_is_populated(parent, cs)) {
@@ -1480,14 +1530,13 @@ static int update_parent_subparts_cpumask(struct cpuset *cs, int cmd,
/*
* Transitioning between invalid to valid or vice versa may require
- * changing CS_CPU_EXCLUSIVE and CS_SCHED_LOAD_BALANCE.
+ * changing CS_CPU_EXCLUSIVE.
*/
if (old_prs != new_prs) {
- if (is_prs_invalid(old_prs) && !is_cpu_exclusive(cs) &&
- (update_flag(CS_CPU_EXCLUSIVE, cs, 1) < 0))
- return PERR_NOTEXCL;
- if (is_prs_invalid(new_prs) && is_cpu_exclusive(cs))
- update_flag(CS_CPU_EXCLUSIVE, cs, 0);
+ int err = update_partition_exclusive(cs, new_prs);
+
+ if (err)
+ return err;
}
/*
@@ -1520,24 +1569,34 @@ static int update_parent_subparts_cpumask(struct cpuset *cs, int cmd,
spin_unlock_irq(&callback_lock);
- if (adding || deleting)
+ if (adding || deleting) {
update_tasks_cpumask(parent, tmp->addmask);
+ if (parent->child_ecpus_count)
+ update_sibling_cpumasks(parent, cs, tmp);
+ }
/*
- * Set or clear CS_SCHED_LOAD_BALANCE when partcmd_update, if necessary.
- * rebuild_sched_domains_locked() may be called.
+ * For partcmd_update without newmask, it is being called from
+ * cpuset_hotplug_workfn() where cpus_read_lock() wasn't taken.
+ * Update the load balance flag and scheduling domain if
+ * cpus_read_trylock() is successful.
*/
- if (old_prs != new_prs) {
- if (old_prs == PRS_ISOLATED)
- update_flag(CS_SCHED_LOAD_BALANCE, cs, 1);
- else if (new_prs == PRS_ISOLATED)
- update_flag(CS_SCHED_LOAD_BALANCE, cs, 0);
+ if ((cmd == partcmd_update) && !newmask && cpus_read_trylock()) {
+ update_partition_sd_lb(cs, old_prs);
+ cpus_read_unlock();
}
+
notify_partition_change(cs, old_prs);
return 0;
}
/*
+ * update_cpumasks_hier() flags
+ */
+#define HIER_CHECKALL 0x01 /* Check all cpusets with no skipping */
+#define HIER_NO_SD_REBUILD 0x02 /* Don't rebuild sched domains */
+
+/*
* update_cpumasks_hier - Update effective cpumasks and tasks in the subtree
* @cs: the cpuset to consider
* @tmp: temp variables for calculating effective_cpus & partition setup
@@ -1551,7 +1610,7 @@ static int update_parent_subparts_cpumask(struct cpuset *cs, int cmd,
* Called with cpuset_mutex held
*/
static void update_cpumasks_hier(struct cpuset *cs, struct tmpmasks *tmp,
- bool force)
+ int flags)
{
struct cpuset *cp;
struct cgroup_subsys_state *pos_css;
@@ -1588,11 +1647,16 @@ static void update_cpumasks_hier(struct cpuset *cs, struct tmpmasks *tmp,
}
/*
- * Skip the whole subtree if the cpumask remains the same
- * and has no partition root state and force flag not set.
+ * Skip the whole subtree if
+ * 1) the cpumask remains the same,
+ * 2) has no partition root state,
+ * 3) HIER_CHECKALL flag not set, and
+ * 4) for v2 load balance state same as its parent.
*/
- if (!cp->partition_root_state && !force &&
- cpumask_equal(tmp->new_cpus, cp->effective_cpus)) {
+ if (!cp->partition_root_state && !(flags & HIER_CHECKALL) &&
+ cpumask_equal(tmp->new_cpus, cp->effective_cpus) &&
+ (!cgroup_subsys_on_dfl(cpuset_cgrp_subsys) ||
+ (is_sched_load_balance(parent) == is_sched_load_balance(cp)))) {
pos_css = css_rightmost_descendant(pos_css);
continue;
}
@@ -1676,6 +1740,20 @@ update_parent_subparts:
update_tasks_cpumask(cp, tmp->new_cpus);
/*
+ * On default hierarchy, inherit the CS_SCHED_LOAD_BALANCE
+ * from parent if current cpuset isn't a valid partition root
+ * and their load balance states differ.
+ */
+ if (cgroup_subsys_on_dfl(cpuset_cgrp_subsys) &&
+ !is_partition_valid(cp) &&
+ (is_sched_load_balance(parent) != is_sched_load_balance(cp))) {
+ if (is_sched_load_balance(parent))
+ set_bit(CS_SCHED_LOAD_BALANCE, &cp->flags);
+ else
+ clear_bit(CS_SCHED_LOAD_BALANCE, &cp->flags);
+ }
+
+ /*
* On legacy hierarchy, if the effective cpumask of any non-
* empty cpuset is changed, we need to rebuild sched domains.
* On default hierarchy, the cpuset needs to be a partition
@@ -1692,7 +1770,7 @@ update_parent_subparts:
}
rcu_read_unlock();
- if (need_rebuild_sched_domains)
+ if (need_rebuild_sched_domains && !(flags & HIER_NO_SD_REBUILD))
rebuild_sched_domains_locked();
}
@@ -1716,7 +1794,9 @@ static void update_sibling_cpumasks(struct cpuset *parent, struct cpuset *cs,
* to use the right effective_cpus value.
*
* The update_cpumasks_hier() function may sleep. So we have to
- * release the RCU read lock before calling it.
+ * release the RCU read lock before calling it. HIER_NO_SD_REBUILD
+ * flag is used to suppress rebuild of sched domains as the callers
+ * will take care of that.
*/
rcu_read_lock();
cpuset_for_each_child(sibling, pos_css, parent) {
@@ -1728,7 +1808,7 @@ static void update_sibling_cpumasks(struct cpuset *parent, struct cpuset *cs,
continue;
rcu_read_unlock();
- update_cpumasks_hier(sibling, tmp, false);
+ update_cpumasks_hier(sibling, tmp, HIER_NO_SD_REBUILD);
rcu_read_lock();
css_put(&sibling->css);
}
@@ -1747,6 +1827,7 @@ static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs,
int retval;
struct tmpmasks tmp;
bool invalidate = false;
+ int old_prs = cs->partition_root_state;
/* top_cpuset.cpus_allowed tracks cpu_online_mask; it's read-only */
if (cs == &top_cpuset)
@@ -1774,18 +1855,8 @@ static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs,
if (cpumask_equal(cs->cpus_allowed, trialcs->cpus_allowed))
return 0;
-#ifdef CONFIG_CPUMASK_OFFSTACK
- /*
- * Use the cpumasks in trialcs for tmpmasks when they are pointers
- * to allocated cpumasks.
- *
- * Note that update_parent_subparts_cpumask() uses only addmask &
- * delmask, but not new_cpus.
- */
- tmp.addmask = trialcs->subparts_cpus;
- tmp.delmask = trialcs->effective_cpus;
- tmp.new_cpus = NULL;
-#endif
+ if (alloc_cpumasks(NULL, &tmp))
+ return -ENOMEM;
retval = validate_change(cs, trialcs);
@@ -1814,7 +1885,7 @@ static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs,
retval = 0;
}
if (retval < 0)
- return retval;
+ goto out_free;
if (cs->partition_root_state) {
if (invalidate)
@@ -1849,13 +1920,8 @@ static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs,
}
spin_unlock_irq(&callback_lock);
-#ifdef CONFIG_CPUMASK_OFFSTACK
- /* Now trialcs->cpus_allowed is available */
- tmp.new_cpus = trialcs->cpus_allowed;
-#endif
-
/* effective_cpus will be updated here */
- update_cpumasks_hier(cs, &tmp, false);
+ update_cpumasks_hier(cs, &tmp, 0);
if (cs->partition_root_state) {
struct cpuset *parent = parent_cs(cs);
@@ -1866,7 +1932,12 @@ static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs,
*/
if (parent->child_ecpus_count)
update_sibling_cpumasks(parent, cs, &tmp);
+
+ /* Update CS_SCHED_LOAD_BALANCE and/or sched_domains */
+ update_partition_sd_lb(cs, old_prs);
}
+out_free:
+ free_cpumasks(NULL, &tmp);
+ /*
+ * Hand back retval rather than a constant 0: the "retval < 0" check
+ * earlier in this function jumps to out_free, and that failure has to
+ * reach the caller instead of being reported as success.
+ */
-return 0;
+ return retval;
}
@@ -2242,7 +2313,6 @@ out:
static int update_prstate(struct cpuset *cs, int new_prs)
{
int err = PERR_NONE, old_prs = cs->partition_root_state;
- bool sched_domain_rebuilt = false;
struct cpuset *parent = parent_cs(cs);
struct tmpmasks tmpmask;
@@ -2261,45 +2331,26 @@ static int update_prstate(struct cpuset *cs, int new_prs)
if (alloc_cpumasks(NULL, &tmpmask))
return -ENOMEM;
+ err = update_partition_exclusive(cs, new_prs);
+ if (err)
+ goto out;
+
if (!old_prs) {
/*
- * Turning on partition root requires setting the
- * CS_CPU_EXCLUSIVE bit implicitly as well and cpus_allowed
- * cannot be empty.
+ * cpus_allowed cannot be empty.
*/
if (cpumask_empty(cs->cpus_allowed)) {
err = PERR_CPUSEMPTY;
goto out;
}
- err = update_flag(CS_CPU_EXCLUSIVE, cs, 1);
- if (err) {
- err = PERR_NOTEXCL;
- goto out;
- }
-
err = update_parent_subparts_cpumask(cs, partcmd_enable,
NULL, &tmpmask);
- if (err) {
- update_flag(CS_CPU_EXCLUSIVE, cs, 0);
- goto out;
- }
-
- if (new_prs == PRS_ISOLATED) {
- /*
- * Disable the load balance flag should not return an
- * error unless the system is running out of memory.
- */
- update_flag(CS_SCHED_LOAD_BALANCE, cs, 0);
- sched_domain_rebuilt = true;
- }
} else if (old_prs && new_prs) {
/*
* A change in load balance state only, no change in cpumasks.
*/
- update_flag(CS_SCHED_LOAD_BALANCE, cs, (new_prs != PRS_ISOLATED));
- sched_domain_rebuilt = true;
- goto out; /* Sched domain is rebuilt in update_flag() */
+ ;
} else {
/*
* Switching back to member is always allowed even if it
@@ -2318,40 +2369,31 @@ static int update_prstate(struct cpuset *cs, int new_prs)
compute_effective_cpumask(cs->effective_cpus, cs, parent);
spin_unlock_irq(&callback_lock);
}
-
- /* Turning off CS_CPU_EXCLUSIVE will not return error */
- update_flag(CS_CPU_EXCLUSIVE, cs, 0);
-
- if (!is_sched_load_balance(cs)) {
- /* Make sure load balance is on */
- update_flag(CS_SCHED_LOAD_BALANCE, cs, 1);
- sched_domain_rebuilt = true;
- }
}
-
- update_tasks_cpumask(parent, tmpmask.new_cpus);
-
- if (parent->child_ecpus_count)
- update_sibling_cpumasks(parent, cs, &tmpmask);
-
- if (!sched_domain_rebuilt)
- rebuild_sched_domains_locked();
out:
/*
- * Make partition invalid if an error happen
+ * Make partition invalid & disable CS_CPU_EXCLUSIVE if an error
+ * happens.
*/
- if (err)
+ if (err) {
new_prs = -new_prs;
+ update_partition_exclusive(cs, new_prs);
+ }
+
spin_lock_irq(&callback_lock);
cs->partition_root_state = new_prs;
WRITE_ONCE(cs->prs_err, err);
spin_unlock_irq(&callback_lock);
+
/*
* Update child cpusets, if present.
* Force update if switching back to member.
*/
if (!list_empty(&cs->css.children))
- update_cpumasks_hier(cs, &tmpmask, !new_prs);
+ update_cpumasks_hier(cs, &tmpmask, !new_prs ? HIER_CHECKALL : 0);
+
+ /* Update sched domains and load balance flag */
+ update_partition_sd_lb(cs, old_prs);
notify_partition_change(cs, old_prs);
free_cpumasks(NULL, &tmpmask);
@@ -2487,6 +2529,7 @@ static int cpuset_can_attach(struct cgroup_taskset *tset)
struct cgroup_subsys_state *css;
struct cpuset *cs, *oldcs;
struct task_struct *task;
+ bool cpus_updated, mems_updated;
int ret;
/* used later by cpuset_attach() */
@@ -2501,13 +2544,25 @@ static int cpuset_can_attach(struct cgroup_taskset *tset)
if (ret)
goto out_unlock;
+ cpus_updated = !cpumask_equal(cs->effective_cpus, oldcs->effective_cpus);
+ mems_updated = !nodes_equal(cs->effective_mems, oldcs->effective_mems);
+
cgroup_taskset_for_each(task, css, tset) {
ret = task_can_attach(task);
if (ret)
goto out_unlock;
- ret = security_task_setscheduler(task);
- if (ret)
- goto out_unlock;
+
+ /*
+ * Skip the rights-over-task check in v2 when nothing changes;
+ * migration permission derives from hierarchy ownership in
+ * cgroup_procs_write_permission().
+ */
+ if (!cgroup_subsys_on_dfl(cpuset_cgrp_subsys) ||
+ (cpus_updated || mems_updated)) {
+ ret = security_task_setscheduler(task);
+ if (ret)
+ goto out_unlock;
+ }
if (dl_task(task)) {
cs->nr_migrate_dl_tasks++;
@@ -3222,6 +3277,14 @@ static int cpuset_css_online(struct cgroup_subsys_state *css)
cs->use_parent_ecpus = true;
parent->child_ecpus_count++;
}
+
+ /*
+ * For v2, clear CS_SCHED_LOAD_BALANCE if parent is isolated
+ */
+ if (cgroup_subsys_on_dfl(cpuset_cgrp_subsys) &&
+ !is_sched_load_balance(parent))
+ clear_bit(CS_SCHED_LOAD_BALANCE, &cs->flags);
+
spin_unlock_irq(&callback_lock);
if (!test_bit(CGRP_CPUSET_CLONE_CHILDREN, &css->cgroup->flags))
@@ -3521,17 +3584,16 @@ hotplug_update_tasks_legacy(struct cpuset *cs,
is_empty = cpumask_empty(cs->cpus_allowed) ||
nodes_empty(cs->mems_allowed);
- mutex_unlock(&cpuset_mutex);
-
/*
* Move tasks to the nearest ancestor with execution resources,
* This is full cgroup operation which will also call back into
* cpuset. Should be done outside any lock.
*/
- if (is_empty)
+ if (is_empty) {
+ mutex_unlock(&cpuset_mutex);
remove_tasks_in_empty_cpuset(cs);
-
- mutex_lock(&cpuset_mutex);
+ mutex_lock(&cpuset_mutex);
+ }
}
static void
@@ -3691,6 +3753,7 @@ unlock:
/**
* cpuset_hotplug_workfn - handle CPU/memory hotunplug for a cpuset
+ * @work: unused
*
* This function is called after either CPU or memory configuration has
* changed and updates cpuset accordingly. The top_cpuset is always
@@ -4073,6 +4136,7 @@ bool cpuset_node_allowed(int node, gfp_t gfp_mask)
/**
* cpuset_spread_node() - On which node to begin search for a page
+ * @rotor: round robin rotor
*
* If a task is marked PF_SPREAD_PAGE or PF_SPREAD_SLAB (as for
* tasks in a cpuset with is_spread_page or is_spread_slab set),
diff --git a/kernel/cgroup/misc.c b/kernel/cgroup/misc.c
index ae2f4dd47508..79a3717a5803 100644
--- a/kernel/cgroup/misc.c
+++ b/kernel/cgroup/misc.c
@@ -14,7 +14,7 @@
#include <linux/misc_cgroup.h>
#define MAX_STR "max"
-#define MAX_NUM ULONG_MAX
+#define MAX_NUM U64_MAX
/* Miscellaneous res name, keep it in sync with enum misc_res_type */
static const char *const misc_res_name[] = {
@@ -37,7 +37,7 @@ static struct misc_cg root_cg;
* more than the actual capacity. We are using Limits resource distribution
* model of cgroup for miscellaneous controller.
*/
-static unsigned long misc_res_capacity[MISC_CG_RES_TYPES];
+static u64 misc_res_capacity[MISC_CG_RES_TYPES];
/**
* parent_misc() - Get the parent of the passed misc cgroup.
@@ -74,10 +74,10 @@ static inline bool valid_type(enum misc_res_type type)
* Context: Any context.
* Return: Current total usage of the resource.
*/
-unsigned long misc_cg_res_total_usage(enum misc_res_type type)
+u64 misc_cg_res_total_usage(enum misc_res_type type)
{
if (valid_type(type))
- return atomic_long_read(&root_cg.res[type].usage);
+ return atomic64_read(&root_cg.res[type].usage);
return 0;
}
@@ -95,7 +95,7 @@ EXPORT_SYMBOL_GPL(misc_cg_res_total_usage);
* * %0 - Successfully registered the capacity.
* * %-EINVAL - If @type is invalid.
*/
-int misc_cg_set_capacity(enum misc_res_type type, unsigned long capacity)
+int misc_cg_set_capacity(enum misc_res_type type, u64 capacity)
{
if (!valid_type(type))
return -EINVAL;
@@ -114,9 +114,9 @@ EXPORT_SYMBOL_GPL(misc_cg_set_capacity);
* Context: Any context.
*/
static void misc_cg_cancel_charge(enum misc_res_type type, struct misc_cg *cg,
- unsigned long amount)
+ u64 amount)
{
- WARN_ONCE(atomic_long_add_negative(-amount, &cg->res[type].usage),
+ WARN_ONCE(atomic64_add_negative(-amount, &cg->res[type].usage),
"misc cgroup resource %s became less than 0",
misc_res_name[type]);
}
@@ -137,13 +137,12 @@ static void misc_cg_cancel_charge(enum misc_res_type type, struct misc_cg *cg,
* * -EBUSY - If max limit will be crossed or total usage will be more than the
* capacity.
*/
-int misc_cg_try_charge(enum misc_res_type type, struct misc_cg *cg,
- unsigned long amount)
+int misc_cg_try_charge(enum misc_res_type type, struct misc_cg *cg, u64 amount)
{
struct misc_cg *i, *j;
int ret;
struct misc_res *res;
- int new_usage;
+ u64 new_usage;
if (!(valid_type(type) && cg && READ_ONCE(misc_res_capacity[type])))
return -EINVAL;
@@ -154,7 +153,7 @@ int misc_cg_try_charge(enum misc_res_type type, struct misc_cg *cg,
for (i = cg; i; i = parent_misc(i)) {
res = &i->res[type];
- new_usage = atomic_long_add_return(amount, &res->usage);
+ new_usage = atomic64_add_return(amount, &res->usage);
if (new_usage > READ_ONCE(res->max) ||
new_usage > READ_ONCE(misc_res_capacity[type])) {
ret = -EBUSY;
@@ -165,7 +164,7 @@ int misc_cg_try_charge(enum misc_res_type type, struct misc_cg *cg,
err_charge:
for (j = i; j; j = parent_misc(j)) {
- atomic_long_inc(&j->res[type].events);
+ atomic64_inc(&j->res[type].events);
cgroup_file_notify(&j->events_file);
}
@@ -184,8 +183,7 @@ EXPORT_SYMBOL_GPL(misc_cg_try_charge);
*
* Context: Any context.
*/
-void misc_cg_uncharge(enum misc_res_type type, struct misc_cg *cg,
- unsigned long amount)
+void misc_cg_uncharge(enum misc_res_type type, struct misc_cg *cg, u64 amount)
{
struct misc_cg *i;
@@ -209,7 +207,7 @@ static int misc_cg_max_show(struct seq_file *sf, void *v)
{
int i;
struct misc_cg *cg = css_misc(seq_css(sf));
- unsigned long max;
+ u64 max;
for (i = 0; i < MISC_CG_RES_TYPES; i++) {
if (READ_ONCE(misc_res_capacity[i])) {
@@ -217,7 +215,7 @@ static int misc_cg_max_show(struct seq_file *sf, void *v)
if (max == MAX_NUM)
seq_printf(sf, "%s max\n", misc_res_name[i]);
else
- seq_printf(sf, "%s %lu\n", misc_res_name[i],
+ seq_printf(sf, "%s %llu\n", misc_res_name[i],
max);
}
}
@@ -241,13 +239,13 @@ static int misc_cg_max_show(struct seq_file *sf, void *v)
* Return:
* * >= 0 - Number of bytes processed in the input.
* * -EINVAL - If buf is not valid.
- * * -ERANGE - If number is bigger than the unsigned long capacity.
+ * * -ERANGE - If number is bigger than the u64 capacity.
*/
static ssize_t misc_cg_max_write(struct kernfs_open_file *of, char *buf,
size_t nbytes, loff_t off)
{
struct misc_cg *cg;
- unsigned long max;
+ u64 max;
int ret = 0, i;
enum misc_res_type type = MISC_CG_RES_TYPES;
char *token;
@@ -271,7 +269,7 @@ static ssize_t misc_cg_max_write(struct kernfs_open_file *of, char *buf,
if (!strcmp(MAX_STR, buf)) {
max = MAX_NUM;
} else {
- ret = kstrtoul(buf, 0, &max);
+ ret = kstrtou64(buf, 0, &max);
if (ret)
return ret;
}
@@ -297,13 +295,13 @@ static ssize_t misc_cg_max_write(struct kernfs_open_file *of, char *buf,
static int misc_cg_current_show(struct seq_file *sf, void *v)
{
int i;
- unsigned long usage;
+ u64 usage;
struct misc_cg *cg = css_misc(seq_css(sf));
for (i = 0; i < MISC_CG_RES_TYPES; i++) {
- usage = atomic_long_read(&cg->res[i].usage);
+ usage = atomic64_read(&cg->res[i].usage);
if (READ_ONCE(misc_res_capacity[i]) || usage)
- seq_printf(sf, "%s %lu\n", misc_res_name[i], usage);
+ seq_printf(sf, "%s %llu\n", misc_res_name[i], usage);
}
return 0;
@@ -322,12 +320,12 @@ static int misc_cg_current_show(struct seq_file *sf, void *v)
static int misc_cg_capacity_show(struct seq_file *sf, void *v)
{
int i;
- unsigned long cap;
+ u64 cap;
for (i = 0; i < MISC_CG_RES_TYPES; i++) {
cap = READ_ONCE(misc_res_capacity[i]);
if (cap)
- seq_printf(sf, "%s %lu\n", misc_res_name[i], cap);
+ seq_printf(sf, "%s %llu\n", misc_res_name[i], cap);
}
return 0;
@@ -336,12 +334,13 @@ static int misc_cg_capacity_show(struct seq_file *sf, void *v)
static int misc_events_show(struct seq_file *sf, void *v)
{
struct misc_cg *cg = css_misc(seq_css(sf));
- unsigned long events, i;
+ u64 events;
+ int i;
for (i = 0; i < MISC_CG_RES_TYPES; i++) {
- events = atomic_long_read(&cg->res[i].events);
+ events = atomic64_read(&cg->res[i].events);
if (READ_ONCE(misc_res_capacity[i]) || events)
- seq_printf(sf, "%s.max %lu\n", misc_res_name[i], events);
+ seq_printf(sf, "%s.max %llu\n", misc_res_name[i], events);
}
return 0;
}
@@ -397,7 +396,7 @@ misc_cg_alloc(struct cgroup_subsys_state *parent_css)
for (i = 0; i < MISC_CG_RES_TYPES; i++) {
WRITE_ONCE(cg->res[i].max, MAX_NUM);
- atomic_long_set(&cg->res[i].usage, 0);
+ atomic64_set(&cg->res[i].usage, 0);
}
return &cg->css;
diff --git a/kernel/cgroup/namespace.c b/kernel/cgroup/namespace.c
index 0d5c29879a50..144a464e45c6 100644
--- a/kernel/cgroup/namespace.c
+++ b/kernel/cgroup/namespace.c
@@ -149,9 +149,3 @@ const struct proc_ns_operations cgroupns_operations = {
.install = cgroupns_install,
.owner = cgroupns_owner,
};
-
-static __init int cgroup_namespaces_init(void)
-{
- return 0;
-}
-subsys_initcall(cgroup_namespaces_init);
diff --git a/kernel/cgroup/rstat.c b/kernel/cgroup/rstat.c
index 2542c21b6b6d..d80d7a608141 100644
--- a/kernel/cgroup/rstat.c
+++ b/kernel/cgroup/rstat.c
@@ -344,6 +344,7 @@ static void cgroup_base_stat_flush(struct cgroup *cgrp, int cpu)
{
struct cgroup_rstat_cpu *rstatc = cgroup_rstat_cpu(cgrp, cpu);
struct cgroup *parent = cgroup_parent(cgrp);
+ struct cgroup_rstat_cpu *prstatc;
struct cgroup_base_stat delta;
unsigned seq;
@@ -357,17 +358,24 @@ static void cgroup_base_stat_flush(struct cgroup *cgrp, int cpu)
delta = rstatc->bstat;
} while (__u64_stats_fetch_retry(&rstatc->bsync, seq));
- /* propagate percpu delta to global */
+ /* propagate per-cpu delta to cgroup and per-cpu global statistics */
cgroup_base_stat_sub(&delta, &rstatc->last_bstat);
cgroup_base_stat_add(&cgrp->bstat, &delta);
cgroup_base_stat_add(&rstatc->last_bstat, &delta);
+ cgroup_base_stat_add(&rstatc->subtree_bstat, &delta);
- /* propagate global delta to parent (unless that's root) */
+ /* propagate cgroup and per-cpu global delta to parent (unless that's root) */
if (cgroup_parent(parent)) {
delta = cgrp->bstat;
cgroup_base_stat_sub(&delta, &cgrp->last_bstat);
cgroup_base_stat_add(&parent->bstat, &delta);
cgroup_base_stat_add(&cgrp->last_bstat, &delta);
+
+ delta = rstatc->subtree_bstat;
+ prstatc = cgroup_rstat_cpu(parent, cpu);
+ cgroup_base_stat_sub(&delta, &rstatc->last_subtree_bstat);
+ cgroup_base_stat_add(&prstatc->subtree_bstat, &delta);
+ cgroup_base_stat_add(&rstatc->last_subtree_bstat, &delta);
}
}
diff --git a/kernel/configs/debug.config b/kernel/configs/debug.config
index e8db8d938661..4722b998a324 100644
--- a/kernel/configs/debug.config
+++ b/kernel/configs/debug.config
@@ -1,3 +1,5 @@
+# Help: Debugging for CI systems and finding regressions
+#
# The config is based on running daily CI for enterprise Linux distros to
# seek regressions on linux-next builds on different bare-metal and virtual
# platforms. It can be used for example,
diff --git a/kernel/configs/kvm_guest.config b/kernel/configs/kvm_guest.config
index 208481d91090..d0877063d925 100644
--- a/kernel/configs/kvm_guest.config
+++ b/kernel/configs/kvm_guest.config
@@ -1,3 +1,4 @@
+# Help: Bootable as a KVM guest
CONFIG_NET=y
CONFIG_NET_CORE=y
CONFIG_NETDEVICES=y
diff --git a/kernel/configs/nopm.config b/kernel/configs/nopm.config
index 81ff07863576..ebfdc3d8aa9a 100644
--- a/kernel/configs/nopm.config
+++ b/kernel/configs/nopm.config
@@ -1,3 +1,5 @@
+# Help: Disable Power Management
+
CONFIG_PM=n
CONFIG_SUSPEND=n
CONFIG_HIBERNATION=n
diff --git a/kernel/configs/rust.config b/kernel/configs/rust.config
index 38a7c5362c9c..2c6e001a7284 100644
--- a/kernel/configs/rust.config
+++ b/kernel/configs/rust.config
@@ -1 +1,2 @@
+# Help: Enable Rust
CONFIG_RUST=y
diff --git a/kernel/configs/tiny-base.config b/kernel/configs/tiny-base.config
index 2f0e6bf6db2c..ffb9dcafca26 100644
--- a/kernel/configs/tiny-base.config
+++ b/kernel/configs/tiny-base.config
@@ -1 +1 @@
-CONFIG_EMBEDDED=y
+CONFIG_EXPERT=y
diff --git a/kernel/configs/x86_debug.config b/kernel/configs/x86_debug.config
index 6fac5b405334..35f48671b8d5 100644
--- a/kernel/configs/x86_debug.config
+++ b/kernel/configs/x86_debug.config
@@ -1,3 +1,4 @@
+# Help: Debugging options for tip tree testing
CONFIG_X86_DEBUG_FPU=y
CONFIG_LOCK_STAT=y
CONFIG_DEBUG_VM=y
diff --git a/kernel/configs/xen.config b/kernel/configs/xen.config
index 436f806aa1ed..6878b9a49be8 100644
--- a/kernel/configs/xen.config
+++ b/kernel/configs/xen.config
@@ -1,3 +1,5 @@
+# Help: Bootable as a Xen guest
+#
# global stuff - these enable us to allow some
# of the not so generic stuff below for xen
CONFIG_PARAVIRT=y
diff --git a/kernel/cpu.c b/kernel/cpu.c
index f6811c857102..6de7c6bb74ee 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -1487,8 +1487,22 @@ out:
return ret;
}
+/*
+ * Arguments for __cpu_down_maps_locked(), bundled so they fit through a
+ * single void * parameter: the CPU to take down and the hotplug state to
+ * take it down to.
+ */
+struct cpu_down_work {
+ unsigned int cpu;
+ enum cpuhp_state target;
+};
+
+/*
+ * Callback handed to work_on_cpu() by cpu_down_maps_locked(). @arg points
+ * to a struct cpu_down_work; its fields are forwarded to _cpu_down() with
+ * a literal 0 as the second argument, and _cpu_down()'s result is returned.
+ */
+static long __cpu_down_maps_locked(void *arg)
+{
+ struct cpu_down_work *work = arg;
+
+ return _cpu_down(work->cpu, 0, work->target);
+}
+
static int cpu_down_maps_locked(unsigned int cpu, enum cpuhp_state target)
{
+ struct cpu_down_work work = { .cpu = cpu, .target = target, };
+
/*
* If the platform does not support hotplug, report it explicitly to
* differentiate it from a transient offlining failure.
@@ -1497,7 +1511,15 @@ static int cpu_down_maps_locked(unsigned int cpu, enum cpuhp_state target)
return -EOPNOTSUPP;
if (cpu_hotplug_disabled)
return -EBUSY;
- return _cpu_down(cpu, 0, target);
+
+ /*
+ * Ensure that the control task does not run on the to be offlined
+ * CPU to prevent a deadlock against cfs_b->period_timer.
+ */
+ cpu = cpumask_any_but(cpu_online_mask, cpu);
+ if (cpu >= nr_cpu_ids)
+ return -EBUSY;
+ return work_on_cpu(cpu, __cpu_down_maps_locked, &work);
}
static int cpu_down(unsigned int cpu, enum cpuhp_state target)
diff --git a/kernel/crash_core.c b/kernel/crash_core.c
index 90ce1dfd591c..03a7932cde0a 100644
--- a/kernel/crash_core.c
+++ b/kernel/crash_core.c
@@ -10,6 +10,9 @@
#include <linux/utsname.h>
#include <linux/vmalloc.h>
#include <linux/sizes.h>
+#include <linux/kexec.h>
+#include <linux/memory.h>
+#include <linux/cpuhotplug.h>
#include <asm/page.h>
#include <asm/sections.h>
@@ -17,6 +20,10 @@
#include <crypto/sha1.h>
#include "kallsyms_internal.h"
+#include "kexec_internal.h"
+
+/* Per cpu memory for storing cpu states in case of system crash. */
+note_buf_t __percpu *crash_notes;
/* vmcoreinfo stuff */
unsigned char *vmcoreinfo_data;
@@ -314,6 +321,187 @@ static int __init parse_crashkernel_dummy(char *arg)
}
early_param("crashkernel", parse_crashkernel_dummy);
+/*
+ * crash_prepare_elf64_headers - build an ELF64 core header describing the
+ * per-CPU crash notes, the vmcoreinfo note and a set of memory ranges.
+ * @mem: memory ranges; one PT_LOAD program header is emitted per entry
+ * @need_kernel_map: when non-zero, also emit a PT_LOAD header covering
+ * _text.._end
+ * @addr: on success, set to the vzalloc()'ed buffer holding the headers
+ * @sz: on success, set to the (aligned) size of that buffer
+ *
+ * Return: 0 on success, -ENOMEM if the buffer cannot be allocated.
+ * Nothing in this function frees the buffer; presumably the caller owns
+ * it afterwards (confirm against callers).
+ */
+int crash_prepare_elf64_headers(struct crash_mem *mem, int need_kernel_map,
+ void **addr, unsigned long *sz)
+{
+ Elf64_Ehdr *ehdr;
+ Elf64_Phdr *phdr;
+ unsigned long nr_cpus = num_possible_cpus(), nr_phdr, elf_sz;
+ unsigned char *buf;
+ unsigned int cpu, i;
+ unsigned long long notes_addr;
+ unsigned long mstart, mend;
+
+ /* extra phdr for vmcoreinfo ELF note */
+ nr_phdr = nr_cpus + 1;
+ nr_phdr += mem->nr_ranges;
+
+ /*
+ * kexec-tools creates an extra PT_LOAD phdr for kernel text mapping
+ * area (for example, ffffffff80000000 - ffffffffa0000000 on x86_64).
+ * I think this is required by tools like gdb. So same physical
+ * memory will be mapped in two ELF headers. One will contain kernel
+ * text virtual addresses and other will have __va(physical) addresses.
+ *
+ * The slot is reserved unconditionally here, although the header
+ * itself is only written below when need_kernel_map is non-zero.
+ */
+
+ nr_phdr++;
+ elf_sz = sizeof(Elf64_Ehdr) + nr_phdr * sizeof(Elf64_Phdr);
+ elf_sz = ALIGN(elf_sz, ELF_CORE_HEADER_ALIGN);
+
+ buf = vzalloc(elf_sz);
+ if (!buf)
+ return -ENOMEM;
+
+ /* The program header table starts right after the ELF header */
+ ehdr = (Elf64_Ehdr *)buf;
+ phdr = (Elf64_Phdr *)(ehdr + 1);
+ memcpy(ehdr->e_ident, ELFMAG, SELFMAG);
+ ehdr->e_ident[EI_CLASS] = ELFCLASS64;
+ ehdr->e_ident[EI_DATA] = ELFDATA2LSB;
+ ehdr->e_ident[EI_VERSION] = EV_CURRENT;
+ ehdr->e_ident[EI_OSABI] = ELF_OSABI;
+ memset(ehdr->e_ident + EI_PAD, 0, EI_NIDENT - EI_PAD);
+ ehdr->e_type = ET_CORE;
+ ehdr->e_machine = ELF_ARCH;
+ ehdr->e_version = EV_CURRENT;
+ ehdr->e_phoff = sizeof(Elf64_Ehdr);
+ ehdr->e_ehsize = sizeof(Elf64_Ehdr);
+ ehdr->e_phentsize = sizeof(Elf64_Phdr);
+
+ /* Prepare one phdr of type PT_NOTE for each possible CPU */
+ for_each_possible_cpu(cpu) {
+ phdr->p_type = PT_NOTE;
+ notes_addr = per_cpu_ptr_to_phys(per_cpu_ptr(crash_notes, cpu));
+ phdr->p_offset = phdr->p_paddr = notes_addr;
+ phdr->p_filesz = phdr->p_memsz = sizeof(note_buf_t);
+ (ehdr->e_phnum)++;
+ phdr++;
+ }
+
+ /* Prepare one PT_NOTE header for vmcoreinfo */
+ phdr->p_type = PT_NOTE;
+ phdr->p_offset = phdr->p_paddr = paddr_vmcoreinfo_note();
+ phdr->p_filesz = phdr->p_memsz = VMCOREINFO_NOTE_SIZE;
+ (ehdr->e_phnum)++;
+ phdr++;
+
+ /* Prepare PT_LOAD type program header for kernel text region */
+ if (need_kernel_map) {
+ phdr->p_type = PT_LOAD;
+ phdr->p_flags = PF_R|PF_W|PF_X;
+ phdr->p_vaddr = (unsigned long) _text;
+ phdr->p_filesz = phdr->p_memsz = _end - _text;
+ phdr->p_offset = phdr->p_paddr = __pa_symbol(_text);
+ ehdr->e_phnum++;
+ phdr++;
+ }
+
+ /* Go through all the ranges in mem->ranges[] and prepare phdr */
+ for (i = 0; i < mem->nr_ranges; i++) {
+ mstart = mem->ranges[i].start;
+ mend = mem->ranges[i].end;
+
+ phdr->p_type = PT_LOAD;
+ phdr->p_flags = PF_R|PF_W|PF_X;
+ phdr->p_offset = mstart;
+
+ phdr->p_paddr = mstart;
+ phdr->p_vaddr = (unsigned long) __va(mstart);
+ /* range end is inclusive, hence the + 1 */
+ phdr->p_filesz = phdr->p_memsz = mend - mstart + 1;
+ phdr->p_align = 0;
+ ehdr->e_phnum++;
+ pr_debug("Crash PT_LOAD ELF header. phdr=%p vaddr=0x%llx, paddr=0x%llx, sz=0x%llx e_phnum=%d p_offset=0x%llx\n",
+ phdr, phdr->p_vaddr, phdr->p_paddr, phdr->p_filesz,
+ ehdr->e_phnum, phdr->p_offset);
+ phdr++;
+ }
+
+ *addr = buf;
+ *sz = elf_sz;
+ return 0;
+}
+
+/*
+ * crash_exclude_mem_range - cut [mstart, mend] out of the ranges in @mem
+ * @mem: range array, edited in place
+ * @mstart: first address to exclude (inclusive)
+ * @mend: last address to exclude (inclusive)
+ *
+ * Ranges that are fully covered are dropped and the scan continues. The
+ * first range that is only partly covered is trimmed, or split in two when
+ * the excluded area lies strictly inside it, and the scan stops there.
+ *
+ * Return: 0 on success, -ENOMEM when a split is required but the array
+ * has no free slot. In that case @mem is left unmodified by the split.
+ */
+int crash_exclude_mem_range(struct crash_mem *mem,
+ unsigned long long mstart, unsigned long long mend)
+{
+ int i, j;
+ unsigned long long start, end, p_start, p_end;
+ struct range temp_range = {0, 0};
+
+ for (i = 0; i < mem->nr_ranges; i++) {
+ start = mem->ranges[i].start;
+ end = mem->ranges[i].end;
+ p_start = mstart;
+ p_end = mend;
+
+ if (mstart > end || mend < start)
+ continue;
+
+ /* Truncate any area outside of range */
+ if (mstart < start)
+ p_start = start;
+ if (mend > end)
+ p_end = end;
+
+ /* Found completely overlapping range */
+ if (p_start == start && p_end == end) {
+ mem->ranges[i].start = 0;
+ mem->ranges[i].end = 0;
+ if (i < mem->nr_ranges - 1) {
+ /* Shift rest of the ranges to left */
+ for (j = i; j < mem->nr_ranges - 1; j++) {
+ mem->ranges[j].start =
+ mem->ranges[j+1].start;
+ mem->ranges[j].end =
+ mem->ranges[j+1].end;
+ }
+
+ /*
+ * Continue to check whether there are other overlapping
+ * ranges from the current position, because the ranges
+ * above were shifted.
+ */
+ i--;
+ mem->nr_ranges--;
+ continue;
+ }
+ mem->nr_ranges--;
+ return 0;
+ }
+
+ if (p_start > start && p_end < end) {
+ /*
+ * Split original range. The tail half needs one more
+ * slot, so bail out before touching the array when it
+ * is already full; checking only the index of the
+ * split range would miss a full array whose split
+ * range is not in the last slot.
+ */
+ if (mem->nr_ranges >= mem->max_nr_ranges)
+ return -ENOMEM;
+ mem->ranges[i].end = p_start - 1;
+ temp_range.start = p_end + 1;
+ temp_range.end = end;
+ } else if (p_start != start)
+ mem->ranges[i].end = p_start - 1;
+ else
+ mem->ranges[i].start = p_end + 1;
+ break;
+ }
+
+ /* If a split happened, add the split to array */
+ if (!temp_range.end)
+ return 0;
+
+ /* Split happened; a free slot was verified before splitting */
+
+ /* Location where new range should go */
+ j = i + 1;
+ if (j < mem->nr_ranges) {
+ /* Move over all ranges one slot towards the end */
+ for (i = mem->nr_ranges - 1; i >= j; i--)
+ mem->ranges[i + 1] = mem->ranges[i];
+ }
+
+ mem->ranges[j].start = temp_range.start;
+ mem->ranges[j].end = temp_range.end;
+ mem->nr_ranges++;
+ return 0;
+}
+
Elf_Word *append_elf_note(Elf_Word *buf, char *name, unsigned int type,
void *data, size_t data_len)
{
@@ -455,8 +643,6 @@ static int __init crash_save_vmcoreinfo_init(void)
VMCOREINFO_OFFSET(page, lru);
VMCOREINFO_OFFSET(page, _mapcount);
VMCOREINFO_OFFSET(page, private);
- VMCOREINFO_OFFSET(folio, _folio_dtor);
- VMCOREINFO_OFFSET(folio, _folio_order);
VMCOREINFO_OFFSET(page, compound_head);
VMCOREINFO_OFFSET(pglist_data, node_zones);
VMCOREINFO_OFFSET(pglist_data, nr_zones);
@@ -490,7 +676,7 @@ static int __init crash_save_vmcoreinfo_init(void)
#define PAGE_BUDDY_MAPCOUNT_VALUE (~PG_buddy)
VMCOREINFO_NUMBER(PAGE_BUDDY_MAPCOUNT_VALUE);
#ifdef CONFIG_HUGETLB_PAGE
- VMCOREINFO_NUMBER(HUGETLB_PAGE_DTOR);
+ VMCOREINFO_NUMBER(PG_hugetlb);
#define PAGE_OFFLINE_MAPCOUNT_VALUE (~PG_offline)
VMCOREINFO_NUMBER(PAGE_OFFLINE_MAPCOUNT_VALUE);
#endif
@@ -515,3 +701,206 @@ static int __init crash_save_vmcoreinfo_init(void)
}
subsys_initcall(crash_save_vmcoreinfo_init);
+
+/*
+ * Initcall that allocates the per-CPU crash_notes buffers.
+ *
+ * Return: 0 on success, -ENOMEM if the per-CPU allocation fails.
+ */
+static int __init crash_notes_memory_init(void)
+{
+ /* Allocate memory for saving cpu registers. */
+ size_t size, align;
+
+ /*
+ * crash_notes could be allocated across 2 vmalloc pages when percpu
+ * is vmalloc based. vmalloc doesn't guarantee that 2 contiguous
+ * vmalloc pages are also on 2 contiguous physical pages. In this case
+ * the 2nd part of crash_notes in the 2nd page could be lost since only
+ * the starting address and size of crash_notes are exported through
+ * sysfs. Here round up the size of crash_notes to the nearest power of
+ * two (capped at PAGE_SIZE) and pass it to __alloc_percpu as the align
+ * value. This makes sure crash_notes is allocated inside one physical
+ * page.
+ */
+ size = sizeof(note_buf_t);
+ align = min(roundup_pow_of_two(sizeof(note_buf_t)), PAGE_SIZE);
+
+ /*
+ * Break compile if size is bigger than PAGE_SIZE since crash_notes
+ * definitely will be in 2 pages with that.
+ */
+ BUILD_BUG_ON(size > PAGE_SIZE);
+
+ crash_notes = __alloc_percpu(size, align);
+ if (!crash_notes) {
+ pr_warn("Memory allocation for saving cpu register states failed\n");
+ return -ENOMEM;
+ }
+ return 0;
+}
+subsys_initcall(crash_notes_memory_init);
+
+#ifdef CONFIG_CRASH_HOTPLUG
+#undef pr_fmt
+#define pr_fmt(fmt) "crash hp: " fmt
+
+/*
+ * This routine is utilized when the crash_hotplug sysfs node is read.
+ * It reflects the kernel's ability/permission to update the crash
+ * elfcorehdr directly.
+ *
+ * Return: 1 if a crash image is loaded in file mode, the image's
+ * update_elfcorehdr value if it was loaded otherwise, and 0 when no crash
+ * image is loaded or the kexec lock could not be taken.
+ */
+int crash_check_update_elfcorehdr(void)
+{
+ int rc = 0;
+
+ /* Obtain lock while reading crash information */
+ if (!kexec_trylock()) {
+ pr_info("kexec_trylock() failed, elfcorehdr may be inaccurate\n");
+ return 0;
+ }
+ if (kexec_crash_image) {
+ if (kexec_crash_image->file_mode)
+ rc = 1;
+ else
+ rc = kexec_crash_image->update_elfcorehdr;
+ }
+ /* Release lock now that the read is complete */
+ kexec_unlock();
+
+ return rc;
+}
+
+/*
+ * To accurately reflect hot un/plug changes of cpu and memory resources
+ * (including onlining and offlining of those resources), the elfcorehdr
+ * (which is passed to the crash kernel via the elfcorehdr= parameter)
+ * must be updated with the new list of CPUs and memories.
+ *
+ * In order to make changes to elfcorehdr, two conditions are needed:
+ * First, the segment containing the elfcorehdr must be large enough
+ * to permit a growing number of resources; the elfcorehdr memory size
+ * is based on NR_CPUS_DEFAULT and CRASH_MAX_MEMORY_RANGES.
+ * Second, purgatory must explicitly exclude the elfcorehdr from the
+ * list of segments it checks (since the elfcorehdr changes and thus
+ * would require an update to purgatory itself to update the digest).
+ *
+ * @hp_action: KEXEC_CRASH_HP_* action; stored in image->hp_action while
+ * the arch handler runs, then reset to KEXEC_CRASH_HP_NONE
+ * @cpu: CPU number; only used in the debug message for CPU add/remove
+ *
+ * Does nothing when the kexec lock cannot be taken, when no crash image
+ * is loaded, or when the image does not permit elfcorehdr updates.
+ */
+static void crash_handle_hotplug_event(unsigned int hp_action, unsigned int cpu)
+{
+ struct kimage *image;
+
+ /* Obtain lock while changing crash information */
+ if (!kexec_trylock()) {
+ pr_info("kexec_trylock() failed, elfcorehdr may be inaccurate\n");
+ return;
+ }
+
+ /* Check kdump is not loaded */
+ if (!kexec_crash_image)
+ goto out;
+
+ image = kexec_crash_image;
+
+ /* Check that updating elfcorehdr is permitted */
+ if (!(image->file_mode || image->update_elfcorehdr))
+ goto out;
+
+ if (hp_action == KEXEC_CRASH_HP_ADD_CPU ||
+ hp_action == KEXEC_CRASH_HP_REMOVE_CPU)
+ pr_debug("hp_action %u, cpu %u\n", hp_action, cpu);
+ else
+ pr_debug("hp_action %u\n", hp_action);
+
+ /*
+ * The elfcorehdr_index is set to -1 when the struct kimage
+ * is allocated. Find the segment containing the elfcorehdr,
+ * if not already found.
+ */
+ if (image->elfcorehdr_index < 0) {
+ unsigned long mem;
+ unsigned char *ptr;
+ unsigned int n;
+
+ /* Every segment is inspected; the last one starting with ELFMAG wins */
+ for (n = 0; n < image->nr_segments; n++) {
+ mem = image->segment[n].mem;
+ ptr = kmap_local_page(pfn_to_page(mem >> PAGE_SHIFT));
+ if (ptr) {
+ /* The segment containing elfcorehdr */
+ if (memcmp(ptr, ELFMAG, SELFMAG) == 0)
+ image->elfcorehdr_index = (int)n;
+ kunmap_local(ptr);
+ }
+ }
+ }
+
+ if (image->elfcorehdr_index < 0) {
+ /* Terminate the message so it is emitted as its own log line */
+ pr_err("unable to locate elfcorehdr segment\n");
+ goto out;
+ }
+
+ /* Needed in order for the segments to be updated */
+ arch_kexec_unprotect_crashkres();
+
+ /* Differentiate between normal load and hotplug update */
+ image->hp_action = hp_action;
+
+ /* Now invoke arch-specific update handler */
+ arch_crash_handle_hotplug_event(image);
+
+ /* No longer handling a hotplug event */
+ image->hp_action = KEXEC_CRASH_HP_NONE;
+ image->elfcorehdr_updated = true;
+
+ /* Change back to read-only */
+ arch_kexec_protect_crashkres();
+
+ /* Errors in the callback are not a reason to roll back state */
+out:
+ /* Release lock now that update complete */
+ kexec_unlock();
+}
+
+/*
+ * Memory hotplug notifier callback. MEM_ONLINE and MEM_OFFLINE are turned
+ * into add-memory / remove-memory hotplug events; every other @val is
+ * ignored. @nb and @v are unused. Always returns NOTIFY_OK.
+ */
+static int crash_memhp_notifier(struct notifier_block *nb, unsigned long val, void *v)
+{
+ switch (val) {
+ case MEM_ONLINE:
+ crash_handle_hotplug_event(KEXEC_CRASH_HP_ADD_MEMORY,
+ KEXEC_CRASH_HP_INVALID_CPU);
+ break;
+
+ case MEM_OFFLINE:
+ crash_handle_hotplug_event(KEXEC_CRASH_HP_REMOVE_MEMORY,
+ KEXEC_CRASH_HP_INVALID_CPU);
+ break;
+ }
+ return NOTIFY_OK;
+}
+
+/* Notifier block passed to register_memory_notifier() in crash_hotplug_init() */
+static struct notifier_block crash_memhp_nb = {
+ .notifier_call = crash_memhp_notifier,
+ .priority = 0
+};
+
+/*
+ * CPU hotplug callback registered in crash_hotplug_init(): reports @cpu as
+ * added to the crash hotplug handler. Always returns 0.
+ */
+static int crash_cpuhp_online(unsigned int cpu)
+{
+ crash_handle_hotplug_event(KEXEC_CRASH_HP_ADD_CPU, cpu);
+ return 0;
+}
+
+/*
+ * CPU hotplug callback registered in crash_hotplug_init(): reports @cpu as
+ * removed to the crash hotplug handler. Always returns 0.
+ */
+static int crash_cpuhp_offline(unsigned int cpu)
+{
+ crash_handle_hotplug_event(KEXEC_CRASH_HP_REMOVE_CPU, cpu);
+ return 0;
+}
+
+/*
+ * Initcall that hooks the crash hotplug handler into memory and CPU
+ * hotplug, each only when the corresponding config option is enabled.
+ *
+ * Return: whatever cpuhp_setup_state_nocalls() returned when
+ * CONFIG_HOTPLUG_CPU is enabled, 0 otherwise.
+ */
+static int __init crash_hotplug_init(void)
+{
+ int result = 0;
+
+ /* The return value of register_memory_notifier() is not checked */
+ if (IS_ENABLED(CONFIG_MEMORY_HOTPLUG))
+ register_memory_notifier(&crash_memhp_nb);
+
+ if (IS_ENABLED(CONFIG_HOTPLUG_CPU)) {
+ /* NOTE(review): confirm this is never positive for a dynamic state */
+ result = cpuhp_setup_state_nocalls(CPUHP_BP_PREPARE_DYN,
+ "crash/cpuhp", crash_cpuhp_online, crash_cpuhp_offline);
+ }
+
+ return result;
+}
+
+subsys_initcall(crash_hotplug_init);
+#endif
diff --git a/kernel/cred.c b/kernel/cred.c
index 811ad654abd1..98cb4eca23fb 100644
--- a/kernel/cred.c
+++ b/kernel/cred.c
@@ -4,6 +4,9 @@
* Copyright (C) 2008 Red Hat, Inc. All Rights Reserved.
* Written by David Howells (dhowells@redhat.com)
*/
+
+#define pr_fmt(fmt) "CRED: " fmt
+
#include <linux/export.h>
#include <linux/cred.h>
#include <linux/slab.h>
@@ -835,32 +838,32 @@ EXPORT_SYMBOL(creds_are_invalid);
static void dump_invalid_creds(const struct cred *cred, const char *label,
const struct task_struct *tsk)
{
- printk(KERN_ERR "CRED: %s credentials: %p %s%s%s\n",
+ pr_err("%s credentials: %p %s%s%s\n",
label, cred,
cred == &init_cred ? "[init]" : "",
cred == tsk->real_cred ? "[real]" : "",
cred == tsk->cred ? "[eff]" : "");
- printk(KERN_ERR "CRED: ->magic=%x, put_addr=%p\n",
+ pr_err("->magic=%x, put_addr=%p\n",
cred->magic, cred->put_addr);
- printk(KERN_ERR "CRED: ->usage=%d, subscr=%d\n",
+ pr_err("->usage=%d, subscr=%d\n",
atomic_read(&cred->usage),
read_cred_subscribers(cred));
- printk(KERN_ERR "CRED: ->*uid = { %d,%d,%d,%d }\n",
+ pr_err("->*uid = { %d,%d,%d,%d }\n",
from_kuid_munged(&init_user_ns, cred->uid),
from_kuid_munged(&init_user_ns, cred->euid),
from_kuid_munged(&init_user_ns, cred->suid),
from_kuid_munged(&init_user_ns, cred->fsuid));
- printk(KERN_ERR "CRED: ->*gid = { %d,%d,%d,%d }\n",
+ pr_err("->*gid = { %d,%d,%d,%d }\n",
from_kgid_munged(&init_user_ns, cred->gid),
from_kgid_munged(&init_user_ns, cred->egid),
from_kgid_munged(&init_user_ns, cred->sgid),
from_kgid_munged(&init_user_ns, cred->fsgid));
#ifdef CONFIG_SECURITY
- printk(KERN_ERR "CRED: ->security is %p\n", cred->security);
+ pr_err("->security is %p\n", cred->security);
if ((unsigned long) cred->security >= PAGE_SIZE &&
(((unsigned long) cred->security & 0xffffff00) !=
(POISON_FREE << 24 | POISON_FREE << 16 | POISON_FREE << 8)))
- printk(KERN_ERR "CRED: ->security {%x, %x}\n",
+ pr_err("->security {%x, %x}\n",
((u32*)cred->security)[0],
((u32*)cred->security)[1]);
#endif
@@ -871,8 +874,8 @@ static void dump_invalid_creds(const struct cred *cred, const char *label,
*/
void __noreturn __invalid_creds(const struct cred *cred, const char *file, unsigned line)
{
- printk(KERN_ERR "CRED: Invalid credentials\n");
- printk(KERN_ERR "CRED: At %s:%u\n", file, line);
+ pr_err("Invalid credentials\n");
+ pr_err("At %s:%u\n", file, line);
dump_invalid_creds(cred, "Specified", current);
BUG();
}
@@ -898,14 +901,14 @@ void __validate_process_creds(struct task_struct *tsk,
return;
invalid_creds:
- printk(KERN_ERR "CRED: Invalid process credentials\n");
- printk(KERN_ERR "CRED: At %s:%u\n", file, line);
+ pr_err("Invalid process credentials\n");
+ pr_err("At %s:%u\n", file, line);
dump_invalid_creds(tsk->real_cred, "Real", tsk);
if (tsk->cred != tsk->real_cred)
dump_invalid_creds(tsk->cred, "Effective", tsk);
else
- printk(KERN_ERR "CRED: Effective creds == Real creds\n");
+ pr_err("Effective creds == Real creds\n");
BUG();
}
EXPORT_SYMBOL(__validate_process_creds);
diff --git a/kernel/debug/debug_core.c b/kernel/debug/debug_core.c
index d5e9ccde3ab8..621037a0aa87 100644
--- a/kernel/debug/debug_core.c
+++ b/kernel/debug/debug_core.c
@@ -968,7 +968,7 @@ static int __init opt_kgdb_con(char *str)
early_param("kgdbcon", opt_kgdb_con);
#ifdef CONFIG_MAGIC_SYSRQ
-static void sysrq_handle_dbg(int key)
+static void sysrq_handle_dbg(u8 key)
{
if (!dbg_io_ops) {
pr_crit("ERROR: No KGDB I/O module available\n");
diff --git a/kernel/debug/kdb/kdb_io.c b/kernel/debug/kdb/kdb_io.c
index 813cb6cf72d6..9443bc63c5a2 100644
--- a/kernel/debug/kdb/kdb_io.c
+++ b/kernel/debug/kdb/kdb_io.c
@@ -590,6 +590,8 @@ static void kdb_msg_write(const char *msg, int msg_len)
continue;
if (c == dbg_io_ops->cons)
continue;
+ if (!c->write)
+ continue;
/*
* Set oops_in_progress to encourage the console drivers to
* disregard their internal spin locks: in the current calling
diff --git a/kernel/dma/Kconfig b/kernel/dma/Kconfig
index 11d077003205..4c1e9a3c0ab6 100644
--- a/kernel/dma/Kconfig
+++ b/kernel/dma/Kconfig
@@ -90,6 +90,19 @@ config SWIOTLB
bool
select NEED_DMA_MAP_STATE
+config SWIOTLB_DYNAMIC
+ bool "Dynamic allocation of DMA bounce buffers"
+ default n
+ depends on SWIOTLB
+ help
+ This enables dynamic resizing of the software IO TLB. The kernel
+ starts with one memory pool at boot and it will allocate additional
+ pools as needed. To reduce run-time kernel memory requirements, you
+ may have to specify a smaller size of the initial pool using
+ "swiotlb=" on the kernel command line.
+
+ If unsure, say N.
+
config DMA_BOUNCE_UNALIGNED_KMALLOC
bool
depends on SWIOTLB
@@ -145,15 +158,16 @@ config DMA_CMA
if DMA_CMA
-config DMA_PERNUMA_CMA
- bool "Enable separate DMA Contiguous Memory Area for each NUMA Node"
- default NUMA && ARM64
+config DMA_NUMA_CMA
+ bool "Enable separate DMA Contiguous Memory Area for NUMA Node"
+ default NUMA
help
- Enable this option to get pernuma CMA areas so that devices like
- ARM64 SMMU can get local memory by DMA coherent APIs.
+ Enable this option to get numa CMA areas so that NUMA devices
+ can get local memory by DMA coherent APIs.
You can set the size of pernuma CMA by specifying "cma_pernuma=size"
- on the kernel's command line.
+ or set the node id and its size of CMA by specifying "numa_cma=
+ <node>:size[,<node>:size]" on the kernel's command line.
comment "Default contiguous memory area size:"
diff --git a/kernel/dma/contiguous.c b/kernel/dma/contiguous.c
index 6ea80ae42622..88c595e49e34 100644
--- a/kernel/dma/contiguous.c
+++ b/kernel/dma/contiguous.c
@@ -50,6 +50,7 @@
#include <linux/sizes.h>
#include <linux/dma-map-ops.h>
#include <linux/cma.h>
+#include <linux/nospec.h>
#ifdef CONFIG_CMA_SIZE_MBYTES
#define CMA_SIZE_MBYTES CONFIG_CMA_SIZE_MBYTES
@@ -96,11 +97,44 @@ static int __init early_cma(char *p)
}
early_param("cma", early_cma);
-#ifdef CONFIG_DMA_PERNUMA_CMA
+#ifdef CONFIG_DMA_NUMA_CMA
+static struct cma *dma_contiguous_numa_area[MAX_NUMNODES];
+static phys_addr_t numa_cma_size[MAX_NUMNODES] __initdata;
static struct cma *dma_contiguous_pernuma_area[MAX_NUMNODES];
static phys_addr_t pernuma_size_bytes __initdata;
+static int __init early_numa_cma(char *p)
+{
+ int nid, count = 0;
+ unsigned long tmp;
+ char *s = p;
+
+ while (*s) {
+ if (sscanf(s, "%lu%n", &tmp, &count) != 1)
+ break;
+
+ if (s[count] == ':') {
+ if (tmp >= MAX_NUMNODES)
+ break;
+ nid = array_index_nospec(tmp, MAX_NUMNODES);
+
+ s += count + 1;
+ tmp = memparse(s, &s);
+ numa_cma_size[nid] = tmp;
+
+ if (*s == ',')
+ s++;
+ else
+ break;
+ } else
+ break;
+ }
+
+ return 0;
+}
+early_param("numa_cma", early_numa_cma);
+
static int __init early_cma_pernuma(char *p)
{
pernuma_size_bytes = memparse(p, &p);
@@ -127,32 +161,49 @@ static inline __maybe_unused phys_addr_t cma_early_percent_memory(void)
#endif
-#ifdef CONFIG_DMA_PERNUMA_CMA
-void __init dma_pernuma_cma_reserve(void)
+#ifdef CONFIG_DMA_NUMA_CMA
+static void __init dma_numa_cma_reserve(void)
{
int nid;
- if (!pernuma_size_bytes)
- return;
-
- for_each_online_node(nid) {
+ for_each_node(nid) {
int ret;
char name[CMA_MAX_NAME];
- struct cma **cma = &dma_contiguous_pernuma_area[nid];
-
- snprintf(name, sizeof(name), "pernuma%d", nid);
- ret = cma_declare_contiguous_nid(0, pernuma_size_bytes, 0, 0,
- 0, false, name, cma, nid);
- if (ret) {
- pr_warn("%s: reservation failed: err %d, node %d", __func__,
- ret, nid);
+ struct cma **cma;
+
+ if (!node_online(nid)) {
+ if (pernuma_size_bytes || numa_cma_size[nid])
+ pr_warn("invalid node %d specified\n", nid);
continue;
}
- pr_debug("%s: reserved %llu MiB on node %d\n", __func__,
- (unsigned long long)pernuma_size_bytes / SZ_1M, nid);
+ if (pernuma_size_bytes) {
+
+ cma = &dma_contiguous_pernuma_area[nid];
+ snprintf(name, sizeof(name), "pernuma%d", nid);
+ ret = cma_declare_contiguous_nid(0, pernuma_size_bytes, 0, 0,
+ 0, false, name, cma, nid);
+ if (ret)
+ pr_warn("%s: reservation failed: err %d, node %d", __func__,
+ ret, nid);
+ }
+
+ if (numa_cma_size[nid]) {
+
+ cma = &dma_contiguous_numa_area[nid];
+ snprintf(name, sizeof(name), "numa%d", nid);
+ ret = cma_declare_contiguous_nid(0, numa_cma_size[nid], 0, 0, 0, false,
+ name, cma, nid);
+ if (ret)
+ pr_warn("%s: reservation failed: err %d, node %d", __func__,
+ ret, nid);
+ }
}
}
+#else
+static inline void __init dma_numa_cma_reserve(void)
+{
+}
#endif
/**
@@ -171,6 +222,8 @@ void __init dma_contiguous_reserve(phys_addr_t limit)
phys_addr_t selected_limit = limit;
bool fixed = false;
+ dma_numa_cma_reserve();
+
pr_debug("%s(limit %08lx)\n", __func__, (unsigned long)limit);
if (size_cmdline != -1) {
@@ -303,7 +356,7 @@ static struct page *cma_alloc_aligned(struct cma *cma, size_t size, gfp_t gfp)
*/
struct page *dma_alloc_contiguous(struct device *dev, size_t size, gfp_t gfp)
{
-#ifdef CONFIG_DMA_PERNUMA_CMA
+#ifdef CONFIG_DMA_NUMA_CMA
int nid = dev_to_node(dev);
#endif
@@ -315,7 +368,7 @@ struct page *dma_alloc_contiguous(struct device *dev, size_t size, gfp_t gfp)
if (size <= PAGE_SIZE)
return NULL;
-#ifdef CONFIG_DMA_PERNUMA_CMA
+#ifdef CONFIG_DMA_NUMA_CMA
if (nid != NUMA_NO_NODE && !(gfp & (GFP_DMA | GFP_DMA32))) {
struct cma *cma = dma_contiguous_pernuma_area[nid];
struct page *page;
@@ -325,6 +378,13 @@ struct page *dma_alloc_contiguous(struct device *dev, size_t size, gfp_t gfp)
if (page)
return page;
}
+
+ cma = dma_contiguous_numa_area[nid];
+ if (cma) {
+ page = cma_alloc_aligned(cma, size, gfp);
+ if (page)
+ return page;
+ }
}
#endif
if (!dma_contiguous_default_area)
@@ -356,10 +416,13 @@ void dma_free_contiguous(struct device *dev, struct page *page, size_t size)
/*
* otherwise, page is from either per-numa cma or default cma
*/
-#ifdef CONFIG_DMA_PERNUMA_CMA
+#ifdef CONFIG_DMA_NUMA_CMA
if (cma_release(dma_contiguous_pernuma_area[page_to_nid(page)],
page, count))
return;
+ if (cma_release(dma_contiguous_numa_area[page_to_nid(page)],
+ page, count))
+ return;
#endif
if (cma_release(dma_contiguous_default_area, page, count))
return;
@@ -410,6 +473,11 @@ static int __init rmem_cma_setup(struct reserved_mem *rmem)
return -EBUSY;
}
+ if (memblock_is_region_reserved(rmem->base, rmem->size)) {
+ pr_info("Reserved memory: overlap with other memblock reserved region\n");
+ return -EBUSY;
+ }
+
if (!of_get_flat_dt_prop(node, "reusable", NULL) ||
of_get_flat_dt_prop(node, "no-map", NULL))
return -EINVAL;
diff --git a/kernel/dma/direct.c b/kernel/dma/direct.c
index d29cade048db..9596ae1aa0da 100644
--- a/kernel/dma/direct.c
+++ b/kernel/dma/direct.c
@@ -66,7 +66,7 @@ static gfp_t dma_direct_optimal_gfp_mask(struct device *dev, u64 *phys_limit)
return 0;
}
-static bool dma_coherent_ok(struct device *dev, phys_addr_t phys, size_t size)
+bool dma_coherent_ok(struct device *dev, phys_addr_t phys, size_t size)
{
dma_addr_t dma_addr = phys_to_dma_direct(dev, phys);
diff --git a/kernel/dma/mapping.c b/kernel/dma/mapping.c
index 9a4db5cce600..e323ca48f7f2 100644
--- a/kernel/dma/mapping.c
+++ b/kernel/dma/mapping.c
@@ -760,12 +760,6 @@ bool dma_pci_p2pdma_supported(struct device *dev)
}
EXPORT_SYMBOL_GPL(dma_pci_p2pdma_supported);
-#ifdef CONFIG_ARCH_HAS_DMA_SET_MASK
-void arch_dma_set_mask(struct device *dev, u64 mask);
-#else
-#define arch_dma_set_mask(dev, mask) do { } while (0)
-#endif
-
int dma_set_mask(struct device *dev, u64 mask)
{
/*
diff --git a/kernel/dma/swiotlb.c b/kernel/dma/swiotlb.c
index 2b83e3ad9dca..394494a6b1f3 100644
--- a/kernel/dma/swiotlb.c
+++ b/kernel/dma/swiotlb.c
@@ -35,6 +35,7 @@
#include <linux/memblock.h>
#include <linux/mm.h>
#include <linux/pfn.h>
+#include <linux/rculist.h>
#include <linux/scatterlist.h>
#include <linux/set_memory.h>
#include <linux/spinlock.h>
@@ -62,6 +63,13 @@
#define INVALID_PHYS_ADDR (~(phys_addr_t)0)
+/**
+ * struct io_tlb_slot - IO TLB slot descriptor
+ * @orig_addr: The original address corresponding to a mapped entry.
+ * @alloc_size: Size of the allocated buffer.
+ * @list: The free list describing the number of free entries available
+ * from each index.
+ */
struct io_tlb_slot {
phys_addr_t orig_addr;
size_t alloc_size;
@@ -71,7 +79,22 @@ struct io_tlb_slot {
static bool swiotlb_force_bounce;
static bool swiotlb_force_disable;
-struct io_tlb_mem io_tlb_default_mem;
+#ifdef CONFIG_SWIOTLB_DYNAMIC
+
+static void swiotlb_dyn_alloc(struct work_struct *work);
+
+static struct io_tlb_mem io_tlb_default_mem = {
+ .lock = __SPIN_LOCK_UNLOCKED(io_tlb_default_mem.lock),
+ .pools = LIST_HEAD_INIT(io_tlb_default_mem.pools),
+ .dyn_alloc = __WORK_INITIALIZER(io_tlb_default_mem.dyn_alloc,
+ swiotlb_dyn_alloc),
+};
+
+#else /* !CONFIG_SWIOTLB_DYNAMIC */
+
+static struct io_tlb_mem io_tlb_default_mem;
+
+#endif /* CONFIG_SWIOTLB_DYNAMIC */
static unsigned long default_nslabs = IO_TLB_DEFAULT_SIZE >> IO_TLB_SHIFT;
static unsigned long default_nareas;
@@ -202,7 +225,7 @@ void __init swiotlb_adjust_size(unsigned long size)
void swiotlb_print_info(void)
{
- struct io_tlb_mem *mem = &io_tlb_default_mem;
+ struct io_tlb_pool *mem = &io_tlb_default_mem.defpool;
if (!mem->nslabs) {
pr_warn("No low mem\n");
@@ -231,7 +254,7 @@ static inline unsigned long nr_slots(u64 val)
*/
void __init swiotlb_update_mem_attributes(void)
{
- struct io_tlb_mem *mem = &io_tlb_default_mem;
+ struct io_tlb_pool *mem = &io_tlb_default_mem.defpool;
unsigned long bytes;
if (!mem->nslabs || mem->late_alloc)
@@ -240,9 +263,8 @@ void __init swiotlb_update_mem_attributes(void)
set_memory_decrypted((unsigned long)mem->vaddr, bytes >> PAGE_SHIFT);
}
-static void swiotlb_init_io_tlb_mem(struct io_tlb_mem *mem, phys_addr_t start,
- unsigned long nslabs, unsigned int flags,
- bool late_alloc, unsigned int nareas)
+static void swiotlb_init_io_tlb_pool(struct io_tlb_pool *mem, phys_addr_t start,
+ unsigned long nslabs, bool late_alloc, unsigned int nareas)
{
void *vaddr = phys_to_virt(start);
unsigned long bytes = nslabs << IO_TLB_SHIFT, i;
@@ -254,8 +276,6 @@ static void swiotlb_init_io_tlb_mem(struct io_tlb_mem *mem, phys_addr_t start,
mem->nareas = nareas;
mem->area_nslabs = nslabs / mem->nareas;
- mem->force_bounce = swiotlb_force_bounce || (flags & SWIOTLB_FORCE);
-
for (i = 0; i < mem->nareas; i++) {
spin_lock_init(&mem->areas[i].lock);
mem->areas[i].index = 0;
@@ -273,6 +293,23 @@ static void swiotlb_init_io_tlb_mem(struct io_tlb_mem *mem, phys_addr_t start,
return;
}
+/**
+ * add_mem_pool() - add a memory pool to the allocator
+ * @mem: Software IO TLB allocator.
+ * @pool: Memory pool to be added.
+ */
+static void add_mem_pool(struct io_tlb_mem *mem, struct io_tlb_pool *pool)
+{
+#ifdef CONFIG_SWIOTLB_DYNAMIC
+ spin_lock(&mem->lock);
+ list_add_rcu(&pool->node, &mem->pools);
+ mem->nslabs += pool->nslabs;
+ spin_unlock(&mem->lock);
+#else
+ mem->nslabs = pool->nslabs;
+#endif
+}
+
static void __init *swiotlb_memblock_alloc(unsigned long nslabs,
unsigned int flags,
int (*remap)(void *tlb, unsigned long nslabs))
@@ -312,7 +349,7 @@ static void __init *swiotlb_memblock_alloc(unsigned long nslabs,
void __init swiotlb_init_remap(bool addressing_limit, unsigned int flags,
int (*remap)(void *tlb, unsigned long nslabs))
{
- struct io_tlb_mem *mem = &io_tlb_default_mem;
+ struct io_tlb_pool *mem = &io_tlb_default_mem.defpool;
unsigned long nslabs;
unsigned int nareas;
size_t alloc_size;
@@ -323,6 +360,18 @@ void __init swiotlb_init_remap(bool addressing_limit, unsigned int flags,
if (swiotlb_force_disable)
return;
+ io_tlb_default_mem.force_bounce =
+ swiotlb_force_bounce || (flags & SWIOTLB_FORCE);
+
+#ifdef CONFIG_SWIOTLB_DYNAMIC
+ if (!remap)
+ io_tlb_default_mem.can_grow = true;
+ if (flags & SWIOTLB_ANY)
+ io_tlb_default_mem.phys_limit = virt_to_phys(high_memory - 1);
+ else
+ io_tlb_default_mem.phys_limit = ARCH_LOW_ADDRESS_LIMIT;
+#endif
+
if (!default_nareas)
swiotlb_adjust_nareas(num_possible_cpus());
@@ -356,8 +405,9 @@ void __init swiotlb_init_remap(bool addressing_limit, unsigned int flags,
return;
}
- swiotlb_init_io_tlb_mem(mem, __pa(tlb), nslabs, flags, false,
- default_nareas);
+ swiotlb_init_io_tlb_pool(mem, __pa(tlb), nslabs, false,
+ default_nareas);
+ add_mem_pool(&io_tlb_default_mem, mem);
if (flags & SWIOTLB_VERBOSE)
swiotlb_print_info();
@@ -376,7 +426,7 @@ void __init swiotlb_init(bool addressing_limit, unsigned int flags)
int swiotlb_init_late(size_t size, gfp_t gfp_mask,
int (*remap)(void *tlb, unsigned long nslabs))
{
- struct io_tlb_mem *mem = &io_tlb_default_mem;
+ struct io_tlb_pool *mem = &io_tlb_default_mem.defpool;
unsigned long nslabs = ALIGN(size >> IO_TLB_SHIFT, IO_TLB_SEGSIZE);
unsigned int nareas;
unsigned char *vstart = NULL;
@@ -384,9 +434,25 @@ int swiotlb_init_late(size_t size, gfp_t gfp_mask,
bool retried = false;
int rc = 0;
+ if (io_tlb_default_mem.nslabs)
+ return 0;
+
if (swiotlb_force_disable)
return 0;
+ io_tlb_default_mem.force_bounce = swiotlb_force_bounce;
+
+#ifdef CONFIG_SWIOTLB_DYNAMIC
+ if (!remap)
+ io_tlb_default_mem.can_grow = true;
+ if (IS_ENABLED(CONFIG_ZONE_DMA) && (gfp_mask & __GFP_DMA))
+ io_tlb_default_mem.phys_limit = DMA_BIT_MASK(zone_dma_bits);
+ else if (IS_ENABLED(CONFIG_ZONE_DMA32) && (gfp_mask & __GFP_DMA32))
+ io_tlb_default_mem.phys_limit = DMA_BIT_MASK(32);
+ else
+ io_tlb_default_mem.phys_limit = virt_to_phys(high_memory - 1);
+#endif
+
if (!default_nareas)
swiotlb_adjust_nareas(num_possible_cpus());
@@ -438,8 +504,9 @@ retry:
set_memory_decrypted((unsigned long)vstart,
(nslabs << IO_TLB_SHIFT) >> PAGE_SHIFT);
- swiotlb_init_io_tlb_mem(mem, virt_to_phys(vstart), nslabs, 0, true,
- nareas);
+ swiotlb_init_io_tlb_pool(mem, virt_to_phys(vstart), nslabs, true,
+ nareas);
+ add_mem_pool(&io_tlb_default_mem, mem);
swiotlb_print_info();
return 0;
@@ -453,7 +520,7 @@ error_area:
void __init swiotlb_exit(void)
{
- struct io_tlb_mem *mem = &io_tlb_default_mem;
+ struct io_tlb_pool *mem = &io_tlb_default_mem.defpool;
unsigned long tbl_vaddr;
size_t tbl_size, slots_size;
unsigned int area_order;
@@ -486,6 +553,265 @@ void __init swiotlb_exit(void)
memset(mem, 0, sizeof(*mem));
}
+#ifdef CONFIG_SWIOTLB_DYNAMIC
+
+/**
+ * alloc_dma_pages() - allocate pages to be used for DMA
+ * @gfp: GFP flags for the allocation.
+ * @bytes: Size of the buffer.
+ *
+ * Allocate pages from the buddy allocator. If successful, make the allocated
+ * pages decrypted so that they can be used for DMA.
+ *
+ * Return: Decrypted pages, or %NULL on failure.
+ */
+static struct page *alloc_dma_pages(gfp_t gfp, size_t bytes)
+{
+ unsigned int order = get_order(bytes);
+ struct page *page;
+ void *vaddr;
+
+ page = alloc_pages(gfp, order);
+ if (!page)
+ return NULL;
+
+ vaddr = page_address(page);
+ if (set_memory_decrypted((unsigned long)vaddr, PFN_UP(bytes)))
+ goto error;
+ return page;
+
+error:
+ __free_pages(page, order);
+ return NULL;
+}
+
+/**
+ * swiotlb_alloc_tlb() - allocate a dynamic IO TLB buffer
+ * @dev: Device for which a memory pool is allocated.
+ * @bytes: Size of the buffer.
+ * @phys_limit: Maximum allowed physical address of the buffer.
+ * @gfp: GFP flags for the allocation.
+ *
+ * Return: Allocated pages, or %NULL on allocation failure.
+ */
+static struct page *swiotlb_alloc_tlb(struct device *dev, size_t bytes,
+ u64 phys_limit, gfp_t gfp)
+{
+ struct page *page;
+
+ /*
+ * Allocate from the atomic pools if memory is encrypted and
+ * the allocation is atomic, because decrypting may block.
+ */
+ if (!gfpflags_allow_blocking(gfp) && dev && force_dma_unencrypted(dev)) {
+ void *vaddr;
+
+ if (!IS_ENABLED(CONFIG_DMA_COHERENT_POOL))
+ return NULL;
+
+ return dma_alloc_from_pool(dev, bytes, &vaddr, gfp,
+ dma_coherent_ok);
+ }
+
+ gfp &= ~GFP_ZONEMASK;
+ if (phys_limit <= DMA_BIT_MASK(zone_dma_bits))
+ gfp |= __GFP_DMA;
+ else if (phys_limit <= DMA_BIT_MASK(32))
+ gfp |= __GFP_DMA32;
+
+ while ((page = alloc_dma_pages(gfp, bytes)) &&
+ page_to_phys(page) + bytes - 1 > phys_limit) {
+ /* allocated, but too high */
+ __free_pages(page, get_order(bytes));
+
+ if (IS_ENABLED(CONFIG_ZONE_DMA32) &&
+ phys_limit < DMA_BIT_MASK(64) &&
+ !(gfp & (__GFP_DMA32 | __GFP_DMA)))
+ gfp |= __GFP_DMA32;
+ else if (IS_ENABLED(CONFIG_ZONE_DMA) &&
+ !(gfp & __GFP_DMA))
+ gfp = (gfp & ~__GFP_DMA32) | __GFP_DMA;
+ else
+ return NULL;
+ }
+
+ return page;
+}
+
+/**
+ * swiotlb_free_tlb() - free a dynamically allocated IO TLB buffer
+ * @vaddr: Virtual address of the buffer.
+ * @bytes: Size of the buffer.
+ */
+static void swiotlb_free_tlb(void *vaddr, size_t bytes)
+{
+ if (IS_ENABLED(CONFIG_DMA_COHERENT_POOL) &&
+ dma_free_from_pool(NULL, vaddr, bytes))
+ return;
+
+ /* Intentional leak if pages cannot be encrypted again. */
+ if (!set_memory_encrypted((unsigned long)vaddr, PFN_UP(bytes)))
+ __free_pages(virt_to_page(vaddr), get_order(bytes));
+}
+
+/**
+ * swiotlb_alloc_pool() - allocate a new IO TLB memory pool
+ * @dev: Device for which a memory pool is allocated.
+ * @minslabs: Minimum number of slabs.
+ * @nslabs: Desired (maximum) number of slabs.
+ * @nareas: Number of areas.
+ * @phys_limit: Maximum DMA buffer physical address.
+ * @gfp: GFP flags for the allocations.
+ *
+ * Allocate and initialize a new IO TLB memory pool. The actual number of
+ * slabs may be reduced if allocation of @nslabs fails. If even
+ * @minslabs cannot be allocated, this function fails.
+ *
+ * Return: New memory pool, or %NULL on allocation failure.
+ */
+static struct io_tlb_pool *swiotlb_alloc_pool(struct device *dev,
+ unsigned long minslabs, unsigned long nslabs,
+ unsigned int nareas, u64 phys_limit, gfp_t gfp)
+{
+ struct io_tlb_pool *pool;
+ unsigned int slot_order;
+ struct page *tlb;
+ size_t pool_size;
+ size_t tlb_size;
+
+ pool_size = sizeof(*pool) + array_size(sizeof(*pool->areas), nareas);
+ pool = kzalloc(pool_size, gfp);
+ if (!pool)
+ goto error;
+ pool->areas = (void *)pool + sizeof(*pool);
+
+ tlb_size = nslabs << IO_TLB_SHIFT;
+ while (!(tlb = swiotlb_alloc_tlb(dev, tlb_size, phys_limit, gfp))) {
+ if (nslabs <= minslabs)
+ goto error_tlb;
+ nslabs = ALIGN(nslabs >> 1, IO_TLB_SEGSIZE);
+ nareas = limit_nareas(nareas, nslabs);
+ tlb_size = nslabs << IO_TLB_SHIFT;
+ }
+
+ slot_order = get_order(array_size(sizeof(*pool->slots), nslabs));
+ pool->slots = (struct io_tlb_slot *)
+ __get_free_pages(gfp, slot_order);
+ if (!pool->slots)
+ goto error_slots;
+
+ swiotlb_init_io_tlb_pool(pool, page_to_phys(tlb), nslabs, true, nareas);
+ return pool;
+
+error_slots:
+ swiotlb_free_tlb(page_address(tlb), tlb_size);
+error_tlb:
+ kfree(pool);
+error:
+ return NULL;
+}
+
+/**
+ * swiotlb_dyn_alloc() - dynamic memory pool allocation worker
+ * @work: Pointer to dyn_alloc in struct io_tlb_mem.
+ */
+static void swiotlb_dyn_alloc(struct work_struct *work)
+{
+ struct io_tlb_mem *mem =
+ container_of(work, struct io_tlb_mem, dyn_alloc);
+ struct io_tlb_pool *pool;
+
+ pool = swiotlb_alloc_pool(NULL, IO_TLB_MIN_SLABS, default_nslabs,
+ default_nareas, mem->phys_limit, GFP_KERNEL);
+ if (!pool) {
+ pr_warn_ratelimited("Failed to allocate new pool");
+ return;
+ }
+
+ add_mem_pool(mem, pool);
+
+ /* Pairs with smp_rmb() in is_swiotlb_buffer(). */
+ smp_wmb();
+}
+
+/**
+ * swiotlb_dyn_free() - RCU callback to free a memory pool
+ * @rcu: RCU head in the corresponding struct io_tlb_pool.
+ */
+static void swiotlb_dyn_free(struct rcu_head *rcu)
+{
+ struct io_tlb_pool *pool = container_of(rcu, struct io_tlb_pool, rcu);
+ size_t slots_size = array_size(sizeof(*pool->slots), pool->nslabs);
+ size_t tlb_size = pool->end - pool->start;
+
+ free_pages((unsigned long)pool->slots, get_order(slots_size));
+ swiotlb_free_tlb(pool->vaddr, tlb_size);
+ kfree(pool);
+}
+
+/**
+ * swiotlb_find_pool() - find the IO TLB pool for a physical address
+ * @dev: Device which has mapped the DMA buffer.
+ * @paddr: Physical address within the DMA buffer.
+ *
+ * Find the IO TLB memory pool descriptor which contains the given physical
+ * address, if any.
+ *
+ * Return: Memory pool which contains @paddr, or %NULL if none.
+ */
+struct io_tlb_pool *swiotlb_find_pool(struct device *dev, phys_addr_t paddr)
+{
+ struct io_tlb_mem *mem = dev->dma_io_tlb_mem;
+ struct io_tlb_pool *pool;
+
+ rcu_read_lock();
+ list_for_each_entry_rcu(pool, &mem->pools, node) {
+ if (paddr >= pool->start && paddr < pool->end)
+ goto out;
+ }
+
+ list_for_each_entry_rcu(pool, &dev->dma_io_tlb_pools, node) {
+ if (paddr >= pool->start && paddr < pool->end)
+ goto out;
+ }
+ pool = NULL;
+out:
+ rcu_read_unlock();
+ return pool;
+}
+
+/**
+ * swiotlb_del_pool() - remove an IO TLB pool from a device
+ * @dev: Owning device.
+ * @pool: Memory pool to be removed.
+ */
+static void swiotlb_del_pool(struct device *dev, struct io_tlb_pool *pool)
+{
+ unsigned long flags;
+
+ spin_lock_irqsave(&dev->dma_io_tlb_lock, flags);
+ list_del_rcu(&pool->node);
+ spin_unlock_irqrestore(&dev->dma_io_tlb_lock, flags);
+
+ call_rcu(&pool->rcu, swiotlb_dyn_free);
+}
+
+#endif /* CONFIG_SWIOTLB_DYNAMIC */
+
+/**
+ * swiotlb_dev_init() - initialize swiotlb fields in &struct device
+ * @dev: Device to be initialized.
+ */
+void swiotlb_dev_init(struct device *dev)
+{
+ dev->dma_io_tlb_mem = &io_tlb_default_mem;
+#ifdef CONFIG_SWIOTLB_DYNAMIC
+ INIT_LIST_HEAD(&dev->dma_io_tlb_pools);
+ spin_lock_init(&dev->dma_io_tlb_lock);
+ dev->dma_uses_io_tlb = false;
+#endif
+}
+
/*
* Return the offset into a iotlb slot required to keep the device happy.
*/
@@ -500,7 +826,7 @@ static unsigned int swiotlb_align_offset(struct device *dev, u64 addr)
static void swiotlb_bounce(struct device *dev, phys_addr_t tlb_addr, size_t size,
enum dma_data_direction dir)
{
- struct io_tlb_mem *mem = dev->dma_io_tlb_mem;
+ struct io_tlb_pool *mem = swiotlb_find_pool(dev, tlb_addr);
int index = (tlb_addr - mem->start) >> IO_TLB_SHIFT;
phys_addr_t orig_addr = mem->slots[index].orig_addr;
size_t alloc_size = mem->slots[index].alloc_size;
@@ -577,12 +903,10 @@ static inline phys_addr_t slot_addr(phys_addr_t start, phys_addr_t idx)
*/
static inline unsigned long get_max_slots(unsigned long boundary_mask)
{
- if (boundary_mask == ~0UL)
- return 1UL << (BITS_PER_LONG - IO_TLB_SHIFT);
- return nr_slots(boundary_mask + 1);
+ return (boundary_mask >> IO_TLB_SHIFT) + 1;
}
-static unsigned int wrap_area_index(struct io_tlb_mem *mem, unsigned int index)
+static unsigned int wrap_area_index(struct io_tlb_pool *mem, unsigned int index)
{
if (index >= mem->area_nslabs)
return 0;
@@ -623,19 +947,30 @@ static void dec_used(struct io_tlb_mem *mem, unsigned int nslots)
}
#endif /* CONFIG_DEBUG_FS */
-/*
- * Find a suitable number of IO TLB entries size that will fit this request and
- * allocate a buffer from that IO TLB pool.
+/**
+ * swiotlb_area_find_slots() - search for slots in one IO TLB memory area
+ * @dev: Device which maps the buffer.
+ * @pool: Memory pool to be searched.
+ * @area_index: Index of the IO TLB memory area to be searched.
+ * @orig_addr: Original (non-bounced) IO buffer address.
+ * @alloc_size: Total requested size of the bounce buffer,
+ * including initial alignment padding.
+ * @alloc_align_mask: Required alignment of the allocated buffer.
+ *
+ * Find a suitable sequence of IO TLB entries for the request and allocate
+ * a buffer from the given IO TLB memory area.
+ * This function takes care of locking.
+ *
+ * Return: Index of the first allocated slot, or -1 on error.
*/
-static int swiotlb_do_find_slots(struct device *dev, int area_index,
- phys_addr_t orig_addr, size_t alloc_size,
+static int swiotlb_area_find_slots(struct device *dev, struct io_tlb_pool *pool,
+ int area_index, phys_addr_t orig_addr, size_t alloc_size,
unsigned int alloc_align_mask)
{
- struct io_tlb_mem *mem = dev->dma_io_tlb_mem;
- struct io_tlb_area *area = mem->areas + area_index;
+ struct io_tlb_area *area = pool->areas + area_index;
unsigned long boundary_mask = dma_get_seg_boundary(dev);
dma_addr_t tbl_dma_addr =
- phys_to_dma_unencrypted(dev, mem->start) & boundary_mask;
+ phys_to_dma_unencrypted(dev, pool->start) & boundary_mask;
unsigned long max_slots = get_max_slots(boundary_mask);
unsigned int iotlb_align_mask =
dma_get_min_align_mask(dev) | alloc_align_mask;
@@ -647,7 +982,7 @@ static int swiotlb_do_find_slots(struct device *dev, int area_index,
unsigned int slot_index;
BUG_ON(!nslots);
- BUG_ON(area_index >= mem->nareas);
+ BUG_ON(area_index >= pool->nareas);
/*
* For allocations of PAGE_SIZE or larger only look for page aligned
@@ -664,35 +999,30 @@ static int swiotlb_do_find_slots(struct device *dev, int area_index,
stride = (iotlb_align_mask >> IO_TLB_SHIFT) + 1;
spin_lock_irqsave(&area->lock, flags);
- if (unlikely(nslots > mem->area_nslabs - area->used))
+ if (unlikely(nslots > pool->area_nslabs - area->used))
goto not_found;
- slot_base = area_index * mem->area_nslabs;
+ slot_base = area_index * pool->area_nslabs;
index = area->index;
- for (slots_checked = 0; slots_checked < mem->area_nslabs; ) {
+ for (slots_checked = 0; slots_checked < pool->area_nslabs; ) {
slot_index = slot_base + index;
if (orig_addr &&
(slot_addr(tbl_dma_addr, slot_index) &
iotlb_align_mask) != (orig_addr & iotlb_align_mask)) {
- index = wrap_area_index(mem, index + 1);
+ index = wrap_area_index(pool, index + 1);
slots_checked++;
continue;
}
- /*
- * If we find a slot that indicates we have 'nslots' number of
- * contiguous buffers, we allocate the buffers from that slot
- * and mark the entries as '0' indicating unavailable.
- */
if (!iommu_is_span_boundary(slot_index, nslots,
nr_slots(tbl_dma_addr),
max_slots)) {
- if (mem->slots[slot_index].list >= nslots)
+ if (pool->slots[slot_index].list >= nslots)
goto found;
}
- index = wrap_area_index(mem, index + stride);
+ index = wrap_area_index(pool, index + stride);
slots_checked += stride;
}
@@ -701,48 +1031,159 @@ not_found:
return -1;
found:
+ /*
+ * If we find a slot that indicates we have 'nslots' number of
+ * contiguous buffers, we allocate the buffers from that slot onwards
+ * and set the list of free entries to '0' indicating unavailable.
+ */
for (i = slot_index; i < slot_index + nslots; i++) {
- mem->slots[i].list = 0;
- mem->slots[i].alloc_size = alloc_size - (offset +
+ pool->slots[i].list = 0;
+ pool->slots[i].alloc_size = alloc_size - (offset +
((i - slot_index) << IO_TLB_SHIFT));
}
for (i = slot_index - 1;
io_tlb_offset(i) != IO_TLB_SEGSIZE - 1 &&
- mem->slots[i].list; i--)
- mem->slots[i].list = ++count;
+ pool->slots[i].list; i--)
+ pool->slots[i].list = ++count;
/*
* Update the indices to avoid searching in the next round.
*/
- area->index = wrap_area_index(mem, index + nslots);
+ area->index = wrap_area_index(pool, index + nslots);
area->used += nslots;
spin_unlock_irqrestore(&area->lock, flags);
- inc_used_and_hiwater(mem, nslots);
+ inc_used_and_hiwater(dev->dma_io_tlb_mem, nslots);
return slot_index;
}
-static int swiotlb_find_slots(struct device *dev, phys_addr_t orig_addr,
- size_t alloc_size, unsigned int alloc_align_mask)
+/**
+ * swiotlb_pool_find_slots() - search for slots in one memory pool
+ * @dev: Device which maps the buffer.
+ * @pool: Memory pool to be searched.
+ * @orig_addr: Original (non-bounced) IO buffer address.
+ * @alloc_size: Total requested size of the bounce buffer,
+ * including initial alignment padding.
+ * @alloc_align_mask: Required alignment of the allocated buffer.
+ *
+ * Search through one memory pool to find a sequence of slots that match the
+ * allocation constraints.
+ *
+ * Return: Index of the first allocated slot, or -1 on error.
+ */
+static int swiotlb_pool_find_slots(struct device *dev, struct io_tlb_pool *pool,
+ phys_addr_t orig_addr, size_t alloc_size,
+ unsigned int alloc_align_mask)
{
- struct io_tlb_mem *mem = dev->dma_io_tlb_mem;
- int start = raw_smp_processor_id() & (mem->nareas - 1);
+ int start = raw_smp_processor_id() & (pool->nareas - 1);
int i = start, index;
do {
- index = swiotlb_do_find_slots(dev, i, orig_addr, alloc_size,
- alloc_align_mask);
+ index = swiotlb_area_find_slots(dev, pool, i, orig_addr,
+ alloc_size, alloc_align_mask);
if (index >= 0)
return index;
- if (++i >= mem->nareas)
+ if (++i >= pool->nareas)
i = 0;
} while (i != start);
return -1;
}
+#ifdef CONFIG_SWIOTLB_DYNAMIC
+
+/**
+ * swiotlb_find_slots() - search for slots in the whole swiotlb
+ * @dev: Device which maps the buffer.
+ * @orig_addr: Original (non-bounced) IO buffer address.
+ * @alloc_size: Total requested size of the bounce buffer,
+ * including initial alignment padding.
+ * @alloc_align_mask: Required alignment of the allocated buffer.
+ * @retpool: Used memory pool, updated on return.
+ *
+ * Search through the whole software IO TLB to find a sequence of slots that
+ * match the allocation constraints.
+ *
+ * Return: Index of the first allocated slot, or -1 on error.
+ */
+static int swiotlb_find_slots(struct device *dev, phys_addr_t orig_addr,
+ size_t alloc_size, unsigned int alloc_align_mask,
+ struct io_tlb_pool **retpool)
+{
+ struct io_tlb_mem *mem = dev->dma_io_tlb_mem;
+ struct io_tlb_pool *pool;
+ unsigned long nslabs;
+ unsigned long flags;
+ u64 phys_limit;
+ int index;
+
+ rcu_read_lock();
+ list_for_each_entry_rcu(pool, &mem->pools, node) {
+ index = swiotlb_pool_find_slots(dev, pool, orig_addr,
+ alloc_size, alloc_align_mask);
+ if (index >= 0) {
+ rcu_read_unlock();
+ goto found;
+ }
+ }
+ rcu_read_unlock();
+ if (!mem->can_grow)
+ return -1;
+
+ schedule_work(&mem->dyn_alloc);
+
+ nslabs = nr_slots(alloc_size);
+ phys_limit = min_not_zero(*dev->dma_mask, dev->bus_dma_limit);
+ pool = swiotlb_alloc_pool(dev, nslabs, nslabs, 1, phys_limit,
+ GFP_NOWAIT | __GFP_NOWARN);
+ if (!pool)
+ return -1;
+
+ index = swiotlb_pool_find_slots(dev, pool, orig_addr,
+ alloc_size, alloc_align_mask);
+ if (index < 0) {
+ swiotlb_dyn_free(&pool->rcu);
+ return -1;
+ }
+
+ pool->transient = true;
+ spin_lock_irqsave(&dev->dma_io_tlb_lock, flags);
+ list_add_rcu(&pool->node, &dev->dma_io_tlb_pools);
+ spin_unlock_irqrestore(&dev->dma_io_tlb_lock, flags);
+
+found:
+ dev->dma_uses_io_tlb = true;
+ /* Pairs with smp_rmb() in is_swiotlb_buffer() */
+ smp_wmb();
+
+ *retpool = pool;
+ return index;
+}
+
+#else /* !CONFIG_SWIOTLB_DYNAMIC */
+
+static int swiotlb_find_slots(struct device *dev, phys_addr_t orig_addr,
+ size_t alloc_size, unsigned int alloc_align_mask,
+ struct io_tlb_pool **retpool)
+{
+ *retpool = &dev->dma_io_tlb_mem->defpool;
+ return swiotlb_pool_find_slots(dev, *retpool,
+ orig_addr, alloc_size, alloc_align_mask);
+}
+
+#endif /* CONFIG_SWIOTLB_DYNAMIC */
+
#ifdef CONFIG_DEBUG_FS
+/**
+ * mem_used() - get number of used slots in an allocator
+ * @mem: Software IO TLB allocator.
+ *
+ * The result is accurate in this version of the function, because an atomic
+ * counter is available if CONFIG_DEBUG_FS is set.
+ *
+ * Return: Number of used slots.
+ */
static unsigned long mem_used(struct io_tlb_mem *mem)
{
return atomic_long_read(&mem->total_used);
@@ -750,14 +1191,48 @@ static unsigned long mem_used(struct io_tlb_mem *mem)
#else /* !CONFIG_DEBUG_FS */
-static unsigned long mem_used(struct io_tlb_mem *mem)
+/**
+ * mem_pool_used() - get number of used slots in a memory pool
+ * @pool: Software IO TLB memory pool.
+ *
+ * The result is not accurate, see mem_used().
+ *
+ * Return: Approximate number of used slots.
+ */
+static unsigned long mem_pool_used(struct io_tlb_pool *pool)
{
int i;
unsigned long used = 0;
- for (i = 0; i < mem->nareas; i++)
- used += mem->areas[i].used;
+ for (i = 0; i < pool->nareas; i++)
+ used += pool->areas[i].used;
+ return used;
+}
+
+/**
+ * mem_used() - get number of used slots in an allocator
+ * @mem: Software IO TLB allocator.
+ *
+ * The result is not accurate, because there is no locking of individual
+ * areas.
+ *
+ * Return: Approximate number of used slots.
+ */
+static unsigned long mem_used(struct io_tlb_mem *mem)
+{
+#ifdef CONFIG_SWIOTLB_DYNAMIC
+ struct io_tlb_pool *pool;
+ unsigned long used = 0;
+
+ /* Add up the per-pool estimate of every pool on the allocator's list. */
+ rcu_read_lock();
+ list_for_each_entry_rcu(pool, &mem->pools, node)
+ used += mem_pool_used(pool);
+ rcu_read_unlock();
+
return used;
+#else
+ /* Non-dynamic build: report the default pool only. */
+ return mem_pool_used(&mem->defpool);
+#endif
}
#endif /* CONFIG_DEBUG_FS */
@@ -769,6 +1244,7 @@ phys_addr_t swiotlb_tbl_map_single(struct device *dev, phys_addr_t orig_addr,
{
struct io_tlb_mem *mem = dev->dma_io_tlb_mem;
unsigned int offset = swiotlb_align_offset(dev, orig_addr);
+ struct io_tlb_pool *pool;
unsigned int i;
int index;
phys_addr_t tlb_addr;
@@ -789,7 +1265,7 @@ phys_addr_t swiotlb_tbl_map_single(struct device *dev, phys_addr_t orig_addr,
}
index = swiotlb_find_slots(dev, orig_addr,
- alloc_size + offset, alloc_align_mask);
+ alloc_size + offset, alloc_align_mask, &pool);
if (index == -1) {
if (!(attrs & DMA_ATTR_NO_WARN))
dev_warn_ratelimited(dev,
@@ -804,8 +1280,8 @@ phys_addr_t swiotlb_tbl_map_single(struct device *dev, phys_addr_t orig_addr,
* needed.
*/
for (i = 0; i < nr_slots(alloc_size + offset); i++)
- mem->slots[index + i].orig_addr = slot_addr(orig_addr, i);
- tlb_addr = slot_addr(mem->start, index) + offset;
+ pool->slots[index + i].orig_addr = slot_addr(orig_addr, i);
+ tlb_addr = slot_addr(pool->start, index) + offset;
/*
* When dir == DMA_FROM_DEVICE we could omit the copy from the orig
* to the tlb buffer, if we knew for sure the device will
@@ -819,7 +1295,7 @@ phys_addr_t swiotlb_tbl_map_single(struct device *dev, phys_addr_t orig_addr,
static void swiotlb_release_slots(struct device *dev, phys_addr_t tlb_addr)
{
- struct io_tlb_mem *mem = dev->dma_io_tlb_mem;
+ struct io_tlb_pool *mem = swiotlb_find_pool(dev, tlb_addr);
unsigned long flags;
unsigned int offset = swiotlb_align_offset(dev, tlb_addr);
int index = (tlb_addr - offset - mem->start) >> IO_TLB_SHIFT;
@@ -863,9 +1339,44 @@ static void swiotlb_release_slots(struct device *dev, phys_addr_t tlb_addr)
area->used -= nslots;
spin_unlock_irqrestore(&area->lock, flags);
- dec_used(mem, nslots);
+ dec_used(dev->dma_io_tlb_mem, nslots);
+}
+
+#ifdef CONFIG_SWIOTLB_DYNAMIC
+
+/**
+ * swiotlb_del_transient() - delete a transient memory pool
+ * @dev: Device which mapped the buffer.
+ * @tlb_addr: Physical address within a bounce buffer.
+ *
+ * Check whether the address belongs to a transient SWIOTLB memory pool.
+ * If yes, then delete the pool.
+ *
+ * Return: %true if @tlb_addr belonged to a transient pool that was released.
+ */
+static bool swiotlb_del_transient(struct device *dev, phys_addr_t tlb_addr)
+{
+	struct io_tlb_pool *pool = swiotlb_find_pool(dev, tlb_addr);
+
+	if (pool->transient) {
+		/*
+		 * The whole pool goes away: subtract all of its slots from
+		 * the allocator's usage counter, then delete the pool.
+		 */
+		dec_used(dev->dma_io_tlb_mem, pool->nslabs);
+		swiotlb_del_pool(dev, pool);
+		return true;
+	}
+
+	/* Not a transient pool: the caller releases the slots itself. */
+	return false;
}
+#else /* !CONFIG_SWIOTLB_DYNAMIC */
+
+/*
+ * Stub for builds without CONFIG_SWIOTLB_DYNAMIC: always reports that no
+ * transient pool was released, so the caller goes on to release the slots.
+ */
+static inline bool swiotlb_del_transient(struct device *dev,
+ phys_addr_t tlb_addr)
+{
+ return false;
+}
+
+#endif /* CONFIG_SWIOTLB_DYNAMIC */
+
/*
* tlb_addr is the physical address of the bounce buffer to unmap.
*/
@@ -880,6 +1391,8 @@ void swiotlb_tbl_unmap_single(struct device *dev, phys_addr_t tlb_addr,
(dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL))
swiotlb_bounce(dev, tlb_addr, mapping_size, DMA_FROM_DEVICE);
+ if (swiotlb_del_transient(dev, tlb_addr))
+ return;
swiotlb_release_slots(dev, tlb_addr);
}
@@ -950,13 +1463,47 @@ size_t swiotlb_max_mapping_size(struct device *dev)
return ((size_t)IO_TLB_SIZE) * IO_TLB_SEGSIZE - min_align;
}
+/**
+ * is_swiotlb_allocated() - check if the default software IO TLB is initialized
+ *
+ * Return: %true if the default allocator reports a non-zero number of slabs.
+ */
+bool is_swiotlb_allocated(void)
+{
+	return io_tlb_default_mem.nslabs != 0;
+}
+
bool is_swiotlb_active(struct device *dev)
{
struct io_tlb_mem *mem = dev->dma_io_tlb_mem;
return mem && mem->nslabs;
}
-EXPORT_SYMBOL_GPL(is_swiotlb_active);
+
+/**
+ * default_swiotlb_base() - get the base address of the default SWIOTLB
+ *
+ * Get the lowest physical address used by the default software IO TLB pool.
+ *
+ * With CONFIG_SWIOTLB_DYNAMIC this call also has a side effect: it clears
+ * the default allocator's can_grow flag.
+ *
+ * Return: Start address of the default pool.
+ */
+phys_addr_t default_swiotlb_base(void)
+{
+	struct io_tlb_mem *mem = &io_tlb_default_mem;
+
+#ifdef CONFIG_SWIOTLB_DYNAMIC
+	mem->can_grow = false;
+#endif
+	return mem->defpool.start;
+}
+
+/**
+ * default_swiotlb_limit() - get the address limit of the default SWIOTLB
+ *
+ * Get the highest physical address used by the default software IO TLB pool.
+ *
+ * Return: The default allocator's phys_limit with CONFIG_SWIOTLB_DYNAMIC,
+ * otherwise the last byte of the default pool (its end address minus one).
+ */
+phys_addr_t default_swiotlb_limit(void)
+{
+	phys_addr_t limit;
+
+#ifdef CONFIG_SWIOTLB_DYNAMIC
+	limit = io_tlb_default_mem.phys_limit;
+#else
+	limit = io_tlb_default_mem.defpool.end - 1;
+#endif
+	return limit;
+}
#ifdef CONFIG_DEBUG_FS
@@ -1031,17 +1578,18 @@ static inline void swiotlb_create_debugfs_files(struct io_tlb_mem *mem,
struct page *swiotlb_alloc(struct device *dev, size_t size)
{
struct io_tlb_mem *mem = dev->dma_io_tlb_mem;
+ struct io_tlb_pool *pool;
phys_addr_t tlb_addr;
int index;
if (!mem)
return NULL;
- index = swiotlb_find_slots(dev, 0, size, 0);
+ index = swiotlb_find_slots(dev, 0, size, 0, &pool);
if (index == -1)
return NULL;
- tlb_addr = slot_addr(mem->start, index);
+ tlb_addr = slot_addr(pool->start, index);
return pfn_to_page(PFN_DOWN(tlb_addr));
}
@@ -1078,29 +1626,37 @@ static int rmem_swiotlb_device_init(struct reserved_mem *rmem,
* to it.
*/
if (!mem) {
+ struct io_tlb_pool *pool;
+
mem = kzalloc(sizeof(*mem), GFP_KERNEL);
if (!mem)
return -ENOMEM;
+ pool = &mem->defpool;
- mem->slots = kcalloc(nslabs, sizeof(*mem->slots), GFP_KERNEL);
- if (!mem->slots) {
+ pool->slots = kcalloc(nslabs, sizeof(*pool->slots), GFP_KERNEL);
+ if (!pool->slots) {
kfree(mem);
return -ENOMEM;
}
- mem->areas = kcalloc(nareas, sizeof(*mem->areas),
+ pool->areas = kcalloc(nareas, sizeof(*pool->areas),
GFP_KERNEL);
- if (!mem->areas) {
- kfree(mem->slots);
+ if (!pool->areas) {
+ kfree(pool->slots);
kfree(mem);
return -ENOMEM;
}
set_memory_decrypted((unsigned long)phys_to_virt(rmem->base),
rmem->size >> PAGE_SHIFT);
- swiotlb_init_io_tlb_mem(mem, rmem->base, nslabs, SWIOTLB_FORCE,
- false, nareas);
+ swiotlb_init_io_tlb_pool(pool, rmem->base, nslabs,
+ false, nareas);
+ mem->force_bounce = true;
mem->for_alloc = true;
+#ifdef CONFIG_SWIOTLB_DYNAMIC
+ spin_lock_init(&mem->lock);
+#endif
+ add_mem_pool(mem, pool);
rmem->priv = mem;
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 93015cb64d4e..4c72a41f11af 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -8631,7 +8631,7 @@ static void perf_event_mmap_event(struct perf_mmap_event *mmap_event)
unsigned int size;
char tmp[16];
char *buf = NULL;
- char *name;
+ char *name = NULL;
if (vma->vm_flags & VM_READ)
prot |= PROT_READ;
@@ -8678,29 +8678,18 @@ static void perf_event_mmap_event(struct perf_mmap_event *mmap_event)
goto got_name;
} else {
- if (vma->vm_ops && vma->vm_ops->name) {
+ if (vma->vm_ops && vma->vm_ops->name)
name = (char *) vma->vm_ops->name(vma);
- if (name)
- goto cpy_name;
+ if (!name)
+ name = (char *)arch_vma_name(vma);
+ if (!name) {
+ if (vma_is_initial_heap(vma))
+ name = "[heap]";
+ else if (vma_is_initial_stack(vma))
+ name = "[stack]";
+ else
+ name = "//anon";
}
-
- name = (char *)arch_vma_name(vma);
- if (name)
- goto cpy_name;
-
- if (vma->vm_start <= vma->vm_mm->start_brk &&
- vma->vm_end >= vma->vm_mm->brk) {
- name = "[heap]";
- goto cpy_name;
- }
- if (vma->vm_start <= vma->vm_mm->start_stack &&
- vma->vm_end >= vma->vm_mm->start_stack) {
- name = "[stack]";
- goto cpy_name;
- }
-
- name = "//anon";
- goto cpy_name;
}
cpy_name:
diff --git a/kernel/events/hw_breakpoint.c b/kernel/events/hw_breakpoint.c
index c3797701339c..6c2cb4e4f48d 100644
--- a/kernel/events/hw_breakpoint.c
+++ b/kernel/events/hw_breakpoint.c
@@ -523,26 +523,6 @@ toggle_bp_slot(struct perf_event *bp, bool enable, enum bp_type_idx type, int we
return 0;
}
-__weak int arch_reserve_bp_slot(struct perf_event *bp)
-{
- return 0;
-}
-
-__weak void arch_release_bp_slot(struct perf_event *bp)
-{
-}
-
-/*
- * Function to perform processor-specific cleanup during unregistration
- */
-__weak void arch_unregister_hw_breakpoint(struct perf_event *bp)
-{
- /*
- * A weak stub function here for those archs that don't define
- * it inside arch/.../kernel/hw_breakpoint.c
- */
-}
-
/*
* Constraints to check before allowing this new breakpoint counter.
*
@@ -594,7 +574,6 @@ static int __reserve_bp_slot(struct perf_event *bp, u64 bp_type)
enum bp_type_idx type;
int max_pinned_slots;
int weight;
- int ret;
/* We couldn't initialize breakpoint constraints on boot */
if (!constraints_initialized)
@@ -613,10 +592,6 @@ static int __reserve_bp_slot(struct perf_event *bp, u64 bp_type)
if (max_pinned_slots > hw_breakpoint_slots_cached(type))
return -ENOSPC;
- ret = arch_reserve_bp_slot(bp);
- if (ret)
- return ret;
-
return toggle_bp_slot(bp, true, type, weight);
}
@@ -634,8 +609,6 @@ static void __release_bp_slot(struct perf_event *bp, u64 bp_type)
enum bp_type_idx type;
int weight;
- arch_release_bp_slot(bp);
-
type = find_slot_idx(bp_type);
weight = hw_breakpoint_weight(bp);
WARN_ON(toggle_bp_slot(bp, false, type, weight));
@@ -645,7 +618,6 @@ void release_bp_slot(struct perf_event *bp)
{
struct mutex *mtx = bp_constraints_lock(bp);
- arch_unregister_hw_breakpoint(bp);
__release_bp_slot(bp, bp->attr.bp_type);
bp_constraints_unlock(mtx);
}
diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c
index f0ac5b874919..3048589e2e85 100644
--- a/kernel/events/uprobes.c
+++ b/kernel/events/uprobes.c
@@ -193,7 +193,7 @@ static int __replace_page(struct vm_area_struct *vma, unsigned long addr,
}
flush_cache_page(vma, addr, pte_pfn(ptep_get(pvmw.pte)));
- ptep_clear_flush_notify(vma, addr, pvmw.pte);
+ ptep_clear_flush(vma, addr, pvmw.pte);
if (new_page)
set_pte_at_notify(mm, addr, pvmw.pte,
mk_pte(new_page, vma->vm_page_prot));
diff --git a/kernel/fork.c b/kernel/fork.c
index f81149739eb9..3b6d20dfb9a8 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -909,8 +909,6 @@ static void cleanup_lazy_tlbs(struct mm_struct *mm)
*/
void __mmdrop(struct mm_struct *mm)
{
- int i;
-
BUG_ON(mm == &init_mm);
WARN_ON_ONCE(mm == current->mm);
@@ -925,9 +923,8 @@ void __mmdrop(struct mm_struct *mm)
put_user_ns(mm->user_ns);
mm_pasid_drop(mm);
mm_destroy_cid(mm);
+ percpu_counter_destroy_many(mm->rss_stat, NR_MM_COUNTERS);
- for (i = 0; i < NR_MM_COUNTERS; i++)
- percpu_counter_destroy(&mm->rss_stat[i]);
free_mm(mm);
}
EXPORT_SYMBOL_GPL(__mmdrop);
@@ -1260,8 +1257,6 @@ static void mm_init_uprobes_state(struct mm_struct *mm)
static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p,
struct user_namespace *user_ns)
{
- int i;
-
mt_init_flags(&mm->mm_mt, MM_MT_FLAGS);
mt_set_external_lock(&mm->mm_mt, &mm->mmap_lock);
atomic_set(&mm->mm_users, 1);
@@ -1309,17 +1304,15 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p,
if (mm_alloc_cid(mm))
goto fail_cid;
- for (i = 0; i < NR_MM_COUNTERS; i++)
- if (percpu_counter_init(&mm->rss_stat[i], 0, GFP_KERNEL_ACCOUNT))
- goto fail_pcpu;
+ if (percpu_counter_init_many(mm->rss_stat, 0, GFP_KERNEL_ACCOUNT,
+ NR_MM_COUNTERS))
+ goto fail_pcpu;
mm->user_ns = get_user_ns(user_ns);
lru_gen_init_mm(mm);
return mm;
fail_pcpu:
- while (i > 0)
- percpu_counter_destroy(&mm->rss_stat[--i]);
mm_destroy_cid(mm);
fail_cid:
destroy_context(mm);
@@ -1404,8 +1397,8 @@ EXPORT_SYMBOL_GPL(mmput_async);
* This changes mm's executable file (shown as symlink /proc/[pid]/exe).
*
* Main users are mmput() and sys_execve(). Callers prevent concurrent
- * invocations: in mmput() nobody alive left, in execve task is single
- * threaded.
+ * invocations: in mmput() nobody alive left, in execve it happens before
+ * the new mm is made visible to anyone.
*
* Can only fail if new_exe_file != NULL.
*/
@@ -1440,9 +1433,7 @@ int set_mm_exe_file(struct mm_struct *mm, struct file *new_exe_file)
/**
* replace_mm_exe_file - replace a reference to the mm's executable file
*
- * This changes mm's executable file (shown as symlink /proc/[pid]/exe),
- * dealing with concurrent invocation and without grabbing the mmap lock in
- * write mode.
+ * This changes mm's executable file (shown as symlink /proc/[pid]/exe).
*
* Main user is sys_prctl(PR_SET_MM_MAP/EXE_FILE).
*/
@@ -1472,22 +1463,20 @@ int replace_mm_exe_file(struct mm_struct *mm, struct file *new_exe_file)
return ret;
}
- /* set the new file, lockless */
ret = deny_write_access(new_exe_file);
if (ret)
return -EACCES;
get_file(new_exe_file);
- old_exe_file = xchg(&mm->exe_file, new_exe_file);
+ /* set the new file */
+ mmap_write_lock(mm);
+ old_exe_file = rcu_dereference_raw(mm->exe_file);
+ rcu_assign_pointer(mm->exe_file, new_exe_file);
+ mmap_write_unlock(mm);
+
if (old_exe_file) {
- /*
- * Don't race with dup_mmap() getting the file and disallowing
- * write access while someone might open the file writable.
- */
- mmap_read_lock(mm);
allow_write_access(old_exe_file);
fput(old_exe_file);
- mmap_read_unlock(mm);
}
return 0;
}
diff --git a/kernel/futex/core.c b/kernel/futex/core.c
index 514e4582b863..f10587d1d481 100644
--- a/kernel/futex/core.c
+++ b/kernel/futex/core.c
@@ -1132,8 +1132,7 @@ static int __init futex_init(void)
#endif
futex_queues = alloc_large_system_hash("futex", sizeof(*futex_queues),
- futex_hashsize, 0,
- futex_hashsize < 256 ? HASH_SMALL : 0,
+ futex_hashsize, 0, 0,
&futex_shift, NULL,
futex_hashsize, futex_hashsize);
futex_hashsize = 1UL << futex_shift;
diff --git a/kernel/gcov/Makefile b/kernel/gcov/Makefile
index 16f8ecc7d882..ccd02afaeffb 100644
--- a/kernel/gcov/Makefile
+++ b/kernel/gcov/Makefile
@@ -3,4 +3,6 @@ ccflags-y := -DSRCTREE='"$(srctree)"' -DOBJTREE='"$(objtree)"'
obj-y := base.o fs.o
obj-$(CONFIG_CC_IS_GCC) += gcc_base.o gcc_4_7.o
+CFLAGS_gcc_base.o += -Wno-missing-prototypes -Wno-missing-declarations
obj-$(CONFIG_CC_IS_CLANG) += clang.o
+CFLAGS_clang.o += -Wno-missing-prototypes -Wno-missing-declarations
diff --git a/kernel/iomem.c b/kernel/iomem.c
index 62c92e43aa0d..dc2120776e1c 100644
--- a/kernel/iomem.c
+++ b/kernel/iomem.c
@@ -3,19 +3,16 @@
#include <linux/types.h>
#include <linux/io.h>
#include <linux/mm.h>
-
-#ifndef ioremap_cache
-/* temporary while we convert existing ioremap_cache users to memremap */
-__weak void __iomem *ioremap_cache(resource_size_t offset, unsigned long size)
-{
- return ioremap(offset, size);
-}
-#endif
+#include <linux/ioremap.h>
#ifndef arch_memremap_wb
static void *arch_memremap_wb(resource_size_t offset, unsigned long size)
{
+#ifdef ioremap_cache
return (__force void *)ioremap_cache(offset, size);
+#else
+ return (__force void *)ioremap(offset, size);
+#endif
}
#endif
diff --git a/kernel/kallsyms_selftest.c b/kernel/kallsyms_selftest.c
index e05ddc33a752..b4cac76ea5e9 100644
--- a/kernel/kallsyms_selftest.c
+++ b/kernel/kallsyms_selftest.c
@@ -341,6 +341,7 @@ static int test_kallsyms_basic_function(void)
ret = lookup_symbol_name(addr, namebuf);
if (unlikely(ret)) {
namebuf[0] = 0;
+ pr_info("%d: lookup_symbol_name(%lx) failed\n", i, addr);
goto failed;
}
@@ -367,8 +368,11 @@ static int test_kallsyms_basic_function(void)
if (stat->addr != stat2->addr ||
stat->real_cnt != stat2->real_cnt ||
memcmp(stat->addrs, stat2->addrs,
- stat->save_cnt * sizeof(stat->addrs[0])))
+ stat->save_cnt * sizeof(stat->addrs[0]))) {
+ pr_info("%s: mismatch between kallsyms_on_each_symbol() and kallsyms_on_each_match_symbol()\n",
+ namebuf);
goto failed;
+ }
/*
* The average of random increments is 128, that is, one of
@@ -379,15 +383,23 @@ static int test_kallsyms_basic_function(void)
}
/* Need to be found at least once */
- if (!stat->real_cnt)
+ if (!stat->real_cnt) {
+ pr_info("%s: Never found\n", namebuf);
goto failed;
+ }
/*
* kallsyms_lookup_name() returns the address of the first
* symbol found and cannot be NULL.
*/
- if (!lookup_addr || lookup_addr != stat->addrs[0])
+ if (!lookup_addr) {
+ pr_info("%s: NULL lookup_addr?!\n", namebuf);
+ goto failed;
+ }
+ if (lookup_addr != stat->addrs[0]) {
+ pr_info("%s: lookup_addr != stat->addrs[0]\n", namebuf);
goto failed;
+ }
/*
* If the addresses of all matching symbols are recorded, the
@@ -399,8 +411,10 @@ static int test_kallsyms_basic_function(void)
break;
}
- if (j == stat->save_cnt)
+ if (j == stat->save_cnt) {
+ pr_info("%s: j == save_cnt?!\n", namebuf);
goto failed;
+ }
}
}
diff --git a/kernel/kexec.c b/kernel/kexec.c
index 92d301f98776..107f355eac10 100644
--- a/kernel/kexec.c
+++ b/kernel/kexec.c
@@ -129,6 +129,11 @@ static int do_kexec_load(unsigned long entry, unsigned long nr_segments,
if (flags & KEXEC_PRESERVE_CONTEXT)
image->preserve_context = 1;
+#ifdef CONFIG_CRASH_HOTPLUG
+ if (flags & KEXEC_UPDATE_ELFCOREHDR)
+ image->update_elfcorehdr = 1;
+#endif
+
ret = machine_kexec_prepare(image);
if (ret)
goto out;
diff --git a/kernel/kexec_core.c b/kernel/kexec_core.c
index e2f2574d8b74..9dc728982d79 100644
--- a/kernel/kexec_core.c
+++ b/kernel/kexec_core.c
@@ -49,9 +49,6 @@
atomic_t __kexec_lock = ATOMIC_INIT(0);
-/* Per cpu memory for storing cpu states in case of system crash. */
-note_buf_t __percpu *crash_notes;
-
/* Flag to indicate we are going to kexec a new kernel */
bool kexec_in_progress = false;
@@ -277,6 +274,12 @@ struct kimage *do_kimage_alloc_init(void)
/* Initialize the list of unusable pages */
INIT_LIST_HEAD(&image->unusable_pages);
+#ifdef CONFIG_CRASH_HOTPLUG
+ image->hp_action = KEXEC_CRASH_HP_NONE;
+ image->elfcorehdr_index = -1;
+ image->elfcorehdr_updated = false;
+#endif
+
return image;
}
@@ -1218,40 +1221,6 @@ void crash_save_cpu(struct pt_regs *regs, int cpu)
final_note(buf);
}
-static int __init crash_notes_memory_init(void)
-{
- /* Allocate memory for saving cpu registers. */
- size_t size, align;
-
- /*
- * crash_notes could be allocated across 2 vmalloc pages when percpu
- * is vmalloc based . vmalloc doesn't guarantee 2 continuous vmalloc
- * pages are also on 2 continuous physical pages. In this case the
- * 2nd part of crash_notes in 2nd page could be lost since only the
- * starting address and size of crash_notes are exported through sysfs.
- * Here round up the size of crash_notes to the nearest power of two
- * and pass it to __alloc_percpu as align value. This can make sure
- * crash_notes is allocated inside one physical page.
- */
- size = sizeof(note_buf_t);
- align = min(roundup_pow_of_two(sizeof(note_buf_t)), PAGE_SIZE);
-
- /*
- * Break compile if size is bigger than PAGE_SIZE since crash_notes
- * definitely will be in 2 pages with that.
- */
- BUILD_BUG_ON(size > PAGE_SIZE);
-
- crash_notes = __alloc_percpu(size, align);
- if (!crash_notes) {
- pr_warn("Memory allocation for saving cpu register states failed\n");
- return -ENOMEM;
- }
- return 0;
-}
-subsys_initcall(crash_notes_memory_init);
-
-
/*
* Move into place and start executing a preloaded standalone
* executable. If nothing was preloaded return an error.
diff --git a/kernel/kexec_file.c b/kernel/kexec_file.c
index 881ba0d1714c..f9a419cd22d4 100644
--- a/kernel/kexec_file.c
+++ b/kernel/kexec_file.c
@@ -624,7 +624,7 @@ int kexec_locate_mem_hole(struct kexec_buf *kbuf)
* kexec_add_buffer - place a buffer in a kexec segment
* @kbuf: Buffer contents and memory parameters.
*
- * This function assumes that kexec_mutex is held.
+ * This function assumes that kexec_lock is held.
* On successful return, @kbuf->mem will have the physical address of
* the buffer in memory.
*
@@ -685,7 +685,7 @@ static int kexec_calculate_store_digests(struct kimage *image)
struct kexec_sha_region *sha_regions;
struct purgatory_info *pi = &image->purgatory_info;
- if (!IS_ENABLED(CONFIG_ARCH_HAS_KEXEC_PURGATORY))
+ if (!IS_ENABLED(CONFIG_ARCH_SUPPORTS_KEXEC_PURGATORY))
return 0;
zero_buf = __va(page_to_pfn(ZERO_PAGE(0)) << PAGE_SHIFT);
@@ -726,6 +726,12 @@ static int kexec_calculate_store_digests(struct kimage *image)
for (j = i = 0; i < image->nr_segments; i++) {
struct kexec_segment *ksegment;
+#ifdef CONFIG_CRASH_HOTPLUG
+		/*
+		 * Exclude elfcorehdr segment to allow future changes via
+		 * hotplug. The comparison must use i, the index into
+		 * image->segment[] used for the ksegment lookup below; j is a
+		 * separate counter and does not identify the current segment.
+		 */
+		if (i == image->elfcorehdr_index)
+			continue;
+#endif
+
ksegment = &image->segment[i];
/*
* Skip purgatory as it will be modified once we put digest
@@ -790,7 +796,7 @@ out:
return ret;
}
-#ifdef CONFIG_ARCH_HAS_KEXEC_PURGATORY
+#ifdef CONFIG_ARCH_SUPPORTS_KEXEC_PURGATORY
/*
* kexec_purgatory_setup_kbuf - prepare buffer to load purgatory.
* @pi: Purgatory to be loaded.
@@ -1150,185 +1156,4 @@ int kexec_purgatory_get_set_symbol(struct kimage *image, const char *name,
return 0;
}
-#endif /* CONFIG_ARCH_HAS_KEXEC_PURGATORY */
-
-int crash_exclude_mem_range(struct crash_mem *mem,
- unsigned long long mstart, unsigned long long mend)
-{
- int i, j;
- unsigned long long start, end, p_start, p_end;
- struct range temp_range = {0, 0};
-
- for (i = 0; i < mem->nr_ranges; i++) {
- start = mem->ranges[i].start;
- end = mem->ranges[i].end;
- p_start = mstart;
- p_end = mend;
-
- if (mstart > end || mend < start)
- continue;
-
- /* Truncate any area outside of range */
- if (mstart < start)
- p_start = start;
- if (mend > end)
- p_end = end;
-
- /* Found completely overlapping range */
- if (p_start == start && p_end == end) {
- mem->ranges[i].start = 0;
- mem->ranges[i].end = 0;
- if (i < mem->nr_ranges - 1) {
- /* Shift rest of the ranges to left */
- for (j = i; j < mem->nr_ranges - 1; j++) {
- mem->ranges[j].start =
- mem->ranges[j+1].start;
- mem->ranges[j].end =
- mem->ranges[j+1].end;
- }
-
- /*
- * Continue to check if there are another overlapping ranges
- * from the current position because of shifting the above
- * mem ranges.
- */
- i--;
- mem->nr_ranges--;
- continue;
- }
- mem->nr_ranges--;
- return 0;
- }
-
- if (p_start > start && p_end < end) {
- /* Split original range */
- mem->ranges[i].end = p_start - 1;
- temp_range.start = p_end + 1;
- temp_range.end = end;
- } else if (p_start != start)
- mem->ranges[i].end = p_start - 1;
- else
- mem->ranges[i].start = p_end + 1;
- break;
- }
-
- /* If a split happened, add the split to array */
- if (!temp_range.end)
- return 0;
-
- /* Split happened */
- if (i == mem->max_nr_ranges - 1)
- return -ENOMEM;
-
- /* Location where new range should go */
- j = i + 1;
- if (j < mem->nr_ranges) {
- /* Move over all ranges one slot towards the end */
- for (i = mem->nr_ranges - 1; i >= j; i--)
- mem->ranges[i + 1] = mem->ranges[i];
- }
-
- mem->ranges[j].start = temp_range.start;
- mem->ranges[j].end = temp_range.end;
- mem->nr_ranges++;
- return 0;
-}
-
-int crash_prepare_elf64_headers(struct crash_mem *mem, int need_kernel_map,
- void **addr, unsigned long *sz)
-{
- Elf64_Ehdr *ehdr;
- Elf64_Phdr *phdr;
- unsigned long nr_cpus = num_possible_cpus(), nr_phdr, elf_sz;
- unsigned char *buf;
- unsigned int cpu, i;
- unsigned long long notes_addr;
- unsigned long mstart, mend;
-
- /* extra phdr for vmcoreinfo ELF note */
- nr_phdr = nr_cpus + 1;
- nr_phdr += mem->nr_ranges;
-
- /*
- * kexec-tools creates an extra PT_LOAD phdr for kernel text mapping
- * area (for example, ffffffff80000000 - ffffffffa0000000 on x86_64).
- * I think this is required by tools like gdb. So same physical
- * memory will be mapped in two ELF headers. One will contain kernel
- * text virtual addresses and other will have __va(physical) addresses.
- */
-
- nr_phdr++;
- elf_sz = sizeof(Elf64_Ehdr) + nr_phdr * sizeof(Elf64_Phdr);
- elf_sz = ALIGN(elf_sz, ELF_CORE_HEADER_ALIGN);
-
- buf = vzalloc(elf_sz);
- if (!buf)
- return -ENOMEM;
-
- ehdr = (Elf64_Ehdr *)buf;
- phdr = (Elf64_Phdr *)(ehdr + 1);
- memcpy(ehdr->e_ident, ELFMAG, SELFMAG);
- ehdr->e_ident[EI_CLASS] = ELFCLASS64;
- ehdr->e_ident[EI_DATA] = ELFDATA2LSB;
- ehdr->e_ident[EI_VERSION] = EV_CURRENT;
- ehdr->e_ident[EI_OSABI] = ELF_OSABI;
- memset(ehdr->e_ident + EI_PAD, 0, EI_NIDENT - EI_PAD);
- ehdr->e_type = ET_CORE;
- ehdr->e_machine = ELF_ARCH;
- ehdr->e_version = EV_CURRENT;
- ehdr->e_phoff = sizeof(Elf64_Ehdr);
- ehdr->e_ehsize = sizeof(Elf64_Ehdr);
- ehdr->e_phentsize = sizeof(Elf64_Phdr);
-
- /* Prepare one phdr of type PT_NOTE for each present CPU */
- for_each_present_cpu(cpu) {
- phdr->p_type = PT_NOTE;
- notes_addr = per_cpu_ptr_to_phys(per_cpu_ptr(crash_notes, cpu));
- phdr->p_offset = phdr->p_paddr = notes_addr;
- phdr->p_filesz = phdr->p_memsz = sizeof(note_buf_t);
- (ehdr->e_phnum)++;
- phdr++;
- }
-
- /* Prepare one PT_NOTE header for vmcoreinfo */
- phdr->p_type = PT_NOTE;
- phdr->p_offset = phdr->p_paddr = paddr_vmcoreinfo_note();
- phdr->p_filesz = phdr->p_memsz = VMCOREINFO_NOTE_SIZE;
- (ehdr->e_phnum)++;
- phdr++;
-
- /* Prepare PT_LOAD type program header for kernel text region */
- if (need_kernel_map) {
- phdr->p_type = PT_LOAD;
- phdr->p_flags = PF_R|PF_W|PF_X;
- phdr->p_vaddr = (unsigned long) _text;
- phdr->p_filesz = phdr->p_memsz = _end - _text;
- phdr->p_offset = phdr->p_paddr = __pa_symbol(_text);
- ehdr->e_phnum++;
- phdr++;
- }
-
- /* Go through all the ranges in mem->ranges[] and prepare phdr */
- for (i = 0; i < mem->nr_ranges; i++) {
- mstart = mem->ranges[i].start;
- mend = mem->ranges[i].end;
-
- phdr->p_type = PT_LOAD;
- phdr->p_flags = PF_R|PF_W|PF_X;
- phdr->p_offset = mstart;
-
- phdr->p_paddr = mstart;
- phdr->p_vaddr = (unsigned long) __va(mstart);
- phdr->p_filesz = phdr->p_memsz = mend - mstart + 1;
- phdr->p_align = 0;
- ehdr->e_phnum++;
- pr_debug("Crash PT_LOAD ELF header. phdr=%p vaddr=0x%llx, paddr=0x%llx, sz=0x%llx e_phnum=%d p_offset=0x%llx\n",
- phdr, phdr->p_vaddr, phdr->p_paddr, phdr->p_filesz,
- ehdr->e_phnum, phdr->p_offset);
- phdr++;
- }
-
- *addr = buf;
- *sz = elf_sz;
- return 0;
-}
+#endif /* CONFIG_ARCH_SUPPORTS_KEXEC_PURGATORY */
diff --git a/kernel/kprobes.c b/kernel/kprobes.c
index ca385b61d546..0c6185aefaef 100644
--- a/kernel/kprobes.c
+++ b/kernel/kprobes.c
@@ -2232,8 +2232,7 @@ int register_kretprobe(struct kretprobe *rp)
return -ENOMEM;
for (i = 0; i < rp->maxactive; i++) {
- inst = kzalloc(sizeof(struct kretprobe_instance) +
- rp->data_size, GFP_KERNEL);
+ inst = kzalloc(struct_size(inst, data, rp->data_size), GFP_KERNEL);
if (inst == NULL) {
rethook_free(rp->rh);
rp->rh = NULL;
@@ -2256,8 +2255,7 @@ int register_kretprobe(struct kretprobe *rp)
rp->rph->rp = rp;
for (i = 0; i < rp->maxactive; i++) {
- inst = kzalloc(sizeof(struct kretprobe_instance) +
- rp->data_size, GFP_KERNEL);
+ inst = kzalloc(struct_size(inst, data, rp->data_size), GFP_KERNEL);
if (inst == NULL) {
refcount_set(&rp->rph->ref, i);
free_rp_inst(rp);
diff --git a/kernel/ksysfs.c b/kernel/ksysfs.c
index aad7a3bfd846..1d4bc493b2f4 100644
--- a/kernel/ksysfs.c
+++ b/kernel/ksysfs.c
@@ -165,6 +165,18 @@ static ssize_t vmcoreinfo_show(struct kobject *kobj,
}
KERNEL_ATTR_RO(vmcoreinfo);
+#ifdef CONFIG_CRASH_HOTPLUG
+/* sysfs show handler: print the elfcorehdr size as an unsigned decimal line. */
+static ssize_t crash_elfcorehdr_size_show(struct kobject *kobj,
+					  struct kobj_attribute *attr, char *buf)
+{
+	unsigned int hdr_size;
+
+	/* Store in an unsigned int first so the argument matches "%u". */
+	hdr_size = crash_get_elfcorehdr_size();
+	return sysfs_emit(buf, "%u\n", hdr_size);
+}
+KERNEL_ATTR_RO(crash_elfcorehdr_size);
+
+#endif
+
#endif /* CONFIG_CRASH_CORE */
/* whether file capabilities are enabled */
@@ -255,6 +267,9 @@ static struct attribute * kernel_attrs[] = {
#endif
#ifdef CONFIG_CRASH_CORE
&vmcoreinfo_attr.attr,
+#ifdef CONFIG_CRASH_HOTPLUG
+ &crash_elfcorehdr_size_attr.attr,
+#endif
#endif
#ifndef CONFIG_TINY_RCU
&rcu_expedited_attr.attr,
diff --git a/kernel/kthread.c b/kernel/kthread.c
index 4fff7df17a68..1eea53050bab 100644
--- a/kernel/kthread.c
+++ b/kernel/kthread.c
@@ -159,11 +159,10 @@ bool kthread_should_stop(void)
}
EXPORT_SYMBOL(kthread_should_stop);
-bool __kthread_should_park(struct task_struct *k)
+static bool __kthread_should_park(struct task_struct *k)
{
return test_bit(KTHREAD_SHOULD_PARK, &to_kthread(k)->flags);
}
-EXPORT_SYMBOL_GPL(__kthread_should_park);
/**
* kthread_should_park - should this kthread park now?
diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c
index 111607d91489..e85b5ad3e206 100644
--- a/kernel/locking/lockdep.c
+++ b/kernel/locking/lockdep.c
@@ -819,34 +819,26 @@ static int very_verbose(struct lock_class *class)
* Is this the address of a static object:
*/
#ifdef __KERNEL__
-/*
- * Check if an address is part of freed initmem. After initmem is freed,
- * memory can be allocated from it, and such allocations would then have
- * addresses within the range [_stext, _end].
- */
-#ifndef arch_is_kernel_initmem_freed
-static int arch_is_kernel_initmem_freed(unsigned long addr)
-{
- if (system_state < SYSTEM_FREEING_INITMEM)
- return 0;
-
- return init_section_contains((void *)addr, 1);
-}
-#endif
-
static int static_obj(const void *obj)
{
- unsigned long start = (unsigned long) &_stext,
- end = (unsigned long) &_end,
- addr = (unsigned long) obj;
+ unsigned long addr = (unsigned long) obj;
- if (arch_is_kernel_initmem_freed(addr))
- return 0;
+ if (is_kernel_core_data(addr))
+ return 1;
+
+ /*
+ * keys are allowed in the __ro_after_init section.
+ */
+ if (is_kernel_rodata(addr))
+ return 1;
/*
- * static variable?
+ * in initdata section and used during bootup only?
+ * NOTE: On some platforms the initdata section is
+ * outside of the _stext ... _end range.
*/
- if ((addr >= start) && (addr < end))
+ if (system_state < SYSTEM_FREEING_INITMEM &&
+ init_section_contains((void *)addr, 1))
return 1;
/*
diff --git a/kernel/module/decompress.c b/kernel/module/decompress.c
index 8a5d6d63b06c..87440f714c0c 100644
--- a/kernel/module/decompress.c
+++ b/kernel/module/decompress.c
@@ -241,7 +241,7 @@ static ssize_t module_zstd_decompress(struct load_info *info,
}
wksp_size = zstd_dstream_workspace_bound(header.windowSize);
- wksp = kmalloc(wksp_size, GFP_KERNEL);
+ wksp = vmalloc(wksp_size);
if (!wksp) {
retval = -ENOMEM;
goto out;
@@ -284,7 +284,7 @@ static ssize_t module_zstd_decompress(struct load_info *info,
retval = new_size;
out:
- kfree(wksp);
+ vfree(wksp);
return retval;
}
#else
diff --git a/kernel/module/main.c b/kernel/module/main.c
index 59b1d067e528..98fedfdb8db5 100644
--- a/kernel/module/main.c
+++ b/kernel/module/main.c
@@ -1295,12 +1295,20 @@ void *__symbol_get(const char *symbol)
};
preempt_disable();
- if (!find_symbol(&fsa) || strong_try_module_get(fsa.owner)) {
- preempt_enable();
- return NULL;
+ if (!find_symbol(&fsa))
+ goto fail;
+ if (fsa.license != GPL_ONLY) {
+ pr_warn("failing symbol_get of non-GPLONLY symbol %s.\n",
+ symbol);
+ goto fail;
}
+ if (strong_try_module_get(fsa.owner))
+ goto fail;
preempt_enable();
return (void *)kernel_symbol_value(fsa.sym);
+fail:
+ preempt_enable();
+ return NULL;
}
EXPORT_SYMBOL_GPL(__symbol_get);
@@ -1484,7 +1492,7 @@ long module_get_offset_and_type(struct module *mod, enum mod_mem_type type,
return offset | mask;
}
-static bool module_init_layout_section(const char *sname)
+bool module_init_layout_section(const char *sname)
{
#ifndef CONFIG_MODULE_UNLOAD
if (module_exit_section(sname))
diff --git a/kernel/panic.c b/kernel/panic.c
index 10effe40a3fa..07239d4ad81e 100644
--- a/kernel/panic.c
+++ b/kernel/panic.c
@@ -216,7 +216,7 @@ static void panic_print_sys_info(bool console_flush)
show_state();
if (panic_print & PANIC_PRINT_MEM_INFO)
- show_mem(0, NULL);
+ show_mem();
if (panic_print & PANIC_PRINT_TIMER_INFO)
sysrq_timer_list_show();
diff --git a/kernel/params.c b/kernel/params.c
index 07d01f6ce9a2..2d4a0564697e 100644
--- a/kernel/params.c
+++ b/kernel/params.c
@@ -331,7 +331,7 @@ EXPORT_SYMBOL(param_ops_bool);
int param_set_bool_enable_only(const char *val, const struct kernel_param *kp)
{
- int err = 0;
+ int err;
bool new_value;
bool orig_value = *(bool *)kp->arg;
struct kernel_param dummy_kp = *kp;
diff --git a/kernel/pid.c b/kernel/pid.c
index 6a1d23a11026..fee14a4486a3 100644
--- a/kernel/pid.c
+++ b/kernel/pid.c
@@ -83,6 +83,9 @@ struct pid_namespace init_pid_ns = {
#ifdef CONFIG_PID_NS
.ns.ops = &pidns_operations,
#endif
+#if defined(CONFIG_SYSCTL) && defined(CONFIG_MEMFD_CREATE)
+ .memfd_noexec_scope = MEMFD_NOEXEC_SCOPE_EXEC,
+#endif
};
EXPORT_SYMBOL_GPL(init_pid_ns);
diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c
index 0bf44afe04dd..619972c78774 100644
--- a/kernel/pid_namespace.c
+++ b/kernel/pid_namespace.c
@@ -110,9 +110,9 @@ static struct pid_namespace *create_pid_namespace(struct user_namespace *user_ns
ns->user_ns = get_user_ns(user_ns);
ns->ucounts = ucounts;
ns->pid_allocated = PIDNS_ADDING;
-
- initialize_memfd_noexec_scope(ns);
-
+#if defined(CONFIG_SYSCTL) && defined(CONFIG_MEMFD_CREATE)
+ ns->memfd_noexec_scope = pidns_memfd_noexec_scope(parent_pid_ns);
+#endif
return ns;
out_free_idr:
diff --git a/kernel/pid_sysctl.h b/kernel/pid_sysctl.h
index b26e027fc9cd..2ee41a3a1dfd 100644
--- a/kernel/pid_sysctl.h
+++ b/kernel/pid_sysctl.h
@@ -5,33 +5,30 @@
#include <linux/pid_namespace.h>
#if defined(CONFIG_SYSCTL) && defined(CONFIG_MEMFD_CREATE)
-static inline void initialize_memfd_noexec_scope(struct pid_namespace *ns)
-{
- ns->memfd_noexec_scope =
- task_active_pid_ns(current)->memfd_noexec_scope;
-}
-
static int pid_mfd_noexec_dointvec_minmax(struct ctl_table *table,
int write, void *buf, size_t *lenp, loff_t *ppos)
{
struct pid_namespace *ns = task_active_pid_ns(current);
struct ctl_table table_copy;
+ int err, scope, parent_scope;
if (write && !ns_capable(ns->user_ns, CAP_SYS_ADMIN))
return -EPERM;
table_copy = *table;
- if (ns != &init_pid_ns)
- table_copy.data = &ns->memfd_noexec_scope;
- /*
- * set minimum to current value, the effect is only bigger
- * value is accepted.
- */
- if (*(int *)table_copy.data > *(int *)table_copy.extra1)
- table_copy.extra1 = table_copy.data;
+ /* You cannot set a lower enforcement value than your parent. */
+ parent_scope = pidns_memfd_noexec_scope(ns->parent);
+ /* Equivalent to pidns_memfd_noexec_scope(ns). */
+ scope = max(READ_ONCE(ns->memfd_noexec_scope), parent_scope);
+
+ table_copy.data = &scope;
+ table_copy.extra1 = &parent_scope;
- return proc_dointvec_minmax(&table_copy, write, buf, lenp, ppos);
+ err = proc_dointvec_minmax(&table_copy, write, buf, lenp, ppos);
+ if (!err && write)
+ WRITE_ONCE(ns->memfd_noexec_scope, scope);
+ return err;
}
static struct ctl_table pid_ns_ctl_table_vm[] = {
@@ -51,7 +48,6 @@ static inline void register_pid_ns_sysctl_table_vm(void)
register_sysctl("vm", pid_ns_ctl_table_vm);
}
#else
-static inline void initialize_memfd_noexec_scope(struct pid_namespace *ns) {}
static inline void register_pid_ns_sysctl_table_vm(void) {}
#endif
diff --git a/kernel/power/poweroff.c b/kernel/power/poweroff.c
index 562aa0e450ed..1f306f158696 100644
--- a/kernel/power/poweroff.c
+++ b/kernel/power/poweroff.c
@@ -23,7 +23,7 @@ static void do_poweroff(struct work_struct *dummy)
static DECLARE_WORK(poweroff_work, do_poweroff);
-static void handle_poweroff(int key)
+static void handle_poweroff(u8 key)
{
/* run sysrq poweroff on boot cpu */
schedule_work_on(cpumask_first(cpu_online_mask), &poweroff_work);
diff --git a/kernel/printk/internal.h b/kernel/printk/internal.h
index 2a17704136f1..7d4979d5c3ce 100644
--- a/kernel/printk/internal.h
+++ b/kernel/printk/internal.h
@@ -103,3 +103,5 @@ struct printk_message {
u64 seq;
unsigned long dropped;
};
+
+bool other_cpu_in_panic(void);
diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c
index 357a4d18f638..96fc38cb2e84 100644
--- a/kernel/printk/printk.c
+++ b/kernel/printk/printk.c
@@ -88,7 +88,7 @@ EXPORT_SYMBOL(oops_in_progress);
static DEFINE_MUTEX(console_mutex);
/*
- * console_sem protects updates to console->seq and console_suspended,
+ * console_sem protects updates to console->seq
* and also provides serialization for console printing.
*/
static DEFINE_SEMAPHORE(console_sem, 1);
@@ -361,7 +361,7 @@ static bool panic_in_progress(void)
* paths in the console code where we end up in places I want
* locked without the console semaphore held).
*/
-static int console_locked, console_suspended;
+static int console_locked;
/*
* Array of consoles built from command line options (console=)
@@ -538,12 +538,14 @@ char *log_buf_addr_get(void)
{
return log_buf;
}
+EXPORT_SYMBOL_GPL(log_buf_addr_get);
/* Return log buffer size */
u32 log_buf_len_get(void)
{
return log_buf_len;
}
+EXPORT_SYMBOL_GPL(log_buf_len_get);
/*
* Define how much of the log buffer we could take at maximum. The value
@@ -2308,7 +2310,11 @@ asmlinkage int vprintk_emit(int facility, int level,
preempt_enable();
}
- wake_up_klogd();
+ if (in_sched)
+ defer_console_output();
+ else
+ wake_up_klogd();
+
return printed_len;
}
EXPORT_SYMBOL(vprintk_emit);
@@ -2547,22 +2553,46 @@ MODULE_PARM_DESC(console_no_auto_verbose, "Disable console loglevel raise to hig
*/
void suspend_console(void)
{
+ struct console *con;
+
if (!console_suspend_enabled)
return;
pr_info("Suspending console(s) (use no_console_suspend to debug)\n");
pr_flush(1000, true);
- console_lock();
- console_suspended = 1;
- up_console_sem();
+
+ console_list_lock();
+ for_each_console(con)
+ console_srcu_write_flags(con, con->flags | CON_SUSPENDED);
+ console_list_unlock();
+
+ /*
+ * Ensure that all SRCU list walks have completed. All printing
+ * contexts must be able to see that they are suspended so that it
+ * is guaranteed that all printing has stopped when this function
+ * completes.
+ */
+ synchronize_srcu(&console_srcu);
}
void resume_console(void)
{
+ struct console *con;
+
if (!console_suspend_enabled)
return;
- down_console_sem();
- console_suspended = 0;
- console_unlock();
+
+ console_list_lock();
+ for_each_console(con)
+ console_srcu_write_flags(con, con->flags & ~CON_SUSPENDED);
+ console_list_unlock();
+
+ /*
+ * Ensure that all SRCU list walks have completed. All printing
+ * contexts must be able to see they are no longer suspended so
+ * that they are guaranteed to wake up and resume printing.
+ */
+ synchronize_srcu(&console_srcu);
+
pr_flush(1000, true);
}
@@ -2585,6 +2615,26 @@ static int console_cpu_notify(unsigned int cpu)
return 0;
}
+/*
+ * Return true if a panic is in progress on a remote CPU.
+ *
+ * On true, the local CPU should immediately release any printing resources
+ * that may be needed by the panic CPU.
+ */
+bool other_cpu_in_panic(void)
+{
+ if (!panic_in_progress())
+ return false;
+
+ /*
+ * We can use raw_smp_processor_id() here because it is impossible for
+ * the task to be migrated to the panic_cpu, or away from it. If
+ * panic_cpu has already been set, and we're not currently executing on
+ * that CPU, then we never will be.
+ */
+ return atomic_read(&panic_cpu) != raw_smp_processor_id();
+}
+
/**
* console_lock - block the console subsystem from printing
*
@@ -2597,9 +2647,11 @@ void console_lock(void)
{
might_sleep();
+ /* On panic, the console_lock must be left to the panic cpu. */
+ while (other_cpu_in_panic())
+ msleep(1000);
+
down_console_sem();
- if (console_suspended)
- return;
console_locked = 1;
console_may_schedule = 1;
}
@@ -2615,12 +2667,11 @@ EXPORT_SYMBOL(console_lock);
*/
int console_trylock(void)
{
- if (down_trylock_console_sem())
+ /* On panic, the console_lock must be left to the panic cpu. */
+ if (other_cpu_in_panic())
return 0;
- if (console_suspended) {
- up_console_sem();
+ if (down_trylock_console_sem())
return 0;
- }
console_locked = 1;
console_may_schedule = 0;
return 1;
@@ -2634,25 +2685,6 @@ int is_console_locked(void)
EXPORT_SYMBOL(is_console_locked);
/*
- * Return true when this CPU should unlock console_sem without pushing all
- * messages to the console. This reduces the chance that the console is
- * locked when the panic CPU tries to use it.
- */
-static bool abandon_console_lock_in_panic(void)
-{
- if (!panic_in_progress())
- return false;
-
- /*
- * We can use raw_smp_processor_id() here because it is impossible for
- * the task to be migrated to the panic_cpu, or away from it. If
- * panic_cpu has already been set, and we're not currently executing on
- * that CPU, then we never will be.
- */
- return atomic_read(&panic_cpu) != raw_smp_processor_id();
-}
-
-/*
* Check if the given console is currently capable and allowed to print
* records.
*
@@ -2665,6 +2697,9 @@ static inline bool console_is_usable(struct console *con)
if (!(flags & CON_ENABLED))
return false;
+ if ((flags & CON_SUSPENDED))
+ return false;
+
if (!con->write)
return false;
@@ -2948,7 +2983,7 @@ static bool console_flush_all(bool do_cond_resched, u64 *next_seq, bool *handove
any_progress = true;
/* Allow panic_cpu to take over the consoles safely. */
- if (abandon_console_lock_in_panic())
+ if (other_cpu_in_panic())
goto abandon;
if (do_cond_resched)
@@ -2983,11 +3018,6 @@ void console_unlock(void)
bool flushed;
u64 next_seq;
- if (console_suspended) {
- up_console_sem();
- return;
- }
-
/*
* Console drivers are called with interrupts disabled, so
* @console_may_schedule should be cleared before; however, we may
@@ -3045,10 +3075,28 @@ EXPORT_SYMBOL(console_conditional_schedule);
void console_unblank(void)
{
+ bool found_unblank = false;
struct console *c;
int cookie;
/*
+ * First check if there are any consoles implementing the unblank()
+ * callback. If not, there is no reason to continue and take the
+ * console lock, which in particular can be dangerous if
+ * @oops_in_progress is set.
+ */
+ cookie = console_srcu_read_lock();
+ for_each_console_srcu(c) {
+ if ((console_srcu_read_flags(c) & CON_ENABLED) && c->unblank) {
+ found_unblank = true;
+ break;
+ }
+ }
+ console_srcu_read_unlock(cookie);
+ if (!found_unblank)
+ return;
+
+ /*
* Stop console printing because the unblank() callback may
* assume the console is not within its write() callback.
*
@@ -3056,6 +3104,16 @@ void console_unblank(void)
* In that case, attempt a trylock as best-effort.
*/
if (oops_in_progress) {
+ /* Semaphores are not NMI-safe. */
+ if (in_nmi())
+ return;
+
+ /*
+ * Attempting to trylock the console lock can deadlock
+ * if another CPU was stopped while modifying the
+ * semaphore. "Hope and pray" that this is not the
+ * current situation.
+ */
if (down_trylock_console_sem() != 0)
return;
} else
@@ -3085,14 +3143,24 @@ void console_unblank(void)
*/
void console_flush_on_panic(enum con_flush_mode mode)
{
+ bool handover;
+ u64 next_seq;
+
/*
- * If someone else is holding the console lock, trylock will fail
- * and may_schedule may be set. Ignore and proceed to unlock so
- * that messages are flushed out. As this can be called from any
- * context and we don't want to get preempted while flushing,
- * ensure may_schedule is cleared.
+ * Ignore the console lock and flush out the messages. Attempting a
+ * trylock would not be useful because:
+ *
+ * - if it is contended, it must be ignored anyway
+ * - console_lock() and console_trylock() block and fail
+ * respectively in panic for non-panic CPUs
+ * - semaphores are not NMI-safe
+ */
+
+ /*
+ * If another context is holding the console lock,
+ * @console_may_schedule might be set. Clear it so that
+ * this context does not call cond_resched() while flushing.
*/
- console_trylock();
console_may_schedule = 0;
if (mode == CONSOLE_REPLAY_ALL) {
@@ -3105,15 +3173,15 @@ void console_flush_on_panic(enum con_flush_mode mode)
cookie = console_srcu_read_lock();
for_each_console_srcu(c) {
/*
- * If the above console_trylock() failed, this is an
- * unsynchronized assignment. But in that case, the
+ * This is an unsynchronized assignment, but the
* kernel is in "hope and pray" mode anyway.
*/
c->seq = seq;
}
console_srcu_read_unlock(cookie);
}
- console_unlock();
+
+ console_flush_all(false, &next_seq, &handover);
}
/*
@@ -3679,8 +3747,7 @@ static bool __pr_flush(struct console *con, int timeout_ms, bool reset_on_progre
/*
* Hold the console_lock to guarantee safe access to
- * console->seq and to prevent changes to @console_suspended
- * until all consoles have been processed.
+ * console->seq.
*/
console_lock();
@@ -3688,6 +3755,11 @@ static bool __pr_flush(struct console *con, int timeout_ms, bool reset_on_progre
for_each_console_srcu(c) {
if (con && con != c)
continue;
+ /*
+ * If consoles are not usable, it cannot be expected
+ * that they make forward progress, so only increment
+ * @diff for usable consoles.
+ */
if (!console_is_usable(c))
continue;
printk_seq = c->seq;
@@ -3696,18 +3768,12 @@ static bool __pr_flush(struct console *con, int timeout_ms, bool reset_on_progre
}
console_srcu_read_unlock(cookie);
- /*
- * If consoles are suspended, it cannot be expected that they
- * make forward progress, so timeout immediately. @diff is
- * still used to return a valid flush status.
- */
- if (console_suspended)
- remaining = 0;
- else if (diff != last_diff && reset_on_progress)
+ if (diff != last_diff && reset_on_progress)
remaining = timeout_ms;
console_unlock();
+ /* Note: @diff is 0 if there are no usable consoles. */
if (diff == 0 || remaining == 0)
break;
@@ -3741,7 +3807,7 @@ static bool __pr_flush(struct console *con, int timeout_ms, bool reset_on_progre
* printer has been seen to make some forward progress.
*
* Context: Process context. May sleep while acquiring console lock.
- * Return: true if all enabled printers are caught up.
+ * Return: true if all usable printers are caught up.
*/
static bool pr_flush(int timeout_ms, bool reset_on_progress)
{
@@ -3798,11 +3864,33 @@ static void __wake_up_klogd(int val)
preempt_enable();
}
+/**
+ * wake_up_klogd - Wake kernel logging daemon
+ *
+ * Use this function when new records have been added to the ringbuffer
+ * and the console printing of those records has already occurred or is
+ * known to be handled by some other context. This function will only
+ * wake the logging daemon.
+ *
+ * Context: Any context.
+ */
void wake_up_klogd(void)
{
__wake_up_klogd(PRINTK_PENDING_WAKEUP);
}
+/**
+ * defer_console_output - Wake kernel logging daemon and trigger
+ * console printing in a deferred context
+ *
+ * Use this function when new records have been added to the ringbuffer,
+ * this context is responsible for console printing those records, but
+ * the current context is not allowed to perform the console printing.
+ * Trigger an irq_work context to perform the console printing. This
+ * function also wakes the logging daemon.
+ *
+ * Context: Any context.
+ */
void defer_console_output(void)
{
/*
@@ -3819,12 +3907,7 @@ void printk_trigger_flush(void)
int vprintk_deferred(const char *fmt, va_list args)
{
- int r;
-
- r = vprintk_emit(0, LOGLEVEL_SCHED, NULL, fmt, args);
- defer_console_output();
-
- return r;
+ return vprintk_emit(0, LOGLEVEL_SCHED, NULL, fmt, args);
}
int _printk_deferred(const char *fmt, ...)
diff --git a/kernel/printk/printk_ringbuffer.c b/kernel/printk/printk_ringbuffer.c
index 2dc4d5a1f1ff..fde338606ce8 100644
--- a/kernel/printk/printk_ringbuffer.c
+++ b/kernel/printk/printk_ringbuffer.c
@@ -1735,7 +1735,7 @@ static bool copy_data(struct prb_data_ring *data_ring,
if (!buf || !buf_size)
return true;
- data_size = min_t(u16, buf_size, len);
+ data_size = min_t(unsigned int, buf_size, len);
memcpy(&buf[0], data, data_size); /* LMM(copy_data:A) */
return true;
diff --git a/kernel/printk/printk_safe.c b/kernel/printk/printk_safe.c
index ef0f9a2044da..6d10927a07d8 100644
--- a/kernel/printk/printk_safe.c
+++ b/kernel/printk/printk_safe.c
@@ -38,13 +38,8 @@ asmlinkage int vprintk(const char *fmt, va_list args)
* Use the main logbuf even in NMI. But avoid calling console
* drivers that might have their own locks.
*/
- if (this_cpu_read(printk_context) || in_nmi()) {
- int len;
-
- len = vprintk_store(0, LOGLEVEL_DEFAULT, NULL, fmt, args);
- defer_console_output();
- return len;
- }
+ if (this_cpu_read(printk_context) || in_nmi())
+ return vprintk_deferred(fmt, args);
/* No obstacles. */
return vprintk_default(fmt, args);
diff --git a/kernel/rcu/tree_stall.h b/kernel/rcu/tree_stall.h
index b10b8349bb2a..6f06dc12904a 100644
--- a/kernel/rcu/tree_stall.h
+++ b/kernel/rcu/tree_stall.h
@@ -1035,7 +1035,7 @@ static bool sysrq_rcu;
module_param(sysrq_rcu, bool, 0444);
/* Dump grace-period-request information due to commandeered sysrq. */
-static void sysrq_show_rcu(int key)
+static void sysrq_show_rcu(u8 key)
{
show_rcu_gp_kthreads();
}
diff --git a/kernel/relay.c b/kernel/relay.c
index a80fa01042e9..83fe0325cde1 100644
--- a/kernel/relay.c
+++ b/kernel/relay.c
@@ -375,7 +375,7 @@ static struct dentry *relay_create_buf_file(struct rchan *chan,
*/
static struct rchan_buf *relay_open_buf(struct rchan *chan, unsigned int cpu)
{
- struct rchan_buf *buf = NULL;
+ struct rchan_buf *buf;
struct dentry *dentry;
if (chan->is_global)
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 911d0063763c..8dbff6e7ad4f 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -699,7 +699,7 @@ u64 avg_vruntime(struct cfs_rq *cfs_rq)
*
* XXX could add max_slice to the augmented data to track this.
*/
-void update_entity_lag(struct cfs_rq *cfs_rq, struct sched_entity *se)
+static void update_entity_lag(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
s64 lag, limit;
diff --git a/kernel/signal.c b/kernel/signal.c
index 128e9bb3d1a2..09019017d669 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -22,6 +22,7 @@
#include <linux/sched/cputime.h>
#include <linux/file.h>
#include <linux/fs.h>
+#include <linux/mm.h>
#include <linux/proc_fs.h>
#include <linux/tty.h>
#include <linux/binfmts.h>
@@ -1260,7 +1261,17 @@ int send_signal_locked(int sig, struct kernel_siginfo *info,
static void print_fatal_signal(int signr)
{
struct pt_regs *regs = task_pt_regs(current);
- pr_info("potentially unexpected fatal signal %d.\n", signr);
+ struct file *exe_file;
+
+ exe_file = get_task_exe_file(current);
+ if (exe_file) {
+ pr_info("%pD: %s: potentially unexpected fatal signal %d.\n",
+ exe_file, current->comm, signr);
+ fput(exe_file);
+ } else {
+ pr_info("%s: potentially unexpected fatal signal %d.\n",
+ current->comm, signr);
+ }
#if defined(__i386__) && !defined(__arch_um__)
pr_info("code at %08lx: ", regs->ip);
diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c
index 781de7cc6a4e..e137c1385c56 100644
--- a/kernel/sys_ni.c
+++ b/kernel/sys_ni.c
@@ -274,6 +274,7 @@ COND_SYSCALL(vm86old);
COND_SYSCALL(modify_ldt);
COND_SYSCALL(vm86);
COND_SYSCALL(kexec_file_load);
+COND_SYSCALL(map_shadow_stack);
/* s390 */
COND_SYSCALL(s390_pci_mmio_read);
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index 4df14db4da49..87015e9deacc 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -1045,7 +1045,7 @@ static bool report_idle_softirq(void)
return false;
/* On RT, softirqs handling may be waiting on some lock */
- if (!local_bh_blocked())
+ if (local_bh_blocked())
return false;
pr_warn("NOHZ tick-stop error: local softirq work is pending, handler #%02x!!!\n",
diff --git a/kernel/time/time.c b/kernel/time/time.c
index f4198af60fee..642647f5046b 100644
--- a/kernel/time/time.c
+++ b/kernel/time/time.c
@@ -365,11 +365,14 @@ SYSCALL_DEFINE1(adjtimex_time32, struct old_timex32 __user *, utp)
}
#endif
-/*
- * Convert jiffies to milliseconds and back.
+/**
+ * jiffies_to_msecs - Convert jiffies to milliseconds
+ * @j: jiffies value
*
* Avoid unnecessary multiplications/divisions in the
- * two most common HZ cases:
+ * two most common HZ cases.
+ *
+ * Return: milliseconds value
*/
unsigned int jiffies_to_msecs(const unsigned long j)
{
@@ -388,6 +391,12 @@ unsigned int jiffies_to_msecs(const unsigned long j)
}
EXPORT_SYMBOL(jiffies_to_msecs);
+/**
+ * jiffies_to_usecs - Convert jiffies to microseconds
+ * @j: jiffies value
+ *
+ * Return: microseconds value
+ */
unsigned int jiffies_to_usecs(const unsigned long j)
{
/*
@@ -408,8 +417,15 @@ unsigned int jiffies_to_usecs(const unsigned long j)
}
EXPORT_SYMBOL(jiffies_to_usecs);
-/*
+/**
* mktime64 - Converts date to seconds.
+ * @year0: year to convert
+ * @mon0: month to convert
+ * @day: day to convert
+ * @hour: hour to convert
+ * @min: minute to convert
+ * @sec: second to convert
+ *
* Converts Gregorian date to seconds since 1970-01-01 00:00:00.
* Assumes input in normal date format, i.e. 1980-12-31 23:59:59
* => year=1980, mon=12, day=31, hour=23, min=59, sec=59.
@@ -427,6 +443,8 @@ EXPORT_SYMBOL(jiffies_to_usecs);
*
* An encoding of midnight at the end of the day as 24:00:00 - ie. midnight
* tomorrow - (allowable under ISO 8601) is supported.
+ *
+ * Return: seconds since the epoch time for the given input date
*/
time64_t mktime64(const unsigned int year0, const unsigned int mon0,
const unsigned int day, const unsigned int hour,
@@ -471,8 +489,7 @@ EXPORT_SYMBOL(ns_to_kernel_old_timeval);
* Set seconds and nanoseconds field of a timespec variable and
* normalize to the timespec storage format
*
- * Note: The tv_nsec part is always in the range of
- * 0 <= tv_nsec < NSEC_PER_SEC
+ * Note: The tv_nsec part is always in the range of 0 <= tv_nsec < NSEC_PER_SEC.
* For negative values only the tv_sec field is negative !
*/
void set_normalized_timespec64(struct timespec64 *ts, time64_t sec, s64 nsec)
@@ -501,7 +518,7 @@ EXPORT_SYMBOL(set_normalized_timespec64);
* ns_to_timespec64 - Convert nanoseconds to timespec64
* @nsec: the nanoseconds value to be converted
*
- * Returns the timespec64 representation of the nsec parameter.
+ * Return: the timespec64 representation of the nsec parameter.
*/
struct timespec64 ns_to_timespec64(s64 nsec)
{
@@ -548,6 +565,8 @@ EXPORT_SYMBOL(ns_to_timespec64);
* runtime.
* The _msecs_to_jiffies helpers are the HZ dependent conversion
* routines found in include/linux/jiffies.h
+ *
+ * Return: jiffies value
*/
unsigned long __msecs_to_jiffies(const unsigned int m)
{
@@ -560,6 +579,12 @@ unsigned long __msecs_to_jiffies(const unsigned int m)
}
EXPORT_SYMBOL(__msecs_to_jiffies);
+/**
+ * __usecs_to_jiffies: - convert microseconds to jiffies
+ * @u: time in microseconds
+ *
+ * Return: jiffies value
+ */
unsigned long __usecs_to_jiffies(const unsigned int u)
{
if (u > jiffies_to_usecs(MAX_JIFFY_OFFSET))
@@ -568,7 +593,10 @@ unsigned long __usecs_to_jiffies(const unsigned int u)
}
EXPORT_SYMBOL(__usecs_to_jiffies);
-/*
+/**
+ * timespec64_to_jiffies - convert a timespec64 value to jiffies
+ * @value: pointer to &struct timespec64
+ *
* The TICK_NSEC - 1 rounds up the value to the next resolution. Note
* that a remainder subtract here would not do the right thing as the
* resolution values don't fall on second boundaries. I.e. the line:
@@ -582,8 +610,9 @@ EXPORT_SYMBOL(__usecs_to_jiffies);
*
* The >> (NSEC_JIFFIE_SC - SEC_JIFFIE_SC) converts the scaled nsec
* value to a scaled second value.
+ *
+ * Return: jiffies value
*/
-
unsigned long
timespec64_to_jiffies(const struct timespec64 *value)
{
@@ -601,6 +630,11 @@ timespec64_to_jiffies(const struct timespec64 *value)
}
EXPORT_SYMBOL(timespec64_to_jiffies);
+/**
+ * jiffies_to_timespec64 - convert jiffies value to &struct timespec64
+ * @jiffies: jiffies value
+ * @value: pointer to &struct timespec64
+ */
void
jiffies_to_timespec64(const unsigned long jiffies, struct timespec64 *value)
{
@@ -618,6 +652,13 @@ EXPORT_SYMBOL(jiffies_to_timespec64);
/*
* Convert jiffies/jiffies_64 to clock_t and back.
*/
+
+/**
+ * jiffies_to_clock_t - Convert jiffies to clock_t
+ * @x: jiffies value
+ *
+ * Return: jiffies converted to clock_t (CLOCKS_PER_SEC)
+ */
clock_t jiffies_to_clock_t(unsigned long x)
{
#if (TICK_NSEC % (NSEC_PER_SEC / USER_HZ)) == 0
@@ -632,6 +673,12 @@ clock_t jiffies_to_clock_t(unsigned long x)
}
EXPORT_SYMBOL(jiffies_to_clock_t);
+/**
+ * clock_t_to_jiffies - Convert clock_t to jiffies
+ * @x: clock_t value
+ *
+ * Return: clock_t value converted to jiffies
+ */
unsigned long clock_t_to_jiffies(unsigned long x)
{
#if (HZ % USER_HZ)==0
@@ -649,6 +696,12 @@ unsigned long clock_t_to_jiffies(unsigned long x)
}
EXPORT_SYMBOL(clock_t_to_jiffies);
+/**
+ * jiffies_64_to_clock_t - Convert jiffies_64 to clock_t
+ * @x: jiffies_64 value
+ *
+ * Return: jiffies_64 value converted to 64-bit "clock_t" (CLOCKS_PER_SEC)
+ */
u64 jiffies_64_to_clock_t(u64 x)
{
#if (TICK_NSEC % (NSEC_PER_SEC / USER_HZ)) == 0
@@ -671,6 +724,12 @@ u64 jiffies_64_to_clock_t(u64 x)
}
EXPORT_SYMBOL(jiffies_64_to_clock_t);
+/**
+ * nsec_to_clock_t - Convert nsec value to clock_t
+ * @x: nsec value
+ *
+ * Return: nsec value converted to 64-bit "clock_t" (CLOCKS_PER_SEC)
+ */
u64 nsec_to_clock_t(u64 x)
{
#if (NSEC_PER_SEC % USER_HZ) == 0
@@ -687,6 +746,12 @@ u64 nsec_to_clock_t(u64 x)
#endif
}
+/**
+ * jiffies64_to_nsecs - Convert jiffies64 to nanoseconds
+ * @j: jiffies64 value
+ *
+ * Return: nanoseconds value
+ */
u64 jiffies64_to_nsecs(u64 j)
{
#if !(NSEC_PER_SEC % HZ)
@@ -697,6 +762,12 @@ u64 jiffies64_to_nsecs(u64 j)
}
EXPORT_SYMBOL(jiffies64_to_nsecs);
+/**
+ * jiffies64_to_msecs - Convert jiffies64 to milliseconds
+ * @j: jiffies64 value
+ *
+ * Return: milliseconds value
+ */
u64 jiffies64_to_msecs(const u64 j)
{
#if HZ <= MSEC_PER_SEC && !(MSEC_PER_SEC % HZ)
@@ -719,6 +790,8 @@ EXPORT_SYMBOL(jiffies64_to_msecs);
* note:
* NSEC_PER_SEC = 10^9 = (5^9 * 2^9) = (1953125 * 512)
* ULLONG_MAX ns = 18446744073.709551615 secs = about 584 years
+ *
+ * Return: nsecs converted to jiffies64 value
*/
u64 nsecs_to_jiffies64(u64 n)
{
@@ -750,6 +823,8 @@ EXPORT_SYMBOL(nsecs_to_jiffies64);
* note:
* NSEC_PER_SEC = 10^9 = (5^9 * 2^9) = (1953125 * 512)
* ULLONG_MAX ns = 18446744073.709551615 secs = about 584 years
+ *
+ * Return: nsecs converted to jiffies value
*/
unsigned long nsecs_to_jiffies(u64 n)
{
@@ -757,10 +832,16 @@ unsigned long nsecs_to_jiffies(u64 n)
}
EXPORT_SYMBOL_GPL(nsecs_to_jiffies);
-/*
- * Add two timespec64 values and do a safety check for overflow.
+/**
+ * timespec64_add_safe - Add two timespec64 values and do a safety check
+ * for overflow.
+ * @lhs: first (left) timespec64 to add
+ * @rhs: second (right) timespec64 to add
+ *
* It's assumed that both values are valid (>= 0).
* And, each timespec64 is in normalized form.
+ *
+ * Return: sum of @lhs + @rhs
*/
struct timespec64 timespec64_add_safe(const struct timespec64 lhs,
const struct timespec64 rhs)
@@ -778,6 +859,15 @@ struct timespec64 timespec64_add_safe(const struct timespec64 lhs,
return res;
}
+/**
+ * get_timespec64 - get user's time value into kernel space
+ * @ts: destination &struct timespec64
+ * @uts: user's time value as &struct __kernel_timespec
+ *
+ * Handles compat or 32-bit modes.
+ *
+ * Return: %0 on success or negative errno on error
+ */
int get_timespec64(struct timespec64 *ts,
const struct __kernel_timespec __user *uts)
{
@@ -801,6 +891,14 @@ int get_timespec64(struct timespec64 *ts,
}
EXPORT_SYMBOL_GPL(get_timespec64);
+/**
+ * put_timespec64 - convert timespec64 value to __kernel_timespec format and
+ * copy the latter to userspace
+ * @ts: input &struct timespec64
+ * @uts: user's &struct __kernel_timespec
+ *
+ * Return: %0 on success or negative errno on error
+ */
int put_timespec64(const struct timespec64 *ts,
struct __kernel_timespec __user *uts)
{
@@ -839,6 +937,15 @@ static int __put_old_timespec32(const struct timespec64 *ts64,
return copy_to_user(cts, &ts, sizeof(ts)) ? -EFAULT : 0;
}
+/**
+ * get_old_timespec32 - get user's old-format time value into kernel space
+ * @ts: destination &struct timespec64
+ * @uts: user's old-format time value (&struct old_timespec32)
+ *
+ * Handles X86_X32_ABI compatibility conversion.
+ *
+ * Return: %0 on success or negative errno on error
+ */
int get_old_timespec32(struct timespec64 *ts, const void __user *uts)
{
if (COMPAT_USE_64BIT_TIME)
@@ -848,6 +955,16 @@ int get_old_timespec32(struct timespec64 *ts, const void __user *uts)
}
EXPORT_SYMBOL_GPL(get_old_timespec32);
+/**
+ * put_old_timespec32 - convert timespec64 value to &struct old_timespec32 and
+ * copy the latter to userspace
+ * @ts: input &struct timespec64
+ * @uts: user's &struct old_timespec32
+ *
+ * Handles X86_X32_ABI compatibility conversion.
+ *
+ * Return: %0 on success or negative errno on error
+ */
int put_old_timespec32(const struct timespec64 *ts, void __user *uts)
{
if (COMPAT_USE_64BIT_TIME)
@@ -857,6 +974,13 @@ int put_old_timespec32(const struct timespec64 *ts, void __user *uts)
}
EXPORT_SYMBOL_GPL(put_old_timespec32);
+/**
+ * get_itimerspec64 - get user's &struct __kernel_itimerspec into kernel space
+ * @it: destination &struct itimerspec64
+ * @uit: user's &struct __kernel_itimerspec
+ *
+ * Return: %0 on success or negative errno on error
+ */
int get_itimerspec64(struct itimerspec64 *it,
const struct __kernel_itimerspec __user *uit)
{
@@ -872,6 +996,14 @@ int get_itimerspec64(struct itimerspec64 *it,
}
EXPORT_SYMBOL_GPL(get_itimerspec64);
+/**
+ * put_itimerspec64 - convert &struct itimerspec64 to __kernel_itimerspec format
+ * and copy the latter to userspace
+ * @it: input &struct itimerspec64
+ * @uit: user's &struct __kernel_itimerspec
+ *
+ * Return: %0 on success or negative errno on error
+ */
int put_itimerspec64(const struct itimerspec64 *it,
struct __kernel_itimerspec __user *uit)
{
@@ -887,6 +1019,13 @@ int put_itimerspec64(const struct itimerspec64 *it,
}
EXPORT_SYMBOL_GPL(put_itimerspec64);
+/**
+ * get_old_itimerspec32 - get user's &struct old_itimerspec32 into kernel space
+ * @its: destination &struct itimerspec64
+ * @uits: user's &struct old_itimerspec32
+ *
+ * Return: %0 on success or negative errno on error
+ */
int get_old_itimerspec32(struct itimerspec64 *its,
const struct old_itimerspec32 __user *uits)
{
@@ -898,6 +1037,14 @@ int get_old_itimerspec32(struct itimerspec64 *its,
}
EXPORT_SYMBOL_GPL(get_old_itimerspec32);
+/**
+ * put_old_itimerspec32 - convert &struct itimerspec64 to &struct
+ * old_itimerspec32 and copy the latter to userspace
+ * @its: input &struct itimerspec64
+ * @uits: user's &struct old_itimerspec32
+ *
+ * Return: %0 on success or negative errno on error
+ */
int put_old_itimerspec32(const struct itimerspec64 *its,
struct old_itimerspec32 __user *uits)
{
diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile
index 64b61f67a403..057cd975d014 100644
--- a/kernel/trace/Makefile
+++ b/kernel/trace/Makefile
@@ -99,6 +99,7 @@ obj-$(CONFIG_KGDB_KDB) += trace_kdb.o
endif
obj-$(CONFIG_DYNAMIC_EVENTS) += trace_dynevent.o
obj-$(CONFIG_PROBE_EVENTS) += trace_probe.o
+obj-$(CONFIG_PROBE_EVENTS_BTF_ARGS) += trace_btf.o
obj-$(CONFIG_UPROBE_EVENTS) += trace_uprobe.o
obj-$(CONFIG_BOOTTIME_TRACING) += trace_boot.o
obj-$(CONFIG_FTRACE_RECORD_RECURSION) += trace_recursion_record.o
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index 05c0024815bf..8de8bec5f366 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -6779,8 +6779,7 @@ void ftrace_release_mod(struct module *mod)
last_pg = &ftrace_pages_start;
for (pg = ftrace_pages_start; pg; pg = *last_pg) {
rec = &pg->records[0];
- if (within_module_core(rec->ip, mod) ||
- within_module_init(rec->ip, mod)) {
+ if (within_module(rec->ip, mod)) {
/*
* As core pages are first, the first
* page should never be a module page.
@@ -6852,8 +6851,7 @@ void ftrace_module_enable(struct module *mod)
* not part of this module, then skip this pg,
* which the "break" will do.
*/
- if (!within_module_core(rec->ip, mod) &&
- !within_module_init(rec->ip, mod))
+ if (!within_module(rec->ip, mod))
break;
/* Weak functions should still be ignored */
@@ -7142,9 +7140,7 @@ void ftrace_free_mem(struct module *mod, void *start_ptr, void *end_ptr)
struct dyn_ftrace key;
struct ftrace_mod_map *mod_map = NULL;
struct ftrace_init_func *func, *func_next;
- struct list_head clear_hash;
-
- INIT_LIST_HEAD(&clear_hash);
+ LIST_HEAD(clear_hash);
key.ip = start;
key.flags = end; /* overload flags, as it is unsigned long */
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index 52dea5dd5362..78502d4c7214 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -692,10 +692,7 @@ static void rb_time_set(rb_time_t *t, u64 val)
static inline bool
rb_time_read_cmpxchg(local_t *l, unsigned long expect, unsigned long set)
{
- unsigned long ret;
-
- ret = local_cmpxchg(l, expect, set);
- return ret == expect;
+ return local_try_cmpxchg(l, &expect, set);
}
static bool rb_time_cmpxchg(rb_time_t *t, u64 expect, u64 set)
@@ -752,9 +749,7 @@ static void rb_time_set(rb_time_t *t, u64 val)
static bool rb_time_cmpxchg(rb_time_t *t, u64 expect, u64 set)
{
- u64 val;
- val = local64_cmpxchg(&t->time, expect, set);
- return val == expect;
+ return local64_try_cmpxchg(&t->time, &expect, set);
}
#endif
@@ -1494,14 +1489,11 @@ static bool rb_head_page_replace(struct buffer_page *old,
{
unsigned long *ptr = (unsigned long *)&old->list.prev->next;
unsigned long val;
- unsigned long ret;
val = *ptr & ~RB_FLAG_MASK;
val |= RB_PAGE_HEAD;
- ret = cmpxchg(ptr, val, (unsigned long)&new->list);
-
- return ret == val;
+ return try_cmpxchg(ptr, &val, (unsigned long)&new->list);
}
/*
@@ -3003,7 +2995,6 @@ rb_try_to_discard(struct ring_buffer_per_cpu *cpu_buffer,
{
unsigned long new_index, old_index;
struct buffer_page *bpage;
- unsigned long index;
unsigned long addr;
u64 write_stamp;
u64 delta;
@@ -3060,8 +3051,9 @@ rb_try_to_discard(struct ring_buffer_per_cpu *cpu_buffer,
*/
old_index += write_mask;
new_index += write_mask;
- index = local_cmpxchg(&bpage->write, old_index, new_index);
- if (index == old_index) {
+
+ /* caution: old_index gets updated on cmpxchg failure */
+ if (local_try_cmpxchg(&bpage->write, &old_index, new_index)) {
/* update counters */
local_sub(event_length, &cpu_buffer->entries_bytes);
return true;
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 8e64aaad5361..2b4ded753367 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -3119,7 +3119,6 @@ static void __ftrace_trace_stack(struct trace_buffer *buffer,
struct ftrace_stack *fstack;
struct stack_entry *entry;
int stackidx;
- void *ptr;
/*
* Add one, for this function and the call to save_stack_trace()
@@ -3157,32 +3156,16 @@ static void __ftrace_trace_stack(struct trace_buffer *buffer,
nr_entries = stack_trace_save(fstack->calls, size, skip);
}
- size = nr_entries * sizeof(unsigned long);
event = __trace_buffer_lock_reserve(buffer, TRACE_STACK,
- (sizeof(*entry) - sizeof(entry->caller)) + size,
+ struct_size(entry, caller, nr_entries),
trace_ctx);
if (!event)
goto out;
- ptr = ring_buffer_event_data(event);
- entry = ptr;
-
- /*
- * For backward compatibility reasons, the entry->caller is an
- * array of 8 slots to store the stack. This is also exported
- * to user space. The amount allocated on the ring buffer actually
- * holds enough for the stack specified by nr_entries. This will
- * go into the location of entry->caller. Due to string fortifiers
- * checking the size of the destination of memcpy() it triggers
- * when it detects that size is greater than 8. To hide this from
- * the fortifiers, we use "ptr" and pointer arithmetic to assign caller.
- *
- * The below is really just:
- * memcpy(&entry->caller, fstack->calls, size);
- */
- ptr += offsetof(typeof(*entry), caller);
- memcpy(ptr, fstack->calls, size);
+ entry = ring_buffer_event_data(event);
entry->size = nr_entries;
+ memcpy(&entry->caller, fstack->calls,
+ flex_array_size(entry, caller, nr_entries));
if (!call_filter_check_discard(call, entry, buffer, event))
__buffer_unlock_commit(buffer, event);
@@ -4206,18 +4189,12 @@ static void *s_start(struct seq_file *m, loff_t *pos)
loff_t l = 0;
int cpu;
- /*
- * copy the tracer to avoid using a global lock all around.
- * iter->trace is a copy of current_trace, the pointer to the
- * name may be used instead of a strcmp(), as iter->trace->name
- * will point to the same string as current_trace->name.
- */
mutex_lock(&trace_types_lock);
- if (unlikely(tr->current_trace && iter->trace->name != tr->current_trace->name)) {
+ if (unlikely(tr->current_trace != iter->trace)) {
/* Close iter->trace before switching to the new current tracer */
if (iter->trace->close)
iter->trace->close(iter);
- *iter->trace = *tr->current_trace;
+ iter->trace = tr->current_trace;
/* Reopen the new current tracer */
if (iter->trace->open)
iter->trace->open(iter);
@@ -4829,6 +4806,25 @@ static const struct seq_operations tracer_seq_ops = {
.show = s_show,
};
+/*
+ * Note, as iter itself can be allocated and freed in different
+ * ways, this function is only used to free its content, and not
+ * the iterator itself. The only requirement to all the allocations
+ * is that it must zero all fields (kzalloc), as freeing works with
+ * either allocated content or NULL.
+ */
+static void free_trace_iter_content(struct trace_iterator *iter)
+{
+ /* The fmt is either NULL, allocated or points to static_fmt_buf */
+ if (iter->fmt != static_fmt_buf)
+ kfree(iter->fmt);
+
+ kfree(iter->temp);
+ kfree(iter->buffer_iter);
+ mutex_destroy(&iter->mutex);
+ free_cpumask_var(iter->started);
+}
+
static struct trace_iterator *
__tracing_open(struct inode *inode, struct file *file, bool snapshot)
{
@@ -4870,16 +4866,8 @@ __tracing_open(struct inode *inode, struct file *file, bool snapshot)
iter->fmt = NULL;
iter->fmt_size = 0;
- /*
- * We make a copy of the current tracer to avoid concurrent
- * changes on it while we are reading.
- */
mutex_lock(&trace_types_lock);
- iter->trace = kzalloc(sizeof(*iter->trace), GFP_KERNEL);
- if (!iter->trace)
- goto fail;
-
- *iter->trace = *tr->current_trace;
+ iter->trace = tr->current_trace;
if (!zalloc_cpumask_var(&iter->started, GFP_KERNEL))
goto fail;
@@ -4944,9 +4932,7 @@ __tracing_open(struct inode *inode, struct file *file, bool snapshot)
fail:
mutex_unlock(&trace_types_lock);
- kfree(iter->trace);
- kfree(iter->temp);
- kfree(iter->buffer_iter);
+ free_trace_iter_content(iter);
release:
seq_release_private(inode, file);
return ERR_PTR(-ENOMEM);
@@ -5025,12 +5011,7 @@ static int tracing_release(struct inode *inode, struct file *file)
mutex_unlock(&trace_types_lock);
- mutex_destroy(&iter->mutex);
- free_cpumask_var(iter->started);
- kfree(iter->fmt);
- kfree(iter->temp);
- kfree(iter->trace);
- kfree(iter->buffer_iter);
+ free_trace_iter_content(iter);
seq_release_private(inode, file);
return 0;
@@ -5730,7 +5711,8 @@ static const char readme_msg[] =
"\t fetcharg: (%<register>|$<efield>), @<address>, @<symbol>[+|-<offset>],\n"
#ifdef CONFIG_HAVE_FUNCTION_ARG_ACCESS_API
#ifdef CONFIG_PROBE_EVENTS_BTF_ARGS
- "\t $stack<index>, $stack, $retval, $comm, $arg<N>, <argname>\n"
+ "\t $stack<index>, $stack, $retval, $comm, $arg<N>,\n"
+ "\t <argname>[->field[->field|.field...]],\n"
#else
"\t $stack<index>, $stack, $retval, $comm, $arg<N>,\n"
#endif
@@ -6318,6 +6300,15 @@ static void set_buffer_entries(struct array_buffer *buf, unsigned long val)
per_cpu_ptr(buf->data, cpu)->entries = val;
}
+static void update_buffer_entries(struct array_buffer *buf, int cpu)
+{
+ if (cpu == RING_BUFFER_ALL_CPUS) {
+ set_buffer_entries(buf, ring_buffer_size(buf->buffer, 0));
+ } else {
+ per_cpu_ptr(buf->data, cpu)->entries = ring_buffer_size(buf->buffer, cpu);
+ }
+}
+
#ifdef CONFIG_TRACER_MAX_TRACE
/* resize @tr's buffer to the size of @size_tr's entries */
static int resize_buffer_duplicate_size(struct array_buffer *trace_buf,
@@ -6396,18 +6387,12 @@ static int __tracing_resize_ring_buffer(struct trace_array *tr,
return ret;
}
- if (cpu == RING_BUFFER_ALL_CPUS)
- set_buffer_entries(&tr->max_buffer, size);
- else
- per_cpu_ptr(tr->max_buffer.data, cpu)->entries = size;
+ update_buffer_entries(&tr->max_buffer, cpu);
out:
#endif /* CONFIG_TRACER_MAX_TRACE */
- if (cpu == RING_BUFFER_ALL_CPUS)
- set_buffer_entries(&tr->array_buffer, size);
- else
- per_cpu_ptr(tr->array_buffer.data, cpu)->entries = size;
+ update_buffer_entries(&tr->array_buffer, cpu);
return ret;
}
@@ -6825,10 +6810,7 @@ static int tracing_release_pipe(struct inode *inode, struct file *file)
close_pipe_on_cpu(tr, iter->cpu_file);
mutex_unlock(&trace_types_lock);
- free_cpumask_var(iter->started);
- kfree(iter->fmt);
- kfree(iter->temp);
- mutex_destroy(&iter->mutex);
+ free_trace_iter_content(iter);
kfree(iter);
trace_array_put(tr);
@@ -7618,6 +7600,11 @@ out:
return ret;
}
+static void tracing_swap_cpu_buffer(void *tr)
+{
+ update_max_tr_single((struct trace_array *)tr, current, smp_processor_id());
+}
+
static ssize_t
tracing_snapshot_write(struct file *filp, const char __user *ubuf, size_t cnt,
loff_t *ppos)
@@ -7676,13 +7663,15 @@ tracing_snapshot_write(struct file *filp, const char __user *ubuf, size_t cnt,
ret = tracing_alloc_snapshot_instance(tr);
if (ret < 0)
break;
- local_irq_disable();
/* Now, we're going to swap */
- if (iter->cpu_file == RING_BUFFER_ALL_CPUS)
+ if (iter->cpu_file == RING_BUFFER_ALL_CPUS) {
+ local_irq_disable();
update_max_tr(tr, current, smp_processor_id(), NULL);
- else
- update_max_tr_single(tr, current, iter->cpu_file);
- local_irq_enable();
+ local_irq_enable();
+ } else {
+ smp_call_function_single(iter->cpu_file, tracing_swap_cpu_buffer,
+ (void *)tr, 1);
+ }
break;
default:
if (tr->allocated_snapshot) {
@@ -9486,7 +9475,7 @@ static struct trace_array *trace_array_create(const char *name)
if (!alloc_cpumask_var(&tr->tracing_cpumask, GFP_KERNEL))
goto out_free_tr;
- if (!alloc_cpumask_var(&tr->pipe_cpumask, GFP_KERNEL))
+ if (!zalloc_cpumask_var(&tr->pipe_cpumask, GFP_KERNEL))
goto out_free_tr;
tr->trace_flags = global_trace.trace_flags & ~ZEROED_TRACE_FLAGS;
@@ -10431,7 +10420,7 @@ __init static int tracer_alloc_buffers(void)
if (trace_create_savedcmd() < 0)
goto out_free_temp_buffer;
- if (!alloc_cpumask_var(&global_trace.pipe_cpumask, GFP_KERNEL))
+ if (!zalloc_cpumask_var(&global_trace.pipe_cpumask, GFP_KERNEL))
goto out_free_savedcmd;
/* TODO: make the number of buffers hot pluggable with CPUS */
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index 73eaec158473..5669dd1f90d9 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -77,6 +77,16 @@ enum trace_type {
#undef __array
#define __array(type, item, size) type item[size];
+/*
+ * For backward compatibility, older user space expects to see the
+ * kernel_stack event with a fixed size caller field. But today the fixed
+ * size is ignored by the kernel, and the real structure is dynamic.
+ * Expose to user space: "unsigned long caller[8];" but the real structure
+ * will be "unsigned long caller[] __counted_by(size)"
+ */
+#undef __stack_array
+#define __stack_array(type, item, size, field) type item[] __counted_by(field);
+
#undef __array_desc
#define __array_desc(type, container, item, size)
@@ -596,7 +606,6 @@ trace_buffer_iter(struct trace_iterator *iter, int cpu)
int tracer_init(struct tracer *t, struct trace_array *tr);
int tracing_is_enabled(void);
void tracing_reset_online_cpus(struct array_buffer *buf);
-void tracing_reset_current(int cpu);
void tracing_reset_all_online_cpus(void);
void tracing_reset_all_online_cpus_unlocked(void);
int tracing_open_generic(struct inode *inode, struct file *filp);
@@ -697,7 +706,6 @@ void trace_filter_add_remove_task(struct trace_pid_list *pid_list,
void *trace_pid_next(struct trace_pid_list *pid_list, void *v, loff_t *pos);
void *trace_pid_start(struct trace_pid_list *pid_list, loff_t *pos);
int trace_pid_show(struct seq_file *m, void *v);
-void trace_free_pid_list(struct trace_pid_list *pid_list);
int trace_pid_write(struct trace_pid_list *filtered_pids,
struct trace_pid_list **new_pid_list,
const char __user *ubuf, size_t cnt);
@@ -1334,7 +1342,7 @@ struct trace_subsystem_dir {
struct list_head list;
struct event_subsystem *subsystem;
struct trace_array *tr;
- struct dentry *entry;
+ struct eventfs_file *ef;
int ref_count;
int nr_events;
};
diff --git a/kernel/trace/trace_btf.c b/kernel/trace/trace_btf.c
new file mode 100644
index 000000000000..ca224d53bfdc
--- /dev/null
+++ b/kernel/trace/trace_btf.c
@@ -0,0 +1,122 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <linux/btf.h>
+#include <linux/kernel.h>
+#include <linux/slab.h>
+
+#include "trace_btf.h"
+
+/*
+ * Find a function proto type by name, and return the btf_type with its btf
+ * in *@btf_p. Return NULL if not found.
+ * Note that caller has to call btf_put(*@btf_p) after using the btf_type.
+ */
+const struct btf_type *btf_find_func_proto(const char *func_name, struct btf **btf_p)
+{
+ const struct btf_type *t;
+ s32 id;
+
+ id = bpf_find_btf_id(func_name, BTF_KIND_FUNC, btf_p);
+ if (id < 0)
+ return NULL;
+
+ /* Get BTF_KIND_FUNC type */
+ t = btf_type_by_id(*btf_p, id);
+ if (!t || !btf_type_is_func(t))
+ goto err;
+
+ /* The type of BTF_KIND_FUNC is BTF_KIND_FUNC_PROTO */
+ t = btf_type_by_id(*btf_p, t->type);
+ if (!t || !btf_type_is_func_proto(t))
+ goto err;
+
+ return t;
+err:
+ btf_put(*btf_p);
+ return NULL;
+}
+
+/*
+ * Get the parameter array of @func_proto and store its length in *@nr.
+ * Return NULL if the function has no parameters (*@nr is still set), or
+ * ERR_PTR(-EINVAL) if @func_proto is not a function proto type.
+ */
+const struct btf_param *btf_get_func_param(const struct btf_type *func_proto, s32 *nr)
+{
+	if (!btf_type_is_func_proto(func_proto))
+		return ERR_PTR(-EINVAL);
+
+	*nr = btf_type_vlen(func_proto);
+	if (*nr <= 0)
+		return NULL;
+
+	return (const struct btf_param *)(func_proto + 1);
+}
+
+#define BTF_ANON_STACK_MAX 16
+
+struct btf_anon_stack {
+ u32 tid;
+ u32 offset;
+};
+
+/*
+ * Find a member of the data structure/union @type by @member_name.
+ * Return NULL if not found, -EINVAL if @type is NULL or fails the
+ * btf_type_is_struct() check, or -ENOMEM if the work stack allocation fails.
+ * If the member is a member of an anonymous union/structure, the offset
+ * of that anonymous union/structure is stored into @anon_offset (0 for a
+ * direct member), so the caller can add it to the member's own offset.
+ */
+const struct btf_member *btf_find_struct_member(struct btf *btf,
+						const struct btf_type *type,
+						const char *member_name,
+						u32 *anon_offset)
+{
+	struct btf_anon_stack *anon_stack;
+	const struct btf_member *member;
+	u32 tid, cur_offset = 0;
+	const char *name;
+	int i, top = 0;
+
+	anon_stack = kcalloc(BTF_ANON_STACK_MAX, sizeof(*anon_stack), GFP_KERNEL);
+	if (!anon_stack)
+		return ERR_PTR(-ENOMEM);
+
+retry:
+	if (!type || !btf_type_is_struct(type)) {
+		member = ERR_PTR(-EINVAL);
+		goto out;
+	}
+
+	for_each_member(i, type, member) {
+		if (!member->name_off) {
+			/* Anonymous: push for later; never assign @type, the loop uses it */
+			if (btf_type_skip_modifiers(btf, member->type, &tid) &&
+			    top < BTF_ANON_STACK_MAX) {
+				anon_stack[top].tid = tid;
+				anon_stack[top++].offset =
+					cur_offset + member->offset;
+			}
+		} else {
+			name = btf_name_by_offset(btf, member->name_off);
+			if (name && !strcmp(member_name, name)) {
+				if (anon_offset)
+					*anon_offset = cur_offset;
+				goto out;
+			}
+		}
+	}
+	if (top > 0) {
+		/* Pop from the anonymous stack and retry */
+		tid = anon_stack[--top].tid;
+		cur_offset = anon_stack[top].offset;
+		type = btf_type_by_id(btf, tid);
+		goto retry;
+	}
+	member = NULL;
+
+out:
+	kfree(anon_stack);
+	return member;
+}
+
diff --git a/kernel/trace/trace_btf.h b/kernel/trace/trace_btf.h
new file mode 100644
index 000000000000..4bc44bc261e6
--- /dev/null
+++ b/kernel/trace/trace_btf.h
@@ -0,0 +1,11 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#include <linux/btf.h>
+
+const struct btf_type *btf_find_func_proto(const char *func_name,
+ struct btf **btf_p);
+const struct btf_param *btf_get_func_param(const struct btf_type *func_proto,
+ s32 *nr);
+const struct btf_member *btf_find_struct_member(struct btf *btf,
+ const struct btf_type *type,
+ const char *member_name,
+ u32 *anon_offset);
diff --git a/kernel/trace/trace_entries.h b/kernel/trace/trace_entries.h
index 340b2fa98218..c47422b20908 100644
--- a/kernel/trace/trace_entries.h
+++ b/kernel/trace/trace_entries.h
@@ -190,7 +190,7 @@ FTRACE_ENTRY(kernel_stack, stack_entry,
F_STRUCT(
__field( int, size )
- __array( unsigned long, caller, FTRACE_STACK_ENTRIES )
+ __stack_array( unsigned long, caller, FTRACE_STACK_ENTRIES, size)
),
F_printk("\t=> %ps\n\t=> %ps\n\t=> %ps\n"
diff --git a/kernel/trace/trace_eprobe.c b/kernel/trace/trace_eprobe.c
index a0a704ba27db..72714cbf475c 100644
--- a/kernel/trace/trace_eprobe.c
+++ b/kernel/trace/trace_eprobe.c
@@ -41,6 +41,10 @@ struct eprobe_data {
struct trace_eprobe *ep;
};
+
+#define for_each_trace_eprobe_tp(ep, _tp) \
+ list_for_each_entry(ep, trace_probe_probe_list(_tp), tp.list)
+
static int __trace_eprobe_create(int argc, const char *argv[]);
static void trace_event_probe_cleanup(struct trace_eprobe *ep)
@@ -640,7 +644,7 @@ static int disable_eprobe(struct trace_eprobe *ep,
static int enable_trace_eprobe(struct trace_event_call *call,
struct trace_event_file *file)
{
- struct trace_probe *pos, *tp;
+ struct trace_probe *tp;
struct trace_eprobe *ep;
bool enabled;
int ret = 0;
@@ -662,8 +666,7 @@ static int enable_trace_eprobe(struct trace_event_call *call,
if (enabled)
return 0;
- list_for_each_entry(pos, trace_probe_probe_list(tp), list) {
- ep = container_of(pos, struct trace_eprobe, tp);
+ for_each_trace_eprobe_tp(ep, tp) {
ret = enable_eprobe(ep, file);
if (ret)
break;
@@ -680,8 +683,7 @@ static int enable_trace_eprobe(struct trace_event_call *call,
*/
WARN_ON_ONCE(ret != -ENOMEM);
- list_for_each_entry(pos, trace_probe_probe_list(tp), list) {
- ep = container_of(pos, struct trace_eprobe, tp);
+ for_each_trace_eprobe_tp(ep, tp) {
disable_eprobe(ep, file->tr);
if (!--cnt)
break;
@@ -699,7 +701,7 @@ static int enable_trace_eprobe(struct trace_event_call *call,
static int disable_trace_eprobe(struct trace_event_call *call,
struct trace_event_file *file)
{
- struct trace_probe *pos, *tp;
+ struct trace_probe *tp;
struct trace_eprobe *ep;
tp = trace_probe_primary_from_call(call);
@@ -716,10 +718,8 @@ static int disable_trace_eprobe(struct trace_event_call *call,
trace_probe_clear_flag(tp, TP_FLAG_PROFILE);
if (!trace_probe_is_enabled(tp)) {
- list_for_each_entry(pos, trace_probe_probe_list(tp), list) {
- ep = container_of(pos, struct trace_eprobe, tp);
+ for_each_trace_eprobe_tp(ep, tp)
disable_eprobe(ep, file->tr);
- }
}
out:
@@ -807,13 +807,11 @@ static int trace_eprobe_tp_update_arg(struct trace_eprobe *ep, const char *argv[
int ret;
ret = traceprobe_parse_probe_arg(&ep->tp, i, argv[i], &ctx);
- if (ret)
- return ret;
-
/* Handle symbols "@" */
if (!ret)
ret = traceprobe_update_arg(&ep->tp.args[i]);
+ traceprobe_finish_parse(&ctx);
return ret;
}
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index 578f1f7d49a6..ed367d713be0 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -984,7 +984,7 @@ static void remove_subsystem(struct trace_subsystem_dir *dir)
return;
if (!--dir->nr_events) {
- tracefs_remove(dir->entry);
+ eventfs_remove(dir->ef);
list_del(&dir->list);
__put_system_dir(dir);
}
@@ -1005,7 +1005,7 @@ static void remove_event_file_dir(struct trace_event_file *file)
tracefs_remove(dir);
}
-
+ eventfs_remove(file->ef);
list_del(&file->list);
remove_subsystem(file->system);
free_event_filter(file->filter);
@@ -2291,13 +2291,13 @@ create_new_subsystem(const char *name)
return NULL;
}
-static struct dentry *
+static struct eventfs_file *
event_subsystem_dir(struct trace_array *tr, const char *name,
struct trace_event_file *file, struct dentry *parent)
{
struct event_subsystem *system, *iter;
struct trace_subsystem_dir *dir;
- struct dentry *entry;
+ int res;
/* First see if we did not already create this dir */
list_for_each_entry(dir, &tr->systems, list) {
@@ -2305,7 +2305,7 @@ event_subsystem_dir(struct trace_array *tr, const char *name,
if (strcmp(system->name, name) == 0) {
dir->nr_events++;
file->system = dir;
- return dir->entry;
+ return dir->ef;
}
}
@@ -2329,8 +2329,8 @@ event_subsystem_dir(struct trace_array *tr, const char *name,
} else
__get_system(system);
- dir->entry = tracefs_create_dir(name, parent);
- if (!dir->entry) {
+ dir->ef = eventfs_add_subsystem_dir(name, parent);
+ if (IS_ERR(dir->ef)) {
pr_warn("Failed to create system directory %s\n", name);
__put_system(system);
goto out_free;
@@ -2345,22 +2345,22 @@ event_subsystem_dir(struct trace_array *tr, const char *name,
/* the ftrace system is special, do not create enable or filter files */
if (strcmp(name, "ftrace") != 0) {
- entry = tracefs_create_file("filter", TRACE_MODE_WRITE,
- dir->entry, dir,
+ res = eventfs_add_file("filter", TRACE_MODE_WRITE,
+ dir->ef, dir,
&ftrace_subsystem_filter_fops);
- if (!entry) {
+ if (res) {
kfree(system->filter);
system->filter = NULL;
pr_warn("Could not create tracefs '%s/filter' entry\n", name);
}
- trace_create_file("enable", TRACE_MODE_WRITE, dir->entry, dir,
+ eventfs_add_file("enable", TRACE_MODE_WRITE, dir->ef, dir,
&ftrace_system_enable_fops);
}
list_add(&dir->list, &tr->systems);
- return dir->entry;
+ return dir->ef;
out_free:
kfree(dir);
@@ -2413,36 +2413,37 @@ static int
event_create_dir(struct dentry *parent, struct trace_event_file *file)
{
struct trace_event_call *call = file->event_call;
+ struct eventfs_file *ef_subsystem = NULL;
struct trace_array *tr = file->tr;
- struct dentry *d_events;
const char *name;
int ret;
/*
* If the trace point header did not define TRACE_SYSTEM
- * then the system would be called "TRACE_SYSTEM".
+ * then the system would be called "TRACE_SYSTEM". This should
+ * never happen.
*/
- if (strcmp(call->class->system, TRACE_SYSTEM) != 0) {
- d_events = event_subsystem_dir(tr, call->class->system, file, parent);
- if (!d_events)
- return -ENOMEM;
- } else
- d_events = parent;
+ if (WARN_ON_ONCE(strcmp(call->class->system, TRACE_SYSTEM) == 0))
+ return -ENODEV;
+
+ ef_subsystem = event_subsystem_dir(tr, call->class->system, file, parent);
+ if (!ef_subsystem)
+ return -ENOMEM;
name = trace_event_name(call);
- file->dir = tracefs_create_dir(name, d_events);
- if (!file->dir) {
+ file->ef = eventfs_add_dir(name, ef_subsystem);
+ if (IS_ERR(file->ef)) {
pr_warn("Could not create tracefs '%s' directory\n", name);
return -1;
}
if (call->class->reg && !(call->flags & TRACE_EVENT_FL_IGNORE_ENABLE))
- trace_create_file("enable", TRACE_MODE_WRITE, file->dir, file,
+ eventfs_add_file("enable", TRACE_MODE_WRITE, file->ef, file,
&ftrace_enable_fops);
#ifdef CONFIG_PERF_EVENTS
if (call->event.type && call->class->reg)
- trace_create_file("id", TRACE_MODE_READ, file->dir,
+ eventfs_add_file("id", TRACE_MODE_READ, file->ef,
(void *)(long)call->event.type,
&ftrace_event_id_fops);
#endif
@@ -2458,27 +2459,27 @@ event_create_dir(struct dentry *parent, struct trace_event_file *file)
* triggers or filters.
*/
if (!(call->flags & TRACE_EVENT_FL_IGNORE_ENABLE)) {
- trace_create_file("filter", TRACE_MODE_WRITE, file->dir,
+ eventfs_add_file("filter", TRACE_MODE_WRITE, file->ef,
file, &ftrace_event_filter_fops);
- trace_create_file("trigger", TRACE_MODE_WRITE, file->dir,
+ eventfs_add_file("trigger", TRACE_MODE_WRITE, file->ef,
file, &event_trigger_fops);
}
#ifdef CONFIG_HIST_TRIGGERS
- trace_create_file("hist", TRACE_MODE_READ, file->dir, file,
+ eventfs_add_file("hist", TRACE_MODE_READ, file->ef, file,
&event_hist_fops);
#endif
#ifdef CONFIG_HIST_TRIGGERS_DEBUG
- trace_create_file("hist_debug", TRACE_MODE_READ, file->dir, file,
+ eventfs_add_file("hist_debug", TRACE_MODE_READ, file->ef, file,
&event_hist_debug_fops);
#endif
- trace_create_file("format", TRACE_MODE_READ, file->dir, call,
+ eventfs_add_file("format", TRACE_MODE_READ, file->ef, call,
&ftrace_event_format_fops);
#ifdef CONFIG_TRACE_EVENT_INJECT
if (call->event.type && call->class->reg)
- trace_create_file("inject", 0200, file->dir, file,
+ eventfs_add_file("inject", 0200, file->ef, file,
&event_inject_fops);
#endif
@@ -3631,21 +3632,22 @@ create_event_toplevel_files(struct dentry *parent, struct trace_array *tr)
{
struct dentry *d_events;
struct dentry *entry;
+ int error = 0;
entry = trace_create_file("set_event", TRACE_MODE_WRITE, parent,
tr, &ftrace_set_event_fops);
if (!entry)
return -ENOMEM;
- d_events = tracefs_create_dir("events", parent);
- if (!d_events) {
+ d_events = eventfs_create_events_dir("events", parent);
+ if (IS_ERR(d_events)) {
pr_warn("Could not create tracefs 'events' directory\n");
return -ENOMEM;
}
- entry = trace_create_file("enable", TRACE_MODE_WRITE, d_events,
+ error = eventfs_add_events_file("enable", TRACE_MODE_WRITE, d_events,
tr, &ftrace_tr_enable_fops);
- if (!entry)
+ if (error)
return -ENOMEM;
/* There are not as crucial, just warn if they are not created */
@@ -3658,11 +3660,11 @@ create_event_toplevel_files(struct dentry *parent, struct trace_array *tr)
&ftrace_set_event_notrace_pid_fops);
/* ring buffer internal formats */
- trace_create_file("header_page", TRACE_MODE_READ, d_events,
+ eventfs_add_events_file("header_page", TRACE_MODE_READ, d_events,
ring_buffer_print_page_header,
&ftrace_show_header_fops);
- trace_create_file("header_event", TRACE_MODE_READ, d_events,
+ eventfs_add_events_file("header_event", TRACE_MODE_READ, d_events,
ring_buffer_print_entry_header,
&ftrace_show_header_fops);
@@ -3750,7 +3752,7 @@ int event_trace_del_tracer(struct trace_array *tr)
down_write(&trace_event_sem);
__trace_remove_event_dirs(tr);
- tracefs_remove(tr->event_dir);
+ eventfs_remove_events_dir(tr->event_dir);
up_write(&trace_event_sem);
tr->event_dir = NULL;
diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c
index 1dad64267878..33264e510d16 100644
--- a/kernel/trace/trace_events_filter.c
+++ b/kernel/trace/trace_events_filter.c
@@ -46,15 +46,19 @@ static const char * ops[] = { OPS };
enum filter_pred_fn {
FILTER_PRED_FN_NOP,
FILTER_PRED_FN_64,
+ FILTER_PRED_FN_64_CPUMASK,
FILTER_PRED_FN_S64,
FILTER_PRED_FN_U64,
FILTER_PRED_FN_32,
+ FILTER_PRED_FN_32_CPUMASK,
FILTER_PRED_FN_S32,
FILTER_PRED_FN_U32,
FILTER_PRED_FN_16,
+ FILTER_PRED_FN_16_CPUMASK,
FILTER_PRED_FN_S16,
FILTER_PRED_FN_U16,
FILTER_PRED_FN_8,
+ FILTER_PRED_FN_8_CPUMASK,
FILTER_PRED_FN_S8,
FILTER_PRED_FN_U8,
FILTER_PRED_FN_COMM,
@@ -64,21 +68,25 @@ enum filter_pred_fn {
FILTER_PRED_FN_PCHAR_USER,
FILTER_PRED_FN_PCHAR,
FILTER_PRED_FN_CPU,
+ FILTER_PRED_FN_CPU_CPUMASK,
+ FILTER_PRED_FN_CPUMASK,
+ FILTER_PRED_FN_CPUMASK_CPU,
FILTER_PRED_FN_FUNCTION,
FILTER_PRED_FN_,
FILTER_PRED_TEST_VISITED,
};
struct filter_pred {
- enum filter_pred_fn fn_num;
- u64 val;
- u64 val2;
- struct regex regex;
+ struct regex *regex;
+ struct cpumask *mask;
unsigned short *ops;
struct ftrace_event_field *field;
- int offset;
+ u64 val;
+ u64 val2;
+ enum filter_pred_fn fn_num;
+ int offset;
int not;
- int op;
+ int op;
};
/*
@@ -94,6 +102,8 @@ struct filter_pred {
C(TOO_MANY_OPEN, "Too many '('"), \
C(TOO_MANY_CLOSE, "Too few '('"), \
C(MISSING_QUOTE, "Missing matching quote"), \
+ C(MISSING_BRACE_OPEN, "Missing '{'"), \
+ C(MISSING_BRACE_CLOSE, "Missing '}'"), \
C(OPERAND_TOO_LONG, "Operand too long"), \
C(EXPECT_STRING, "Expecting string field"), \
C(EXPECT_DIGIT, "Expecting numeric field"), \
@@ -103,6 +113,7 @@ struct filter_pred {
C(BAD_SUBSYS_FILTER, "Couldn't find or set field in one of a subsystem's events"), \
C(TOO_MANY_PREDS, "Too many terms in predicate expression"), \
C(INVALID_FILTER, "Meaningless filter expression"), \
+ C(INVALID_CPULIST, "Invalid cpulist"), \
C(IP_FIELD_ONLY, "Only 'ip' field is supported for function trace"), \
C(INVALID_VALUE, "Invalid value (did you forget quotes)?"), \
C(NO_FUNCTION, "Function not found"), \
@@ -186,6 +197,15 @@ enum {
PROCESS_OR = 4,
};
+static void free_predicate(struct filter_pred *pred)
+{
+	if (!pred)		/* nothing was allocated */
+		return;
+	kfree(pred->regex);
+	kfree(pred->mask);
+	kfree(pred);
+}
+
/*
* Without going into a formal proof, this explains the method that is used in
* parsing the logical expressions.
@@ -623,12 +643,64 @@ out_free:
kfree(inverts);
if (prog_stack) {
for (i = 0; prog_stack[i].pred; i++)
- kfree(prog_stack[i].pred);
+ free_predicate(prog_stack[i].pred);
kfree(prog_stack);
}
return ERR_PTR(ret);
}
+static inline int
+do_filter_cpumask(int op, const struct cpumask *mask, const struct cpumask *cmp)
+{
+ switch (op) {
+ case OP_EQ:
+ return cpumask_equal(mask, cmp);
+ case OP_NE:
+ return !cpumask_equal(mask, cmp);
+ case OP_BAND:
+ return cpumask_intersects(mask, cmp);
+ default:
+ return 0;
+ }
+}
+
+/* Optimisation of do_filter_cpumask() for scalar fields */
+static inline int
+do_filter_scalar_cpumask(int op, unsigned int cpu, const struct cpumask *mask)
+{
+ /*
+ * Per the weight-of-one cpumask optimisations, the mask passed in this
+ * function has a weight >= 2, so it is never equal to a single scalar.
+ */
+ switch (op) {
+ case OP_EQ:
+ return false;
+ case OP_NE:
+ return true;
+ case OP_BAND:
+ return cpumask_test_cpu(cpu, mask);
+ default:
+ return 0;
+ }
+}
+
+static inline int
+do_filter_cpumask_scalar(int op, const struct cpumask *mask, unsigned int cpu)
+{
+ switch (op) {
+ case OP_EQ:
+ return cpumask_test_cpu(cpu, mask) &&
+ cpumask_nth(1, mask) >= nr_cpu_ids;
+ case OP_NE:
+ return !cpumask_test_cpu(cpu, mask) ||
+ cpumask_nth(1, mask) < nr_cpu_ids;
+ case OP_BAND:
+ return cpumask_test_cpu(cpu, mask);
+ default:
+ return 0;
+ }
+}
+
enum pred_cmp_types {
PRED_CMP_TYPE_NOP,
PRED_CMP_TYPE_LT,
@@ -672,6 +744,18 @@ static int filter_pred_##type(struct filter_pred *pred, void *event) \
} \
}
+#define DEFINE_CPUMASK_COMPARISON_PRED(size)					\
+static int filter_pred_##size##_cpumask(struct filter_pred *pred, void *event)	\
+{										\
+	u##size *addr = (u##size *)(event + pred->offset);			\
+	unsigned int cpu = *addr;						\
+	/* Range-check the untruncated field: a wide u64 must not alias a CPU */	\
+	if (*addr >= nr_cpu_ids)						\
+		return 0;							\
+										\
+	return do_filter_scalar_cpumask(pred->op, cpu, pred->mask);		\
+}
+
#define DEFINE_EQUALITY_PRED(size) \
static int filter_pred_##size(struct filter_pred *pred, void *event) \
{ \
@@ -693,6 +777,11 @@ DEFINE_COMPARISON_PRED(u16);
DEFINE_COMPARISON_PRED(s8);
DEFINE_COMPARISON_PRED(u8);
+DEFINE_CPUMASK_COMPARISON_PRED(64);
+DEFINE_CPUMASK_COMPARISON_PRED(32);
+DEFINE_CPUMASK_COMPARISON_PRED(16);
+DEFINE_CPUMASK_COMPARISON_PRED(8);
+
DEFINE_EQUALITY_PRED(64);
DEFINE_EQUALITY_PRED(32);
DEFINE_EQUALITY_PRED(16);
@@ -750,7 +839,7 @@ static int filter_pred_string(struct filter_pred *pred, void *event)
char *addr = (char *)(event + pred->offset);
int cmp, match;
- cmp = pred->regex.match(addr, &pred->regex, pred->regex.field_len);
+ cmp = pred->regex->match(addr, pred->regex, pred->regex->field_len);
match = cmp ^ pred->not;
@@ -763,7 +852,7 @@ static __always_inline int filter_pchar(struct filter_pred *pred, char *str)
int len;
len = strlen(str) + 1; /* including tailing '\0' */
- cmp = pred->regex.match(str, &pred->regex, len);
+ cmp = pred->regex->match(str, pred->regex, len);
match = cmp ^ pred->not;
@@ -813,7 +902,7 @@ static int filter_pred_strloc(struct filter_pred *pred, void *event)
char *addr = (char *)(event + str_loc);
int cmp, match;
- cmp = pred->regex.match(addr, &pred->regex, str_len);
+ cmp = pred->regex->match(addr, pred->regex, str_len);
match = cmp ^ pred->not;
@@ -836,7 +925,7 @@ static int filter_pred_strrelloc(struct filter_pred *pred, void *event)
char *addr = (char *)(&item[1]) + str_loc;
int cmp, match;
- cmp = pred->regex.match(addr, &pred->regex, str_len);
+ cmp = pred->regex->match(addr, pred->regex, str_len);
match = cmp ^ pred->not;
@@ -869,12 +958,42 @@ static int filter_pred_cpu(struct filter_pred *pred, void *event)
}
}
+/*
+ * Filter predicate: the CPU running this predicate (as reported by
+ * raw_smp_processor_id()) vs the user-provided cpumask in pred->mask.
+ */
+static int filter_pred_cpu_cpumask(struct filter_pred *pred, void *event)
+{
+ return do_filter_scalar_cpumask(pred->op, raw_smp_processor_id(),
+ pred->mask);
+}
+
+/*
+ * Filter predicate: cpumask stored in the event vs user-provided cpumask.
+ *
+ * The 32-bit word at pred->offset locates the field; its low 16 bits are
+ * used as the byte offset of the cpumask from the start of the event.
+ */
+static int filter_pred_cpumask(struct filter_pred *pred, void *event)
+{
+ u32 data_loc = *(u32 *)(event + pred->offset);
+ const struct cpumask *field_mask = event + (data_loc & 0xffff);
+
+ return do_filter_cpumask(pred->op, field_mask, pred->mask);
+}
+
+/*
+ * Filter predicate: cpumask stored in the event vs a single user-provided
+ * CPU number (pred->val).
+ *
+ * The 32-bit word at pred->offset locates the field; its low 16 bits are
+ * used as the byte offset of the cpumask from the start of the event.
+ */
+static int filter_pred_cpumask_cpu(struct filter_pred *pred, void *event)
+{
+ u32 data_loc = *(u32 *)(event + pred->offset);
+ const struct cpumask *field_mask = event + (data_loc & 0xffff);
+ unsigned int target_cpu = pred->val;
+
+ return do_filter_cpumask_scalar(pred->op, field_mask, target_cpu);
+}
+
/* Filter predicate for COMM. */
static int filter_pred_comm(struct filter_pred *pred, void *event)
{
int cmp;
- cmp = pred->regex.match(current->comm, &pred->regex,
+ cmp = pred->regex->match(current->comm, pred->regex,
TASK_COMM_LEN);
return cmp ^ pred->not;
}
@@ -1004,7 +1123,7 @@ enum regex_type filter_parse_regex(char *buff, int len, char **search, int *not)
static void filter_build_regex(struct filter_pred *pred)
{
- struct regex *r = &pred->regex;
+ struct regex *r = pred->regex;
char *search;
enum regex_type type = MATCH_FULL;
@@ -1169,7 +1288,7 @@ static void free_prog(struct event_filter *filter)
return;
for (i = 0; prog[i].pred; i++)
- kfree(prog[i].pred);
+ free_predicate(prog[i].pred);
kfree(prog);
}
@@ -1236,8 +1355,12 @@ static void filter_free_subsystem_filters(struct trace_subsystem_dir *dir,
int filter_assign_type(const char *type)
{
- if (strstr(type, "__data_loc") && strstr(type, "char"))
- return FILTER_DYN_STRING;
+ if (strstr(type, "__data_loc")) {
+ if (strstr(type, "char"))
+ return FILTER_DYN_STRING;
+ if (strstr(type, "cpumask_t"))
+ return FILTER_CPUMASK;
+ }
if (strstr(type, "__rel_loc") && strstr(type, "char"))
return FILTER_RDYN_STRING;
@@ -1313,24 +1436,32 @@ static int filter_pred_fn_call(struct filter_pred *pred, void *event)
switch (pred->fn_num) {
case FILTER_PRED_FN_64:
return filter_pred_64(pred, event);
+ case FILTER_PRED_FN_64_CPUMASK:
+ return filter_pred_64_cpumask(pred, event);
case FILTER_PRED_FN_S64:
return filter_pred_s64(pred, event);
case FILTER_PRED_FN_U64:
return filter_pred_u64(pred, event);
case FILTER_PRED_FN_32:
return filter_pred_32(pred, event);
+ case FILTER_PRED_FN_32_CPUMASK:
+ return filter_pred_32_cpumask(pred, event);
case FILTER_PRED_FN_S32:
return filter_pred_s32(pred, event);
case FILTER_PRED_FN_U32:
return filter_pred_u32(pred, event);
case FILTER_PRED_FN_16:
return filter_pred_16(pred, event);
+ case FILTER_PRED_FN_16_CPUMASK:
+ return filter_pred_16_cpumask(pred, event);
case FILTER_PRED_FN_S16:
return filter_pred_s16(pred, event);
case FILTER_PRED_FN_U16:
return filter_pred_u16(pred, event);
case FILTER_PRED_FN_8:
return filter_pred_8(pred, event);
+ case FILTER_PRED_FN_8_CPUMASK:
+ return filter_pred_8_cpumask(pred, event);
case FILTER_PRED_FN_S8:
return filter_pred_s8(pred, event);
case FILTER_PRED_FN_U8:
@@ -1349,6 +1480,12 @@ static int filter_pred_fn_call(struct filter_pred *pred, void *event)
return filter_pred_pchar(pred, event);
case FILTER_PRED_FN_CPU:
return filter_pred_cpu(pred, event);
+ case FILTER_PRED_FN_CPU_CPUMASK:
+ return filter_pred_cpu_cpumask(pred, event);
+ case FILTER_PRED_FN_CPUMASK:
+ return filter_pred_cpumask(pred, event);
+ case FILTER_PRED_FN_CPUMASK_CPU:
+ return filter_pred_cpumask_cpu(pred, event);
case FILTER_PRED_FN_FUNCTION:
return filter_pred_function(pred, event);
case FILTER_PRED_TEST_VISITED:
@@ -1553,9 +1690,130 @@ static int parse_pred(const char *str, void *data,
goto err_free;
}
- pred->regex.len = len;
- strncpy(pred->regex.pattern, str + s, len);
- pred->regex.pattern[len] = 0;
+ pred->regex = kzalloc(sizeof(*pred->regex), GFP_KERNEL);
+ if (!pred->regex)
+ goto err_mem;
+ pred->regex->len = len;
+ strncpy(pred->regex->pattern, str + s, len);
+ pred->regex->pattern[len] = 0;
+
+ } else if (!strncmp(str + i, "CPUS", 4)) {
+ unsigned int maskstart;
+ bool single;
+ char *tmp;
+
+ switch (field->filter_type) {
+ case FILTER_CPUMASK:
+ case FILTER_CPU:
+ case FILTER_OTHER:
+ break;
+ default:
+ parse_error(pe, FILT_ERR_ILLEGAL_FIELD_OP, pos + i);
+ goto err_free;
+ }
+
+ switch (op) {
+ case OP_EQ:
+ case OP_NE:
+ case OP_BAND:
+ break;
+ default:
+ parse_error(pe, FILT_ERR_ILLEGAL_FIELD_OP, pos + i);
+ goto err_free;
+ }
+
+ /* Skip CPUS */
+ i += 4;
+ if (str[i++] != '{') {
+ parse_error(pe, FILT_ERR_MISSING_BRACE_OPEN, pos + i);
+ goto err_free;
+ }
+ maskstart = i;
+
+ /* Walk the cpulist until closing } */
+ for (; str[i] && str[i] != '}'; i++)
+ ;
+
+ if (str[i] != '}') {
+ parse_error(pe, FILT_ERR_MISSING_BRACE_CLOSE, pos + i);
+ goto err_free;
+ }
+
+ if (maskstart == i) {
+ parse_error(pe, FILT_ERR_INVALID_CPULIST, pos + i);
+ goto err_free;
+ }
+
+ /* Copy the cpulist between { and } */
+ tmp = kmalloc((i - maskstart) + 1, GFP_KERNEL);
+ if (!tmp)
+ goto err_mem;
+
+ strscpy(tmp, str + maskstart, (i - maskstart) + 1);
+ pred->mask = kzalloc(cpumask_size(), GFP_KERNEL);
+ if (!pred->mask) {
+ kfree(tmp);
+ goto err_mem;
+ }
+
+ /* Now parse it */
+ if (cpulist_parse(tmp, pred->mask)) {
+ kfree(tmp);
+ parse_error(pe, FILT_ERR_INVALID_CPULIST, pos + i);
+ goto err_free;
+ }
+ kfree(tmp);
+
+ /* Move along */
+ i++;
+
+ /*
+ * Optimisation: if the user-provided mask has a weight of one
+ * then we can treat it as a scalar input.
+ */
+ single = cpumask_weight(pred->mask) == 1;
+ if (single) {
+ pred->val = cpumask_first(pred->mask);
+ kfree(pred->mask);
+ pred->mask = NULL;
+ }
+
+ if (field->filter_type == FILTER_CPUMASK) {
+ pred->fn_num = single ?
+ FILTER_PRED_FN_CPUMASK_CPU :
+ FILTER_PRED_FN_CPUMASK;
+ } else if (field->filter_type == FILTER_CPU) {
+ if (single) {
+ if (pred->op == OP_BAND)
+ pred->op = OP_EQ;
+
+ pred->fn_num = FILTER_PRED_FN_CPU;
+ } else {
+ pred->fn_num = FILTER_PRED_FN_CPU_CPUMASK;
+ }
+ } else if (single) {
+ if (pred->op == OP_BAND)
+ pred->op = OP_EQ;
+
+ pred->fn_num = select_comparison_fn(pred->op, field->size, false);
+ if (pred->op == OP_NE)
+ pred->not = 1;
+ } else {
+ switch (field->size) {
+ case 8:
+ pred->fn_num = FILTER_PRED_FN_64_CPUMASK;
+ break;
+ case 4:
+ pred->fn_num = FILTER_PRED_FN_32_CPUMASK;
+ break;
+ case 2:
+ pred->fn_num = FILTER_PRED_FN_16_CPUMASK;
+ break;
+ case 1:
+ pred->fn_num = FILTER_PRED_FN_8_CPUMASK;
+ break;
+ }
+ }
/* This is either a string, or an integer */
} else if (str[i] == '\'' || str[i] == '"') {
@@ -1597,9 +1855,12 @@ static int parse_pred(const char *str, void *data,
goto err_free;
}
- pred->regex.len = len;
- strncpy(pred->regex.pattern, str + s, len);
- pred->regex.pattern[len] = 0;
+ pred->regex = kzalloc(sizeof(*pred->regex), GFP_KERNEL);
+ if (!pred->regex)
+ goto err_mem;
+ pred->regex->len = len;
+ strncpy(pred->regex->pattern, str + s, len);
+ pred->regex->pattern[len] = 0;
filter_build_regex(pred);
@@ -1608,7 +1869,7 @@ static int parse_pred(const char *str, void *data,
} else if (field->filter_type == FILTER_STATIC_STRING) {
pred->fn_num = FILTER_PRED_FN_STRING;
- pred->regex.field_len = field->size;
+ pred->regex->field_len = field->size;
} else if (field->filter_type == FILTER_DYN_STRING) {
pred->fn_num = FILTER_PRED_FN_STRLOC;
@@ -1691,10 +1952,10 @@ static int parse_pred(const char *str, void *data,
return i;
err_free:
- kfree(pred);
+ free_predicate(pred);
return -EINVAL;
err_mem:
- kfree(pred);
+ free_predicate(pred);
return -ENOMEM;
}
@@ -2287,8 +2548,8 @@ static int ftrace_function_set_filter_pred(struct filter_pred *pred,
return ret;
return __ftrace_function_set_filter(pred->op == OP_EQ,
- pred->regex.pattern,
- pred->regex.len,
+ pred->regex->pattern,
+ pred->regex->len,
data);
}
diff --git a/kernel/trace/trace_events_user.c b/kernel/trace/trace_events_user.c
index 33cb6af31f39..6f046650e527 100644
--- a/kernel/trace/trace_events_user.c
+++ b/kernel/trace/trace_events_user.c
@@ -1328,14 +1328,14 @@ static int user_field_set_string(struct ftrace_event_field *field,
static int user_event_set_print_fmt(struct user_event *user, char *buf, int len)
{
- struct ftrace_event_field *field, *next;
+ struct ftrace_event_field *field;
struct list_head *head = &user->fields;
int pos = 0, depth = 0;
const char *str_func;
pos += snprintf(buf + pos, LEN_OR_ZERO, "\"");
- list_for_each_entry_safe_reverse(field, next, head, link) {
+ list_for_each_entry_reverse(field, head, link) {
if (depth != 0)
pos += snprintf(buf + pos, LEN_OR_ZERO, " ");
@@ -1347,7 +1347,7 @@ static int user_event_set_print_fmt(struct user_event *user, char *buf, int len)
pos += snprintf(buf + pos, LEN_OR_ZERO, "\"");
- list_for_each_entry_safe_reverse(field, next, head, link) {
+ list_for_each_entry_reverse(field, head, link) {
if (user_field_is_dyn_string(field->type, &str_func))
pos += snprintf(buf + pos, LEN_OR_ZERO,
", %s(%s)", str_func, field->name);
@@ -1732,7 +1732,7 @@ static int user_event_create(const char *raw_command)
static int user_event_show(struct seq_file *m, struct dyn_event *ev)
{
struct user_event *user = container_of(ev, struct user_event, devent);
- struct ftrace_event_field *field, *next;
+ struct ftrace_event_field *field;
struct list_head *head;
int depth = 0;
@@ -1740,7 +1740,7 @@ static int user_event_show(struct seq_file *m, struct dyn_event *ev)
head = trace_get_fields(&user->call);
- list_for_each_entry_safe_reverse(field, next, head, link) {
+ list_for_each_entry_reverse(field, head, link) {
if (depth == 0)
seq_puts(m, " ");
else
@@ -1816,13 +1816,14 @@ out:
static bool user_fields_match(struct user_event *user, int argc,
const char **argv)
{
- struct ftrace_event_field *field, *next;
+ struct ftrace_event_field *field;
struct list_head *head = &user->fields;
int i = 0;
- list_for_each_entry_safe_reverse(field, next, head, link)
+ list_for_each_entry_reverse(field, head, link) {
if (!user_field_match(field, argc, argv, &i))
return false;
+ }
if (i != argc)
return false;
diff --git a/kernel/trace/trace_export.c b/kernel/trace/trace_export.c
index 58f3946081e2..1698fc22afa0 100644
--- a/kernel/trace/trace_export.c
+++ b/kernel/trace/trace_export.c
@@ -51,6 +51,9 @@ static int ftrace_event_register(struct trace_event_call *call,
#undef __array
#define __array(type, item, size) type item[size];
+#undef __stack_array
+#define __stack_array(type, item, size, field) __array(type, item, size)
+
#undef __array_desc
#define __array_desc(type, container, item, size) type item[size];
@@ -114,6 +117,9 @@ static void __always_unused ____ftrace_check_##name(void) \
is_signed_type(_type), .filter_type = FILTER_OTHER, \
.len = _len },
+#undef __stack_array
+#define __stack_array(_type, _item, _len, _field) __array(_type, _item, _len)
+
#undef __array_desc
#define __array_desc(_type, _container, _item, _len) __array(_type, _item, _len)
@@ -149,6 +155,9 @@ static struct trace_event_fields ftrace_event_fields_##name[] = { \
#undef __array
#define __array(type, item, len)
+#undef __stack_array
+#define __stack_array(type, item, len, field)
+
#undef __array_desc
#define __array_desc(type, container, item, len)
diff --git a/kernel/trace/trace_fprobe.c b/kernel/trace/trace_fprobe.c
index dfe2e546acdc..8bfe23af9c73 100644
--- a/kernel/trace/trace_fprobe.c
+++ b/kernel/trace/trace_fprobe.c
@@ -898,6 +898,46 @@ static struct tracepoint *find_tracepoint(const char *tp_name)
return data.tpoint;
}
+/*
+ * Extract the probed symbol name from argv[1] and decide whether this is a
+ * return probe.
+ *
+ * A "%return" suffix on argv[1] marks a return probe (not allowed when
+ * @is_tracepoint is set); any other '%' suffix is logged as BAD_ADDR_SUFFIX.
+ * Without the suffix, the probe is still treated as a return probe if any
+ * argument from argv[2] on mentions the "$retval" variable.
+ *
+ * On success *@symbol holds a newly allocated copy of the symbol name
+ * (without the suffix) and *@is_return may have been set to true; it is
+ * never cleared here, so the caller must initialise it.
+ *
+ * Return 0 on success, -EINVAL on a bad suffix, -ENOMEM on allocation
+ * failure.
+ */
+static int parse_symbol_and_return(int argc, const char *argv[],
+ char **symbol, bool *is_return,
+ bool is_tracepoint)
+{
+ char *suffix = strchr(argv[1], '%');
+ int idx;
+
+ if (!suffix) {
+ *symbol = kstrdup(argv[1], GFP_KERNEL);
+ } else {
+ int sym_len = suffix - argv[1];
+
+ if (is_tracepoint || strcmp(suffix, "%return") != 0) {
+ trace_probe_log_err(sym_len, BAD_ADDR_SUFFIX);
+ return -EINVAL;
+ }
+ *is_return = true;
+ *symbol = kmemdup_nul(argv[1], sym_len, GFP_KERNEL);
+ }
+ if (!*symbol)
+ return -ENOMEM;
+
+ /* An explicit %return suffix already settled the question. */
+ if (*is_return)
+ return 0;
+
+ /* If there is $retval, this should be a return fprobe. */
+ for (idx = 2; idx < argc; idx++) {
+ const char *hit = strstr(argv[idx], "$retval");
+
+ if (!hit)
+ continue;
+ /* Skip longer identifiers that merely start with "$retval". */
+ if (isalnum(hit[7]) || hit[7] == '_')
+ continue;
+ /*
+ * NOTE: is_tracepoint is deliberately not checked here; it is
+ * checked when the argument itself is parsed.
+ */
+ *is_return = true;
+ break;
+ }
+ return 0;
+}
+
static int __trace_fprobe_create(int argc, const char *argv[])
{
/*
@@ -927,7 +967,7 @@ static int __trace_fprobe_create(int argc, const char *argv[])
struct trace_fprobe *tf = NULL;
int i, len, new_argc = 0, ret = 0;
bool is_return = false;
- char *symbol = NULL, *tmp = NULL;
+ char *symbol = NULL;
const char *event = NULL, *group = FPROBE_EVENT_SYSTEM;
const char **new_argv = NULL;
int maxactive = 0;
@@ -983,20 +1023,10 @@ static int __trace_fprobe_create(int argc, const char *argv[])
trace_probe_log_set_index(1);
/* a symbol(or tracepoint) must be specified */
- symbol = kstrdup(argv[1], GFP_KERNEL);
- if (!symbol)
- return -ENOMEM;
+ ret = parse_symbol_and_return(argc, argv, &symbol, &is_return, is_tracepoint);
+ if (ret < 0)
+ goto parse_error;
- tmp = strchr(symbol, '%');
- if (tmp) {
- if (!is_tracepoint && !strcmp(tmp, "%return")) {
- *tmp = '\0';
- is_return = true;
- } else {
- trace_probe_log_err(tmp - symbol, BAD_ADDR_SUFFIX);
- goto parse_error;
- }
- }
if (!is_return && maxactive) {
trace_probe_log_set_index(0);
trace_probe_log_err(1, BAD_MAXACT_TYPE);
@@ -1096,6 +1126,7 @@ static int __trace_fprobe_create(int argc, const char *argv[])
}
out:
+ traceprobe_finish_parse(&ctx);
trace_probe_log_clear();
kfree(new_argv);
kfree(symbol);
diff --git a/kernel/trace/trace_hwlat.c b/kernel/trace/trace_hwlat.c
index 2f37a6e68aa9..b791524a6536 100644
--- a/kernel/trace/trace_hwlat.c
+++ b/kernel/trace/trace_hwlat.c
@@ -635,7 +635,7 @@ static int s_mode_show(struct seq_file *s, void *v)
else
seq_printf(s, "%s", thread_mode_str[mode]);
- if (mode != MODE_MAX)
+ if (mode < MODE_MAX - 1) /* if mode is any but last */
seq_puts(s, " ");
return 0;
diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c
index 17c21c0b2dd1..3d7a180a8427 100644
--- a/kernel/trace/trace_kprobe.c
+++ b/kernel/trace/trace_kprobe.c
@@ -907,6 +907,7 @@ static int __trace_kprobe_create(int argc, const char *argv[])
}
out:
+ traceprobe_finish_parse(&ctx);
trace_probe_log_clear();
kfree(new_argv);
kfree(symbol);
diff --git a/kernel/trace/trace_probe.c b/kernel/trace/trace_probe.c
index c68a72707852..4dc74d73fc1d 100644
--- a/kernel/trace/trace_probe.c
+++ b/kernel/trace/trace_probe.c
@@ -12,6 +12,7 @@
#define pr_fmt(fmt) "trace_probe: " fmt
#include <linux/bpf.h>
+#include "trace_btf.h"
#include "trace_probe.h"
@@ -304,31 +305,90 @@ static int parse_trace_event_arg(char *arg, struct fetch_insn *code,
#ifdef CONFIG_PROBE_EVENTS_BTF_ARGS
-static struct btf *traceprobe_get_btf(void)
+static u32 btf_type_int(const struct btf_type *t)
{
- struct btf *btf = bpf_get_btf_vmlinux();
+ return *(u32 *)(t + 1);
+}
- if (IS_ERR_OR_NULL(btf))
- return NULL;
+static bool btf_type_is_char_ptr(struct btf *btf, const struct btf_type *type)
+{
+ const struct btf_type *real_type;
+ u32 intdata;
+ s32 tid;
+
+ real_type = btf_type_skip_modifiers(btf, type->type, &tid);
+ if (!real_type)
+ return false;
+
+ if (BTF_INFO_KIND(real_type->info) != BTF_KIND_INT)
+ return false;
- return btf;
+ intdata = btf_type_int(real_type);
+ return !(BTF_INT_ENCODING(intdata) & BTF_INT_SIGNED)
+ && BTF_INT_BITS(intdata) == 8;
}
-static u32 btf_type_int(const struct btf_type *t)
+/*
+ * Return true if @type is a BTF array whose element type, after skipping
+ * modifiers, is an 8-bit integer without the signed encoding flag.
+ *
+ * As in btf_type_is_char_ptr(), the resolved element type is checked for
+ * NULL and for BTF_KIND_INT before its trailing u32 is decoded as integer
+ * metadata; other kinds carry different data after struct btf_type.
+ */
+static bool btf_type_is_char_array(struct btf *btf, const struct btf_type *type)
{
- return *(u32 *)(t + 1);
+ const struct btf_type *real_type;
+ const struct btf_array *array;
+ u32 intdata;
+ s32 tid;
+
+ if (BTF_INFO_KIND(type->info) != BTF_KIND_ARRAY)
+ return false;
+
+ array = (const struct btf_array *)(type + 1);
+
+ real_type = btf_type_skip_modifiers(btf, array->type, &tid);
+ if (!real_type)
+ return false;
+
+ if (BTF_INFO_KIND(real_type->info) != BTF_KIND_INT)
+ return false;
+
+ intdata = btf_type_int(real_type);
+ return !(BTF_INT_ENCODING(intdata) & BTF_INT_SIGNED)
+ && BTF_INT_BITS(intdata) == 8;
}
-static const char *type_from_btf_id(struct btf *btf, s32 id)
+static int check_prepare_btf_string_fetch(char *typename,
+ struct fetch_insn **pcode,
+ struct traceprobe_parse_context *ctx)
+{
+ struct btf *btf = ctx->btf;
+
+ if (!btf || !ctx->last_type)
+ return 0;
+
+ /* char [] does not need any change. */
+ if (btf_type_is_char_array(btf, ctx->last_type))
+ return 0;
+
+ /* char * requires dereference the pointer. */
+ if (btf_type_is_char_ptr(btf, ctx->last_type)) {
+ struct fetch_insn *code = *pcode + 1;
+
+ if (code->op == FETCH_OP_END) {
+ trace_probe_log_err(ctx->offset, TOO_MANY_OPS);
+ return -E2BIG;
+ }
+ if (typename[0] == 'u')
+ code->op = FETCH_OP_UDEREF;
+ else
+ code->op = FETCH_OP_DEREF;
+ code->offset = 0;
+ *pcode = code;
+ return 0;
+ }
+ /* Other types are not available for string */
+ trace_probe_log_err(ctx->offset, BAD_TYPE4STR);
+ return -EINVAL;
+}
+
+static const char *fetch_type_from_btf_type(struct btf *btf,
+ const struct btf_type *type,
+ struct traceprobe_parse_context *ctx)
{
- const struct btf_type *t;
u32 intdata;
- s32 tid;
/* TODO: const char * could be converted as a string */
- t = btf_type_skip_modifiers(btf, id, &tid);
-
- switch (BTF_INFO_KIND(t->info)) {
+ switch (BTF_INFO_KIND(type->info)) {
case BTF_KIND_ENUM:
/* enum is "int", so convert to "s32" */
return "s32";
@@ -341,7 +401,7 @@ static const char *type_from_btf_id(struct btf *btf, s32 id)
else
return "x32";
case BTF_KIND_INT:
- intdata = btf_type_int(t);
+ intdata = btf_type_int(type);
if (BTF_INT_ENCODING(intdata) & BTF_INT_SIGNED) {
switch (BTF_INT_BITS(intdata)) {
case 8:
@@ -364,6 +424,10 @@ static const char *type_from_btf_id(struct btf *btf, s32 id)
case 64:
return "u64";
}
+ /* bitfield, size is encoded in the type */
+ ctx->last_bitsize = BTF_INT_BITS(intdata);
+ ctx->last_bitoffs += BTF_INT_OFFSET(intdata);
+ return "u64";
}
}
/* TODO: support other types */
@@ -371,88 +435,223 @@ static const char *type_from_btf_id(struct btf *btf, s32 id)
return NULL;
}
-static const struct btf_type *find_btf_func_proto(const char *funcname)
+static int query_btf_context(struct traceprobe_parse_context *ctx)
{
- struct btf *btf = traceprobe_get_btf();
- const struct btf_type *t;
- s32 id;
+ const struct btf_param *param;
+ const struct btf_type *type;
+ struct btf *btf;
+ s32 nr;
- if (!btf || !funcname)
- return ERR_PTR(-EINVAL);
+ if (ctx->btf)
+ return 0;
+
+ if (!ctx->funcname)
+ return -EINVAL;
+
+ type = btf_find_func_proto(ctx->funcname, &btf);
+ if (!type)
+ return -ENOENT;
- id = btf_find_by_name_kind(btf, funcname, BTF_KIND_FUNC);
- if (id <= 0)
- return ERR_PTR(-ENOENT);
+ ctx->btf = btf;
+ ctx->proto = type;
+
+ /* ctx->params is optional, since func(void) will not have params. */
+ nr = 0;
+ param = btf_get_func_param(type, &nr);
+ if (!IS_ERR_OR_NULL(param)) {
+ /* Hide the first 'data' argument of tracepoint */
+ if (ctx->flags & TPARG_FL_TPOINT) {
+ nr--;
+ param++;
+ }
+ }
- /* Get BTF_KIND_FUNC type */
- t = btf_type_by_id(btf, id);
- if (!t || !btf_type_is_func(t))
- return ERR_PTR(-ENOENT);
+ if (nr > 0) {
+ ctx->nr_params = nr;
+ ctx->params = param;
+ } else {
+ ctx->nr_params = 0;
+ ctx->params = NULL;
+ }
- /* The type of BTF_KIND_FUNC is BTF_KIND_FUNC_PROTO */
- t = btf_type_by_id(btf, t->type);
- if (!t || !btf_type_is_func_proto(t))
- return ERR_PTR(-ENOENT);
+ return 0;
+}
- return t;
+/*
+ * Release ctx->btf with btf_put() and reset the fields that were filled in
+ * from it (proto, params, nr_params). Does nothing when no BTF is cached.
+ */
+static void clear_btf_context(struct traceprobe_parse_context *ctx)
+{
+ if (!ctx->btf)
+ return;
+
+ btf_put(ctx->btf);
+ ctx->nr_params = 0;
+ ctx->params = NULL;
+ ctx->proto = NULL;
+ ctx->btf = NULL;
}
-static const struct btf_param *find_btf_func_param(const char *funcname, s32 *nr,
- bool tracepoint)
+/*
+ * Split @varname in place at its first field separator ('.' or "->").
+ *
+ * The first byte of the separator is overwritten with '\0' so that @varname
+ * ends there, and *@next_field is set to the text following the separator.
+ * When no separator is found, neither @varname nor *@next_field is touched.
+ *
+ * Return 1 if the field separator is the arrow operator ('->'), 0 if it is
+ * a dot or if there is no separator, or -EINVAL (after logging BAD_HYPHEN)
+ * if a '-' is not followed by '>'.
+ */
+static int split_next_field(char *varname, char **next_field,
+ struct traceprobe_parse_context *ctx)
{
- const struct btf_param *param;
- const struct btf_type *t;
+ char *field;
+ int ret = 0;
+
+ field = strpbrk(varname, ".-");
+ if (field) {
+ if (field[0] == '-' && field[1] == '>') {
+ field[0] = '\0';
+ field += 2;
+ ret = 1;
+ } else if (field[0] == '.') {
+ field[0] = '\0';
+ field += 1;
+ } else {
+ trace_probe_log_err(ctx->offset + field - varname, BAD_HYPHEN);
+ return -EINVAL;
+ }
+ *next_field = field;
+ }
- if (!funcname || !nr)
- return ERR_PTR(-EINVAL);
+ return ret;
+}
- t = find_btf_func_proto(funcname);
- if (IS_ERR(t))
- return (const struct btf_param *)t;
+/*
+ * Parse a chain of structure fields ("a.b->c...") in @fieldname. The @type
+ * must be a pointer type pointing to the target data structure type.
+ *
+ * For every '->' step one FETCH_OP_DEREF instruction is appended after
+ * *@pcode (never reaching @end); dot-connected fields are folded into the
+ * byte offset of that instruction. On success *@pcode points at the last
+ * instruction written and ctx->last_type / last_bitoffs / last_bitsize
+ * describe the final field. ctx->offset is advanced to the field being
+ * parsed so that error logs point at it.
+ *
+ * Return 0 on success or a negative errno after logging the parse error.
+ */
+static int parse_btf_field(char *fieldname, const struct btf_type *type,
+ struct fetch_insn **pcode, struct fetch_insn *end,
+ struct traceprobe_parse_context *ctx)
+{
+ struct fetch_insn *code = *pcode;
+ const struct btf_member *field;
+ u32 bitoffs, anon_offs;
+ char *next;
+ int is_ptr;
+ s32 tid;
- *nr = btf_type_vlen(t);
- param = (const struct btf_param *)(t + 1);
+ do {
+ /* Outer loop for solving arrow operator ('->') */
+ if (BTF_INFO_KIND(type->info) != BTF_KIND_PTR) {
+ trace_probe_log_err(ctx->offset, NO_PTR_STRCT);
+ return -EINVAL;
+ }
+ /* Convert a struct pointer type to a struct type */
+ type = btf_type_skip_modifiers(ctx->btf, type->type, &tid);
+ if (!type) {
+ trace_probe_log_err(ctx->offset, BAD_BTF_TID);
+ return -EINVAL;
+ }
- /* Hide the first 'data' argument of tracepoint */
- if (tracepoint) {
- (*nr)--;
- param++;
- }
+ bitoffs = 0;
+ do {
+ /* Inner loop for solving dot operator ('.') */
+ next = NULL;
+ is_ptr = split_next_field(fieldname, &next, ctx);
+ if (is_ptr < 0)
+ return is_ptr;
+
+ anon_offs = 0;
+ field = btf_find_struct_member(ctx->btf, type, fieldname,
+ &anon_offs);
+ /*
+ * NOTE(review): defensive - btf_find_struct_member() is not
+ * visible here; if it can return ERR_PTR(), it must not be
+ * dereferenced below. Confirm against its definition.
+ */
+ if (IS_ERR(field)) {
+ trace_probe_log_err(ctx->offset, BAD_BTF_TID);
+ return PTR_ERR(field);
+ }
+ if (!field) {
+ trace_probe_log_err(ctx->offset, NO_BTF_FIELD);
+ return -ENOENT;
+ }
+ /* Add anonymous structure/union offset */
+ bitoffs += anon_offs;
+
+ /* Accumulate the bit-offsets of the dot-connected fields */
+ if (btf_type_kflag(type)) {
+ bitoffs += BTF_MEMBER_BIT_OFFSET(field->offset);
+ ctx->last_bitsize = BTF_MEMBER_BITFIELD_SIZE(field->offset);
+ } else {
+ bitoffs += field->offset;
+ ctx->last_bitsize = 0;
+ }
- if (*nr > 0)
- return param;
- else
- return NULL;
+ type = btf_type_skip_modifiers(ctx->btf, field->type, &tid);
+ if (!type) {
+ trace_probe_log_err(ctx->offset, BAD_BTF_TID);
+ return -EINVAL;
+ }
+
+ /*
+ * @next stays NULL on the last field; only advance the error
+ * offset when there really is a following field, otherwise
+ * the subtraction below would involve a NULL pointer.
+ */
+ if (next)
+ ctx->offset += next - fieldname;
+ fieldname = next;
+ } while (!is_ptr && fieldname);
+
+ if (++code == end) {
+ trace_probe_log_err(ctx->offset, TOO_MANY_OPS);
+ return -EINVAL;
+ }
+ code->op = FETCH_OP_DEREF; /* TODO: user deref support */
+ code->offset = bitoffs / 8;
+ *pcode = code;
+
+ ctx->last_bitoffs = bitoffs % 8;
+ ctx->last_type = type;
+ } while (fieldname);
+
+ return 0;
}
-static int parse_btf_arg(const char *varname, struct fetch_insn *code,
+static int parse_btf_arg(char *varname,
+ struct fetch_insn **pcode, struct fetch_insn *end,
struct traceprobe_parse_context *ctx)
{
- struct btf *btf = traceprobe_get_btf();
+ struct fetch_insn *code = *pcode;
const struct btf_param *params;
- int i;
+ const struct btf_type *type;
+ char *field = NULL;
+ int i, is_ptr, ret;
+ u32 tid;
+
+ if (WARN_ON_ONCE(!ctx->funcname))
+ return -EINVAL;
- if (!btf) {
- trace_probe_log_err(ctx->offset, NOSUP_BTFARG);
+ is_ptr = split_next_field(varname, &field, ctx);
+ if (is_ptr < 0)
+ return is_ptr;
+ if (!is_ptr && field) {
+ /* dot-connected field on an argument is not supported. */
+ trace_probe_log_err(ctx->offset + field - varname,
+ NOSUP_DAT_ARG);
return -EOPNOTSUPP;
}
- if (WARN_ON_ONCE(!ctx->funcname))
- return -EINVAL;
+ if (ctx->flags & TPARG_FL_RETURN) {
+ if (strcmp(varname, "$retval") != 0) {
+ trace_probe_log_err(ctx->offset, NO_BTFARG);
+ return -ENOENT;
+ }
+ code->op = FETCH_OP_RETVAL;
+ /* Check whether the function return type is not void */
+ if (query_btf_context(ctx) == 0) {
+ if (ctx->proto->type == 0) {
+ trace_probe_log_err(ctx->offset, NO_RETVAL);
+ return -ENOENT;
+ }
+ tid = ctx->proto->type;
+ goto found;
+ }
+ if (field) {
+ trace_probe_log_err(ctx->offset + field - varname,
+ NO_BTF_ENTRY);
+ return -ENOENT;
+ }
+ return 0;
+ }
- if (!ctx->params) {
- params = find_btf_func_param(ctx->funcname, &ctx->nr_params,
- ctx->flags & TPARG_FL_TPOINT);
- if (IS_ERR_OR_NULL(params)) {
+ if (!ctx->btf) {
+ ret = query_btf_context(ctx);
+ if (ret < 0 || ctx->nr_params == 0) {
trace_probe_log_err(ctx->offset, NO_BTF_ENTRY);
- return PTR_ERR(params);
+ /* 'params' is not assigned yet on this path; return a real errno */
+ return -ENOENT;
}
- ctx->params = params;
- } else
- params = ctx->params;
+ }
+ params = ctx->params;
for (i = 0; i < ctx->nr_params; i++) {
- const char *name = btf_name_by_offset(btf, params[i].name_off);
+ const char *name = btf_name_by_offset(ctx->btf, params[i].name_off);
if (name && !strcmp(name, varname)) {
code->op = FETCH_OP_ARG;
@@ -460,91 +659,114 @@ static int parse_btf_arg(const char *varname, struct fetch_insn *code,
code->param = i + 1;
else
code->param = i;
- return 0;
+ tid = params[i].type;
+ goto found;
}
}
trace_probe_log_err(ctx->offset, NO_BTFARG);
return -ENOENT;
-}
-
-static const struct fetch_type *parse_btf_arg_type(int arg_idx,
- struct traceprobe_parse_context *ctx)
-{
- struct btf *btf = traceprobe_get_btf();
- const char *typestr = NULL;
- if (btf && ctx->params) {
- if (ctx->flags & TPARG_FL_TPOINT)
- arg_idx--;
- typestr = type_from_btf_id(btf, ctx->params[arg_idx].type);
+found:
+ type = btf_type_skip_modifiers(ctx->btf, tid, &tid);
+ if (!type) {
+ trace_probe_log_err(ctx->offset, BAD_BTF_TID);
+ return -EINVAL;
}
-
- return find_fetch_type(typestr, ctx->flags);
+ /* Initialize the last type information */
+ ctx->last_type = type;
+ ctx->last_bitoffs = 0;
+ ctx->last_bitsize = 0;
+ if (field) {
+ ctx->offset += field - varname;
+ return parse_btf_field(field, type, pcode, end, ctx);
+ }
+ return 0;
}
-static const struct fetch_type *parse_btf_retval_type(
+static const struct fetch_type *find_fetch_type_from_btf_type(
struct traceprobe_parse_context *ctx)
{
- struct btf *btf = traceprobe_get_btf();
+ struct btf *btf = ctx->btf;
const char *typestr = NULL;
- const struct btf_type *t;
- if (btf && ctx->funcname) {
- t = find_btf_func_proto(ctx->funcname);
- if (!IS_ERR(t))
- typestr = type_from_btf_id(btf, t->type);
- }
+ if (btf && ctx->last_type)
+ typestr = fetch_type_from_btf_type(btf, ctx->last_type, ctx);
return find_fetch_type(typestr, ctx->flags);
}
-static bool is_btf_retval_void(const char *funcname)
+static int parse_btf_bitfield(struct fetch_insn **pcode,
+ struct traceprobe_parse_context *ctx)
{
- const struct btf_type *t;
+ struct fetch_insn *code = *pcode;
- t = find_btf_func_proto(funcname);
- if (IS_ERR(t))
- return false;
+ if ((ctx->last_bitsize % 8 == 0) && ctx->last_bitoffs == 0)
+ return 0;
+
+ code++;
+ if (code->op != FETCH_OP_NOP) {
+ trace_probe_log_err(ctx->offset, TOO_MANY_OPS);
+ return -EINVAL;
+ }
+ *pcode = code;
- return t->type == 0;
+ code->op = FETCH_OP_MOD_BF;
+ code->lshift = 64 - (ctx->last_bitsize + ctx->last_bitoffs);
+ code->rshift = 64 - ctx->last_bitsize;
+ code->basesize = 64 / 8;
+ return 0;
}
+
#else
-static struct btf *traceprobe_get_btf(void)
+static void clear_btf_context(struct traceprobe_parse_context *ctx)
{
- return NULL;
+ ctx->btf = NULL;
}
-static const struct btf_param *find_btf_func_param(const char *funcname, s32 *nr,
- bool tracepoint)
+static int query_btf_context(struct traceprobe_parse_context *ctx)
{
- return ERR_PTR(-EOPNOTSUPP);
+ return -EOPNOTSUPP;
}
-static int parse_btf_arg(const char *varname, struct fetch_insn *code,
+static int parse_btf_arg(char *varname,
+ struct fetch_insn **pcode, struct fetch_insn *end,
struct traceprobe_parse_context *ctx)
{
trace_probe_log_err(ctx->offset, NOSUP_BTFARG);
return -EOPNOTSUPP;
}
-#define parse_btf_arg_type(idx, ctx) \
- find_fetch_type(NULL, ctx->flags)
+static int parse_btf_bitfield(struct fetch_insn **pcode,
+ struct traceprobe_parse_context *ctx)
+{
+ trace_probe_log_err(ctx->offset, NOSUP_BTFARG);
+ return -EOPNOTSUPP;
+}
-#define parse_btf_retval_type(ctx) \
+#define find_fetch_type_from_btf_type(ctx) \
find_fetch_type(NULL, ctx->flags)
-#define is_btf_retval_void(funcname) (false)
+static int check_prepare_btf_string_fetch(char *typename,
+ struct fetch_insn **pcode,
+ struct traceprobe_parse_context *ctx)
+{
+ return 0;
+}
#endif
#define PARAM_MAX_STACK (THREAD_SIZE / sizeof(unsigned long))
-static int parse_probe_vars(char *arg, const struct fetch_type *t,
- struct fetch_insn *code,
+/* Parse $vars. @orig_arg points '$', which syncs to @ctx->offset */
+static int parse_probe_vars(char *orig_arg, const struct fetch_type *t,
+ struct fetch_insn **pcode,
+ struct fetch_insn *end,
struct traceprobe_parse_context *ctx)
{
- unsigned long param;
+ struct fetch_insn *code = *pcode;
int err = TP_ERR_BAD_VAR;
+ char *arg = orig_arg + 1;
+ unsigned long param;
int ret = 0;
int len;
@@ -563,18 +785,17 @@ static int parse_probe_vars(char *arg, const struct fetch_type *t,
goto inval;
}
- if (strcmp(arg, "retval") == 0) {
- if (ctx->flags & TPARG_FL_RETURN) {
- if ((ctx->flags & TPARG_FL_KERNEL) &&
- is_btf_retval_void(ctx->funcname)) {
- err = TP_ERR_NO_RETVAL;
- goto inval;
- }
+ if (str_has_prefix(arg, "retval")) {
+ if (!(ctx->flags & TPARG_FL_RETURN)) {
+ err = TP_ERR_RETVAL_ON_PROBE;
+ goto inval;
+ }
+ if (!(ctx->flags & TPARG_FL_KERNEL) ||
+ !IS_ENABLED(CONFIG_PROBE_EVENTS_BTF_ARGS)) {
code->op = FETCH_OP_RETVAL;
return 0;
}
- err = TP_ERR_RETVAL_ON_PROBE;
- goto inval;
+ return parse_btf_arg(orig_arg, pcode, end, ctx);
}
len = str_has_prefix(arg, "stack");
@@ -676,7 +897,7 @@ parse_probe_arg(char *arg, const struct fetch_type *type,
switch (arg[0]) {
case '$':
- ret = parse_probe_vars(arg + 1, type, code, ctx);
+ ret = parse_probe_vars(arg, type, pcode, end, ctx);
break;
case '%': /* named register */
@@ -795,6 +1016,8 @@ parse_probe_arg(char *arg, const struct fetch_type *type,
code->op = deref;
code->offset = offset;
+ /* Reset the last type if used */
+ ctx->last_type = NULL;
}
break;
case '\\': /* Immediate value */
@@ -818,7 +1041,7 @@ parse_probe_arg(char *arg, const struct fetch_type *type,
trace_probe_log_err(ctx->offset, NOSUP_BTFARG);
return -EINVAL;
}
- ret = parse_btf_arg(arg, code, ctx);
+ ret = parse_btf_arg(arg, pcode, end, ctx);
break;
}
}
@@ -964,17 +1187,22 @@ static int traceprobe_parse_probe_arg_body(const char *argv, ssize_t *size,
goto out;
code[FETCH_INSN_MAX - 1].op = FETCH_OP_END;
+ ctx->last_type = NULL;
ret = parse_probe_arg(arg, parg->type, &code, &code[FETCH_INSN_MAX - 1],
ctx);
if (ret)
goto fail;
/* Update storing type if BTF is available */
- if (IS_ENABLED(CONFIG_PROBE_EVENTS_BTF_ARGS) && !t) {
- if (code->op == FETCH_OP_ARG)
- parg->type = parse_btf_arg_type(code->param, ctx);
- else if (code->op == FETCH_OP_RETVAL)
- parg->type = parse_btf_retval_type(ctx);
+ if (IS_ENABLED(CONFIG_PROBE_EVENTS_BTF_ARGS) &&
+ ctx->last_type) {
+ if (!t) {
+ parg->type = find_fetch_type_from_btf_type(ctx);
+ } else if (strstr(t, "string")) {
+ ret = check_prepare_btf_string_fetch(t, &code, ctx);
+ if (ret)
+ goto fail;
+ }
}
ret = -EINVAL;
@@ -1048,6 +1276,11 @@ static int traceprobe_parse_probe_arg_body(const char *argv, ssize_t *size,
trace_probe_log_err(ctx->offset + t - arg, BAD_BITFIELD);
goto fail;
}
+ } else if (IS_ENABLED(CONFIG_PROBE_EVENTS_BTF_ARGS) &&
+ ctx->last_type) {
+ ret = parse_btf_bitfield(&code, ctx);
+ if (ret)
+ goto fail;
}
ret = -EINVAL;
/* Loop(Array) operation */
@@ -1231,7 +1464,6 @@ static int sprint_nth_btf_arg(int idx, const char *type,
char *buf, int bufsize,
struct traceprobe_parse_context *ctx)
{
- struct btf *btf = traceprobe_get_btf();
const char *name;
int ret;
@@ -1239,7 +1471,7 @@ static int sprint_nth_btf_arg(int idx, const char *type,
trace_probe_log_err(0, NO_BTFARG);
return -ENOENT;
}
- name = btf_name_by_offset(btf, ctx->params[idx].name_off);
+ name = btf_name_by_offset(ctx->btf, ctx->params[idx].name_off);
if (!name) {
trace_probe_log_err(0, NO_BTF_ENTRY);
return -ENOENT;
@@ -1260,7 +1492,6 @@ const char **traceprobe_expand_meta_args(int argc, const char *argv[],
const struct btf_param *params = NULL;
int i, j, n, used, ret, args_idx = -1;
const char **new_argv = NULL;
- int nr_params;
ret = argv_has_var_arg(argc, argv, &args_idx, ctx);
if (ret < 0)
@@ -1271,9 +1502,8 @@ const char **traceprobe_expand_meta_args(int argc, const char *argv[],
return NULL;
}
- params = find_btf_func_param(ctx->funcname, &nr_params,
- ctx->flags & TPARG_FL_TPOINT);
- if (IS_ERR_OR_NULL(params)) {
+ ret = query_btf_context(ctx);
+ if (ret < 0 || ctx->nr_params == 0) {
if (args_idx != -1) {
/* $arg* requires BTF info */
trace_probe_log_err(0, NOSUP_BTFARG);
@@ -1282,8 +1512,6 @@ const char **traceprobe_expand_meta_args(int argc, const char *argv[],
*new_argc = argc;
return NULL;
}
- ctx->params = params;
- ctx->nr_params = nr_params;
if (args_idx >= 0)
*new_argc = argc + ctx->nr_params - 1;
@@ -1298,7 +1526,7 @@ const char **traceprobe_expand_meta_args(int argc, const char *argv[],
for (i = 0, j = 0; i < argc; i++) {
trace_probe_log_set_index(i + 2);
if (i == args_idx) {
- for (n = 0; n < nr_params; n++) {
+ for (n = 0; n < ctx->nr_params; n++) {
ret = sprint_nth_btf_arg(n, "", buf + used,
bufsize - used, ctx);
if (ret < 0)
@@ -1337,6 +1565,11 @@ error:
return ERR_PTR(ret);
}
+void traceprobe_finish_parse(struct traceprobe_parse_context *ctx)
+{
+ clear_btf_context(ctx);
+}
+
int traceprobe_update_arg(struct probe_arg *arg)
{
struct fetch_insn *code = arg->code;
diff --git a/kernel/trace/trace_probe.h b/kernel/trace/trace_probe.h
index 7dde806be91e..02b432ae7513 100644
--- a/kernel/trace/trace_probe.h
+++ b/kernel/trace/trace_probe.h
@@ -383,9 +383,15 @@ static inline bool tparg_is_function_entry(unsigned int flags)
struct traceprobe_parse_context {
struct trace_event_call *event;
- const struct btf_param *params;
- s32 nr_params;
- const char *funcname;
+ /* BTF related parameters */
+ const char *funcname; /* Function name in BTF */
+ const struct btf_type *proto; /* Prototype of the function */
+ const struct btf_param *params; /* Parameter of the function */
+ s32 nr_params; /* The number of the parameters */
+ struct btf *btf; /* The BTF to be used */
+ const struct btf_type *last_type; /* Saved type */
+ u32 last_bitoffs; /* Saved bitoffs */
+ u32 last_bitsize; /* Saved bitsize */
unsigned int flags;
int offset;
};
@@ -400,6 +406,12 @@ const char **traceprobe_expand_meta_args(int argc, const char *argv[],
extern int traceprobe_update_arg(struct probe_arg *arg);
extern void traceprobe_free_probe_arg(struct probe_arg *arg);
+/*
+ * If either traceprobe_parse_probe_arg() or traceprobe_expand_meta_args() is called,
+ * this MUST be called to clean up the context and release the resource it holds.
+ */
+void traceprobe_finish_parse(struct traceprobe_parse_context *ctx);
+
extern int traceprobe_split_symbol_offset(char *symbol, long *offset);
int traceprobe_parse_event_name(const char **pevent, const char **pgroup,
char *buf, int offset);
@@ -495,7 +507,14 @@ extern int traceprobe_define_arg_fields(struct trace_event_call *event_call,
C(BAD_VAR_ARGS, "$arg* must be an independent parameter without name etc."),\
C(NOFENTRY_ARGS, "$arg* can be used only on function entry"), \
C(DOUBLE_ARGS, "$arg* can be used only once in the parameters"), \
- C(ARGS_2LONG, "$arg* failed because the argument list is too long"),
+ C(ARGS_2LONG, "$arg* failed because the argument list is too long"), \
+ C(ARGIDX_2BIG, "$argN index is too big"), \
+ C(NO_PTR_STRCT, "This is not a pointer to union/structure."), \
+ C(NOSUP_DAT_ARG, "Non pointer structure/union argument is not supported."),\
+ C(BAD_HYPHEN, "Failed to parse single hyphen. Forgot '>'?"), \
+ C(NO_BTF_FIELD, "This field is not found."), \
+ C(BAD_BTF_TID, "Failed to get BTF type info."),\
+ C(BAD_TYPE4STR, "This type does not fit for string."),
#undef C
#define C(a, b) TP_ERR_##a
diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c
index 576b3bcb8ebd..99c051de412a 100644
--- a/kernel/trace/trace_uprobe.c
+++ b/kernel/trace/trace_uprobe.c
@@ -688,6 +688,7 @@ static int __trace_uprobe_create(int argc, const char **argv)
trace_probe_log_set_index(i + 2);
ret = traceprobe_parse_probe_arg(&tu->tp, i, argv[i], &ctx);
+ traceprobe_finish_parse(&ctx);
if (ret)
goto error;
}
diff --git a/kernel/ucount.c b/kernel/ucount.c
index ee8e57fd6f90..4aa6166cb856 100644
--- a/kernel/ucount.c
+++ b/kernel/ucount.c
@@ -104,7 +104,8 @@ bool setup_userns_sysctls(struct user_namespace *ns)
for (i = 0; i < UCOUNT_COUNTS; i++) {
tbl[i].data = &ns->ucount_max[i];
}
- ns->sysctls = __register_sysctl_table(&ns->set, "user", tbl);
+ ns->sysctls = __register_sysctl_table(&ns->set, "user", tbl,
+ ARRAY_SIZE(user_table));
}
if (!ns->sysctls) {
kfree(tbl);
@@ -364,7 +365,7 @@ static __init int user_namespace_sysctl_init(void)
* default set so that registrations in the child sets work
* properly.
*/
- user_header = register_sysctl("user", empty);
+ user_header = register_sysctl_sz("user", empty, 0);
kmemleak_ignore(user_header);
BUG_ON(!user_header);
BUG_ON(!setup_userns_sysctls(&init_user_ns));
diff --git a/kernel/watchdog.c b/kernel/watchdog.c
index be38276a365f..d145305d95fe 100644
--- a/kernel/watchdog.c
+++ b/kernel/watchdog.c
@@ -151,9 +151,6 @@ void watchdog_hardlockup_check(unsigned int cpu, struct pt_regs *regs)
*/
if (is_hardlockup(cpu)) {
unsigned int this_cpu = smp_processor_id();
- struct cpumask backtrace_mask;
-
- cpumask_copy(&backtrace_mask, cpu_online_mask);
/* Only print hardlockups once. */
if (per_cpu(watchdog_hardlockup_warned, cpu))
@@ -167,10 +164,8 @@ void watchdog_hardlockup_check(unsigned int cpu, struct pt_regs *regs)
show_regs(regs);
else
dump_stack();
- cpumask_clear_cpu(cpu, &backtrace_mask);
} else {
- if (trigger_single_cpu_backtrace(cpu))
- cpumask_clear_cpu(cpu, &backtrace_mask);
+ trigger_single_cpu_backtrace(cpu);
}
/*
@@ -179,7 +174,7 @@ void watchdog_hardlockup_check(unsigned int cpu, struct pt_regs *regs)
*/
if (sysctl_hardlockup_all_cpu_backtrace &&
!test_and_set_bit(0, &watchdog_hardlockup_all_cpu_dumped))
- trigger_cpumask_backtrace(&backtrace_mask);
+ trigger_allbutcpu_cpu_backtrace(cpu);
if (hardlockup_panic)
nmi_panic(regs, "Hard LOCKUP");
@@ -523,7 +518,7 @@ static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer)
dump_stack();
if (softlockup_all_cpu_backtrace) {
- trigger_allbutself_cpu_backtrace();
+ trigger_allbutcpu_cpu_backtrace(smp_processor_id());
clear_bit_unlock(0, &soft_lockup_nmi_warn);
}
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 800b4208dba9..c85825e17df8 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -122,11 +122,6 @@ enum {
*
* L: pool->lock protected. Access with pool->lock held.
*
- * X: During normal operation, modification requires pool->lock and should
- * be done only from local cpu. Either disabling preemption on local
- * cpu or grabbing pool->lock is enough for read access. If
- * POOL_DISASSOCIATED is set, it's identical to L.
- *
* K: Only modified by worker while holding pool->lock. Can be safely read by
* self, while holding pool->lock or from IRQ context if %current is the
* kworker.
@@ -160,7 +155,7 @@ struct worker_pool {
int cpu; /* I: the associated cpu */
int node; /* I: the associated node ID */
int id; /* I: pool ID */
- unsigned int flags; /* X: flags */
+ unsigned int flags; /* L: flags */
unsigned long watchdog_ts; /* L: watchdog timestamp */
bool cpu_stall; /* WD: stalled cpu bound pool */
@@ -216,6 +211,7 @@ enum pool_workqueue_stats {
PWQ_STAT_CPU_TIME, /* total CPU time consumed */
PWQ_STAT_CPU_INTENSIVE, /* wq_cpu_intensive_thresh_us violations */
PWQ_STAT_CM_WAKEUP, /* concurrency-management worker wakeups */
+ PWQ_STAT_REPATRIATED, /* unbound workers brought back into scope */
PWQ_STAT_MAYDAY, /* maydays to rescuer */
PWQ_STAT_RESCUED, /* linked work items executed by rescuer */
@@ -262,12 +258,12 @@ struct pool_workqueue {
u64 stats[PWQ_NR_STATS];
/*
- * Release of unbound pwq is punted to system_wq. See put_pwq()
- * and pwq_unbound_release_workfn() for details. pool_workqueue
- * itself is also RCU protected so that the first pwq can be
- * determined without grabbing wq->mutex.
+ * Release of unbound pwq is punted to a kthread_worker. See put_pwq()
+ * and pwq_release_workfn() for details. pool_workqueue itself is also
+ * RCU protected so that the first pwq can be determined without
+ * grabbing wq->mutex.
*/
- struct work_struct unbound_release_work;
+ struct kthread_work release_work;
struct rcu_head rcu;
} __aligned(1 << WORK_STRUCT_FLAG_BITS);
@@ -326,14 +322,33 @@ struct workqueue_struct {
/* hot fields used during command issue, aligned to cacheline */
unsigned int flags ____cacheline_aligned; /* WQ: WQ_* flags */
- struct pool_workqueue __percpu *cpu_pwqs; /* I: per-cpu pwqs */
- struct pool_workqueue __rcu *numa_pwq_tbl[]; /* PWR: unbound pwqs indexed by node */
+ struct pool_workqueue __percpu __rcu **cpu_pwq; /* I: per-cpu pwqs */
};
static struct kmem_cache *pwq_cache;
-static cpumask_var_t *wq_numa_possible_cpumask;
- /* possible CPUs of each node */
+/*
+ * Each pod type describes how CPUs should be grouped for unbound workqueues.
+ * See the comment above workqueue_attrs->affn_scope.
+ */
+struct wq_pod_type {
+ int nr_pods; /* number of pods */
+ cpumask_var_t *pod_cpus; /* pod -> cpus */
+ int *pod_node; /* pod -> node */
+ int *cpu_pod; /* cpu -> pod */
+};
+
+static struct wq_pod_type wq_pod_types[WQ_AFFN_NR_TYPES];
+static enum wq_affn_scope wq_affn_dfl = WQ_AFFN_CACHE;
+
+static const char *wq_affn_names[WQ_AFFN_NR_TYPES] = {
+ [WQ_AFFN_DFL] = "default",
+ [WQ_AFFN_CPU] = "cpu",
+ [WQ_AFFN_SMT] = "smt",
+ [WQ_AFFN_CACHE] = "cache",
+ [WQ_AFFN_NUMA] = "numa",
+ [WQ_AFFN_SYSTEM] = "system",
+};
/*
* Per-cpu work items which run for longer than the following threshold are
@@ -345,19 +360,14 @@ static cpumask_var_t *wq_numa_possible_cpumask;
static unsigned long wq_cpu_intensive_thresh_us = ULONG_MAX;
module_param_named(cpu_intensive_thresh_us, wq_cpu_intensive_thresh_us, ulong, 0644);
-static bool wq_disable_numa;
-module_param_named(disable_numa, wq_disable_numa, bool, 0444);
-
/* see the comment above the definition of WQ_POWER_EFFICIENT */
static bool wq_power_efficient = IS_ENABLED(CONFIG_WQ_POWER_EFFICIENT_DEFAULT);
module_param_named(power_efficient, wq_power_efficient, bool, 0444);
static bool wq_online; /* can kworkers be created yet? */
-static bool wq_numa_enabled; /* unbound NUMA affinity enabled */
-
-/* buf for wq_update_unbound_numa_attrs(), protected by CPU hotplug exclusion */
-static struct workqueue_attrs *wq_update_unbound_numa_attrs_buf;
+/* buf for wq_update_unbound_pod_attrs(), protected by CPU hotplug exclusion */
+static struct workqueue_attrs *wq_update_pod_attrs_buf;
static DEFINE_MUTEX(wq_pool_mutex); /* protects pools and workqueues list */
static DEFINE_MUTEX(wq_pool_attach_mutex); /* protects worker attach/detach */
@@ -371,6 +381,9 @@ static bool workqueue_freezing; /* PL: have wqs started freezing? */
/* PL&A: allowable cpus for unbound wqs and work items */
static cpumask_var_t wq_unbound_cpumask;
+/* for further constraining wq_unbound_cpumask via cmdline parameter */
+static struct cpumask wq_cmdline_cpumask __initdata;
+
/* CPU where unbound work was last round robin scheduled from this CPU */
static DEFINE_PER_CPU(int, wq_rr_cpu_last);
@@ -400,6 +413,13 @@ static struct workqueue_attrs *unbound_std_wq_attrs[NR_STD_WORKER_POOLS];
/* I: attributes used when instantiating ordered pools on demand */
static struct workqueue_attrs *ordered_wq_attrs[NR_STD_WORKER_POOLS];
+/*
+ * I: kthread_worker to release pwq's. pwq release needs to be bounced to a
+ * process context while holding a pool lock. Bounce to a dedicated kthread
+ * worker to avoid A-A deadlocks.
+ */
+static struct kthread_worker *pwq_release_worker;
+
struct workqueue_struct *system_wq __read_mostly;
EXPORT_SYMBOL(system_wq);
struct workqueue_struct *system_highpri_wq __read_mostly;
@@ -606,35 +626,6 @@ static int worker_pool_assign_id(struct worker_pool *pool)
return ret;
}
-/**
- * unbound_pwq_by_node - return the unbound pool_workqueue for the given node
- * @wq: the target workqueue
- * @node: the node ID
- *
- * This must be called with any of wq_pool_mutex, wq->mutex or RCU
- * read locked.
- * If the pwq needs to be used beyond the locking in effect, the caller is
- * responsible for guaranteeing that the pwq stays online.
- *
- * Return: The unbound pool_workqueue for @node.
- */
-static struct pool_workqueue *unbound_pwq_by_node(struct workqueue_struct *wq,
- int node)
-{
- assert_rcu_or_wq_mutex_or_pool_mutex(wq);
-
- /*
- * XXX: @node can be NUMA_NO_NODE if CPU goes offline while a
- * delayed item is pending. The plan is to keep CPU -> NODE
- * mapping valid and stable across CPU on/offlines. Once that
- * happens, this workaround can be removed.
- */
- if (unlikely(node == NUMA_NO_NODE))
- return wq->dfl_pwq;
-
- return rcu_dereference_raw(wq->numa_pwq_tbl[node]);
-}
-
static unsigned int work_color_to_flags(int color)
{
return color << WORK_STRUCT_COLOR_SHIFT;
@@ -825,11 +816,6 @@ static bool work_is_canceling(struct work_struct *work)
* they're being called with pool->lock held.
*/
-static bool __need_more_worker(struct worker_pool *pool)
-{
- return !pool->nr_running;
-}
-
/*
* Need to wake up a worker? Called from anything but currently
* running workers.
@@ -840,7 +826,7 @@ static bool __need_more_worker(struct worker_pool *pool)
*/
static bool need_more_worker(struct worker_pool *pool)
{
- return !list_empty(&pool->worklist) && __need_more_worker(pool);
+ return !list_empty(&pool->worklist) && !pool->nr_running;
}
/* Can I start working? Called from busy but !running workers. */
@@ -871,51 +857,18 @@ static bool too_many_workers(struct worker_pool *pool)
return nr_idle > 2 && (nr_idle - 2) * MAX_IDLE_WORKERS_RATIO >= nr_busy;
}
-/*
- * Wake up functions.
- */
-
-/* Return the first idle worker. Called with pool->lock held. */
-static struct worker *first_idle_worker(struct worker_pool *pool)
-{
- if (unlikely(list_empty(&pool->idle_list)))
- return NULL;
-
- return list_first_entry(&pool->idle_list, struct worker, entry);
-}
-
-/**
- * wake_up_worker - wake up an idle worker
- * @pool: worker pool to wake worker from
- *
- * Wake up the first idle worker of @pool.
- *
- * CONTEXT:
- * raw_spin_lock_irq(pool->lock).
- */
-static void wake_up_worker(struct worker_pool *pool)
-{
- struct worker *worker = first_idle_worker(pool);
-
- if (likely(worker))
- wake_up_process(worker->task);
-}
-
/**
* worker_set_flags - set worker flags and adjust nr_running accordingly
* @worker: self
* @flags: flags to set
*
* Set @flags in @worker->flags and adjust nr_running accordingly.
- *
- * CONTEXT:
- * raw_spin_lock_irq(pool->lock)
*/
static inline void worker_set_flags(struct worker *worker, unsigned int flags)
{
struct worker_pool *pool = worker->pool;
- WARN_ON_ONCE(worker->task != current);
+ lockdep_assert_held(&pool->lock);
/* If transitioning into NOT_RUNNING, adjust nr_running. */
if ((flags & WORKER_NOT_RUNNING) &&
@@ -932,16 +885,13 @@ static inline void worker_set_flags(struct worker *worker, unsigned int flags)
* @flags: flags to clear
*
* Clear @flags in @worker->flags and adjust nr_running accordingly.
- *
- * CONTEXT:
- * raw_spin_lock_irq(pool->lock)
*/
static inline void worker_clr_flags(struct worker *worker, unsigned int flags)
{
struct worker_pool *pool = worker->pool;
unsigned int oflags = worker->flags;
- WARN_ON_ONCE(worker->task != current);
+ lockdep_assert_held(&pool->lock);
worker->flags &= ~flags;
@@ -955,6 +905,244 @@ static inline void worker_clr_flags(struct worker *worker, unsigned int flags)
pool->nr_running++;
}
+/* Return the first idle worker. Called with pool->lock held. */
+static struct worker *first_idle_worker(struct worker_pool *pool)
+{
+ if (unlikely(list_empty(&pool->idle_list)))
+ return NULL;
+
+ return list_first_entry(&pool->idle_list, struct worker, entry);
+}
+
+/**
+ * worker_enter_idle - enter idle state
+ * @worker: worker which is entering idle state
+ *
+ * @worker is entering idle state. Update stats and idle timer if
+ * necessary.
+ *
+ * LOCKING:
+ * raw_spin_lock_irq(pool->lock).
+ */
+static void worker_enter_idle(struct worker *worker)
+{
+ struct worker_pool *pool = worker->pool;
+
+ if (WARN_ON_ONCE(worker->flags & WORKER_IDLE) ||
+ WARN_ON_ONCE(!list_empty(&worker->entry) &&
+ (worker->hentry.next || worker->hentry.pprev)))
+ return;
+
+ /* can't use worker_set_flags(), also called from create_worker() */
+ worker->flags |= WORKER_IDLE;
+ pool->nr_idle++;
+ worker->last_active = jiffies;
+
+ /* idle_list is LIFO */
+ list_add(&worker->entry, &pool->idle_list);
+
+ if (too_many_workers(pool) && !timer_pending(&pool->idle_timer))
+ mod_timer(&pool->idle_timer, jiffies + IDLE_WORKER_TIMEOUT);
+
+ /* Sanity check nr_running. */
+ WARN_ON_ONCE(pool->nr_workers == pool->nr_idle && pool->nr_running);
+}
+
+/**
+ * worker_leave_idle - leave idle state
+ * @worker: worker which is leaving idle state
+ *
+ * @worker is leaving idle state. Update stats.
+ *
+ * LOCKING:
+ * raw_spin_lock_irq(pool->lock).
+ */
+static void worker_leave_idle(struct worker *worker)
+{
+ struct worker_pool *pool = worker->pool;
+
+ if (WARN_ON_ONCE(!(worker->flags & WORKER_IDLE)))
+ return;
+ worker_clr_flags(worker, WORKER_IDLE);
+ pool->nr_idle--;
+ list_del_init(&worker->entry);
+}
+
+/**
+ * find_worker_executing_work - find worker which is executing a work
+ * @pool: pool of interest
+ * @work: work to find worker for
+ *
+ * Find a worker which is executing @work on @pool by searching
+ * @pool->busy_hash which is keyed by the address of @work. For a worker
+ * to match, its current execution should match the address of @work and
+ * its work function. This is to avoid unwanted dependency between
+ * unrelated work executions through a work item being recycled while still
+ * being executed.
+ *
+ * This is a bit tricky. A work item may be freed once its execution
+ * starts and nothing prevents the freed area from being recycled for
+ * another work item. If the same work item address ends up being reused
+ * before the original execution finishes, workqueue will identify the
+ * recycled work item as currently executing and make it wait until the
+ * current execution finishes, introducing an unwanted dependency.
+ *
+ * This function checks the work item address and work function to avoid
+ * false positives. Note that this isn't complete as one may construct a
+ * work function which can introduce dependency onto itself through a
+ * recycled work item. Well, if somebody wants to shoot oneself in the
+ * foot that badly, there's only so much we can do, and if such deadlock
+ * actually occurs, it should be easy to locate the culprit work function.
+ *
+ * CONTEXT:
+ * raw_spin_lock_irq(pool->lock).
+ *
+ * Return:
+ * Pointer to worker which is executing @work if found, %NULL
+ * otherwise.
+ */
+static struct worker *find_worker_executing_work(struct worker_pool *pool,
+ struct work_struct *work)
+{
+ struct worker *worker;
+
+ hash_for_each_possible(pool->busy_hash, worker, hentry,
+ (unsigned long)work)
+ if (worker->current_work == work &&
+ worker->current_func == work->func)
+ return worker;
+
+ return NULL;
+}
+
+/**
+ * move_linked_works - move linked works to a list
+ * @work: start of series of works to be scheduled
+ * @head: target list to append @work to
+ * @nextp: out parameter for nested worklist walking
+ *
+ * Schedule linked works starting from @work to @head. Work series to be
+ * scheduled starts at @work and includes any consecutive work with
+ * WORK_STRUCT_LINKED set in its predecessor. See assign_work() for details on
+ * @nextp.
+ *
+ * CONTEXT:
+ * raw_spin_lock_irq(pool->lock).
+ */
+static void move_linked_works(struct work_struct *work, struct list_head *head,
+ struct work_struct **nextp)
+{
+ struct work_struct *n;
+
+ /*
+ * Linked worklist will always end before the end of the list,
+ * use NULL for list head.
+ */
+ list_for_each_entry_safe_from(work, n, NULL, entry) {
+ list_move_tail(&work->entry, head);
+ if (!(*work_data_bits(work) & WORK_STRUCT_LINKED))
+ break;
+ }
+
+ /*
+ * If we're already inside safe list traversal and have moved
+ * multiple works to the scheduled queue, the next position
+ * needs to be updated.
+ */
+ if (nextp)
+ *nextp = n;
+}
+
+/**
+ * assign_work - assign a work item and its linked work items to a worker
+ * @work: work to assign
+ * @worker: worker to assign to
+ * @nextp: out parameter for nested worklist walking
+ *
+ * Assign @work and its linked work items to @worker. If @work is already being
+ * executed by another worker in the same pool, it'll be punted there.
+ *
+ * If @nextp is not NULL, it's updated to point to the next work of the last
+ * scheduled work. This allows assign_work() to be nested inside
+ * list_for_each_entry_safe().
+ *
+ * Returns %true if @work was successfully assigned to @worker. %false if @work
+ * was punted to another worker already executing it.
+ */
+static bool assign_work(struct work_struct *work, struct worker *worker,
+ struct work_struct **nextp)
+{
+ struct worker_pool *pool = worker->pool;
+ struct worker *collision;
+
+ lockdep_assert_held(&pool->lock);
+
+ /*
+ * A single work shouldn't be executed concurrently by multiple workers.
+ * __queue_work() ensures that @work doesn't jump to a different pool
+ * while still running in the previous pool. Here, we should ensure that
+ * @work is not executed concurrently by multiple workers from the same
+ * pool. Check whether anyone is already processing the work. If so,
+ * defer the work to the currently executing one.
+ */
+ collision = find_worker_executing_work(pool, work);
+ if (unlikely(collision)) {
+ move_linked_works(work, &collision->scheduled, nextp);
+ return false;
+ }
+
+ move_linked_works(work, &worker->scheduled, nextp);
+ return true;
+}
+
+/**
+ * kick_pool - wake up an idle worker if necessary
+ * @pool: pool to kick
+ *
+ * @pool may have pending work items. Wake up worker if necessary. Returns
+ * whether a worker was woken up.
+ */
+static bool kick_pool(struct worker_pool *pool)
+{
+ struct worker *worker = first_idle_worker(pool);
+ struct task_struct *p;
+
+ lockdep_assert_held(&pool->lock);
+
+ if (!need_more_worker(pool) || !worker)
+ return false;
+
+ p = worker->task;
+
+#ifdef CONFIG_SMP
+ /*
+ * Idle @worker is about to execute @work and waking up provides an
+ * opportunity to migrate @worker at a lower cost by setting the task's
+ * wake_cpu field. Let's see if we want to move @worker to improve
+ * execution locality.
+ *
+ * We're waking the worker that went idle the latest and there's some
+ * chance that @worker is marked idle but hasn't gone off CPU yet. If
+ * so, setting the wake_cpu won't do anything. As this is a best-effort
+ * optimization and the race window is narrow, let's leave as-is for
+ * now. If this becomes pronounced, we can skip over workers which are
+ * still on cpu when picking an idle worker.
+ *
+ * If @pool has non-strict affinity, @worker might have ended up outside
+ * its affinity scope. Repatriate.
+ */
+ if (!pool->attrs->affn_strict &&
+ !cpumask_test_cpu(p->wake_cpu, pool->attrs->__pod_cpumask)) {
+ struct work_struct *work = list_first_entry(&pool->worklist,
+ struct work_struct, entry);
+ p->wake_cpu = cpumask_any_distribute(pool->attrs->__pod_cpumask);
+ get_work_pwq(work)->stats[PWQ_STAT_REPATRIATED]++;
+ }
+#endif
+ wake_up_process(p);
+ return true;
+}
+
#ifdef CONFIG_WQ_CPU_INTENSIVE_REPORT
/*
@@ -1120,10 +1308,9 @@ void wq_worker_sleeping(struct task_struct *task)
}
pool->nr_running--;
- if (need_more_worker(pool)) {
+ if (kick_pool(pool))
worker->current_pwq->stats[PWQ_STAT_CM_WAKEUP]++;
- wake_up_worker(pool);
- }
+
raw_spin_unlock_irq(&pool->lock);
}
@@ -1171,10 +1358,8 @@ void wq_worker_tick(struct task_struct *task)
wq_cpu_intensive_report(worker->current_func);
pwq->stats[PWQ_STAT_CPU_INTENSIVE]++;
- if (need_more_worker(pool)) {
+ if (kick_pool(pool))
pwq->stats[PWQ_STAT_CM_WAKEUP]++;
- wake_up_worker(pool);
- }
raw_spin_unlock(&pool->lock);
}
@@ -1211,94 +1396,6 @@ work_func_t wq_worker_last_func(struct task_struct *task)
}
/**
- * find_worker_executing_work - find worker which is executing a work
- * @pool: pool of interest
- * @work: work to find worker for
- *
- * Find a worker which is executing @work on @pool by searching
- * @pool->busy_hash which is keyed by the address of @work. For a worker
- * to match, its current execution should match the address of @work and
- * its work function. This is to avoid unwanted dependency between
- * unrelated work executions through a work item being recycled while still
- * being executed.
- *
- * This is a bit tricky. A work item may be freed once its execution
- * starts and nothing prevents the freed area from being recycled for
- * another work item. If the same work item address ends up being reused
- * before the original execution finishes, workqueue will identify the
- * recycled work item as currently executing and make it wait until the
- * current execution finishes, introducing an unwanted dependency.
- *
- * This function checks the work item address and work function to avoid
- * false positives. Note that this isn't complete as one may construct a
- * work function which can introduce dependency onto itself through a
- * recycled work item. Well, if somebody wants to shoot oneself in the
- * foot that badly, there's only so much we can do, and if such deadlock
- * actually occurs, it should be easy to locate the culprit work function.
- *
- * CONTEXT:
- * raw_spin_lock_irq(pool->lock).
- *
- * Return:
- * Pointer to worker which is executing @work if found, %NULL
- * otherwise.
- */
-static struct worker *find_worker_executing_work(struct worker_pool *pool,
- struct work_struct *work)
-{
- struct worker *worker;
-
- hash_for_each_possible(pool->busy_hash, worker, hentry,
- (unsigned long)work)
- if (worker->current_work == work &&
- worker->current_func == work->func)
- return worker;
-
- return NULL;
-}
-
-/**
- * move_linked_works - move linked works to a list
- * @work: start of series of works to be scheduled
- * @head: target list to append @work to
- * @nextp: out parameter for nested worklist walking
- *
- * Schedule linked works starting from @work to @head. Work series to
- * be scheduled starts at @work and includes any consecutive work with
- * WORK_STRUCT_LINKED set in its predecessor.
- *
- * If @nextp is not NULL, it's updated to point to the next work of
- * the last scheduled work. This allows move_linked_works() to be
- * nested inside outer list_for_each_entry_safe().
- *
- * CONTEXT:
- * raw_spin_lock_irq(pool->lock).
- */
-static void move_linked_works(struct work_struct *work, struct list_head *head,
- struct work_struct **nextp)
-{
- struct work_struct *n;
-
- /*
- * Linked worklist will always end before the end of the list,
- * use NULL for list head.
- */
- list_for_each_entry_safe_from(work, n, NULL, entry) {
- list_move_tail(&work->entry, head);
- if (!(*work_data_bits(work) & WORK_STRUCT_LINKED))
- break;
- }
-
- /*
- * If we're already inside safe list traversal and have moved
- * multiple works to the scheduled queue, the next position
- * needs to be updated.
- */
- if (nextp)
- *nextp = n;
-}
-
-/**
* get_pwq - get an extra reference on the specified pool_workqueue
* @pwq: pool_workqueue to get
*
@@ -1324,17 +1421,11 @@ static void put_pwq(struct pool_workqueue *pwq)
lockdep_assert_held(&pwq->pool->lock);
if (likely(--pwq->refcnt))
return;
- if (WARN_ON_ONCE(!(pwq->wq->flags & WQ_UNBOUND)))
- return;
/*
- * @pwq can't be released under pool->lock, bounce to
- * pwq_unbound_release_workfn(). This never recurses on the same
- * pool->lock as this path is taken only for unbound workqueues and
- * the release work item is scheduled on a per-cpu workqueue. To
- * avoid lockdep warning, unbound pool->locks are given lockdep
- * subclass of 1 in get_unbound_pool().
+ * @pwq can't be released under pool->lock, bounce to a dedicated
+ * kthread_worker to avoid A-A deadlocks.
*/
- schedule_work(&pwq->unbound_release_work);
+ kthread_queue_work(pwq_release_worker, &pwq->release_work);
}
/**
@@ -1550,7 +1641,7 @@ fail:
static void insert_work(struct pool_workqueue *pwq, struct work_struct *work,
struct list_head *head, unsigned int extra_flags)
{
- struct worker_pool *pool = pwq->pool;
+ debug_work_activate(work);
/* record the work call stack in order to print it in KASAN reports */
kasan_record_aux_stack_noalloc(work);
@@ -1559,9 +1650,6 @@ static void insert_work(struct pool_workqueue *pwq, struct work_struct *work,
set_work_pwq(work, pwq, extra_flags);
list_add_tail(&work->entry, head);
get_pwq(pwq);
-
- if (__need_more_worker(pool))
- wake_up_worker(pool);
}
/*
@@ -1615,8 +1703,7 @@ static void __queue_work(int cpu, struct workqueue_struct *wq,
struct work_struct *work)
{
struct pool_workqueue *pwq;
- struct worker_pool *last_pool;
- struct list_head *worklist;
+ struct worker_pool *last_pool, *pool;
unsigned int work_flags;
unsigned int req_cpu = cpu;
@@ -1640,23 +1727,23 @@ static void __queue_work(int cpu, struct workqueue_struct *wq,
rcu_read_lock();
retry:
/* pwq which will be used unless @work is executing elsewhere */
- if (wq->flags & WQ_UNBOUND) {
- if (req_cpu == WORK_CPU_UNBOUND)
+ if (req_cpu == WORK_CPU_UNBOUND) {
+ if (wq->flags & WQ_UNBOUND)
cpu = wq_select_unbound_cpu(raw_smp_processor_id());
- pwq = unbound_pwq_by_node(wq, cpu_to_node(cpu));
- } else {
- if (req_cpu == WORK_CPU_UNBOUND)
+ else
cpu = raw_smp_processor_id();
- pwq = per_cpu_ptr(wq->cpu_pwqs, cpu);
}
+ pwq = rcu_dereference(*per_cpu_ptr(wq->cpu_pwq, cpu));
+ pool = pwq->pool;
+
/*
* If @work was previously on a different pool, it might still be
* running there, in which case the work needs to be queued on that
* pool to guarantee non-reentrancy.
*/
last_pool = get_work_pool(work);
- if (last_pool && last_pool != pwq->pool) {
+ if (last_pool && last_pool != pool) {
struct worker *worker;
raw_spin_lock(&last_pool->lock);
@@ -1665,26 +1752,27 @@ retry:
if (worker && worker->current_pwq->wq == wq) {
pwq = worker->current_pwq;
+ pool = pwq->pool;
+ WARN_ON_ONCE(pool != last_pool);
} else {
/* meh... not running there, queue here */
raw_spin_unlock(&last_pool->lock);
- raw_spin_lock(&pwq->pool->lock);
+ raw_spin_lock(&pool->lock);
}
} else {
- raw_spin_lock(&pwq->pool->lock);
+ raw_spin_lock(&pool->lock);
}
/*
- * pwq is determined and locked. For unbound pools, we could have
- * raced with pwq release and it could already be dead. If its
- * refcnt is zero, repeat pwq selection. Note that pwqs never die
- * without another pwq replacing it in the numa_pwq_tbl or while
- * work items are executing on it, so the retrying is guaranteed to
- * make forward-progress.
+ * pwq is determined and locked. For unbound pools, we could have raced
+ * with pwq release and it could already be dead. If its refcnt is zero,
+ * repeat pwq selection. Note that unbound pwqs never die without
+ * another pwq replacing it in cpu_pwq or while work items are executing
+ * on it, so the retrying is guaranteed to make forward-progress.
*/
if (unlikely(!pwq->refcnt)) {
if (wq->flags & WQ_UNBOUND) {
- raw_spin_unlock(&pwq->pool->lock);
+ raw_spin_unlock(&pool->lock);
cpu_relax();
goto retry;
}
@@ -1703,21 +1791,20 @@ retry:
work_flags = work_color_to_flags(pwq->work_color);
if (likely(pwq->nr_active < pwq->max_active)) {
+ if (list_empty(&pool->worklist))
+ pool->watchdog_ts = jiffies;
+
trace_workqueue_activate_work(work);
pwq->nr_active++;
- worklist = &pwq->pool->worklist;
- if (list_empty(worklist))
- pwq->pool->watchdog_ts = jiffies;
+ insert_work(pwq, work, &pool->worklist, work_flags);
+ kick_pool(pool);
} else {
work_flags |= WORK_STRUCT_INACTIVE;
- worklist = &pwq->inactive_works;
+ insert_work(pwq, work, &pwq->inactive_works, work_flags);
}
- debug_work_activate(work);
- insert_work(pwq, work, worklist, work_flags);
-
out:
- raw_spin_unlock(&pwq->pool->lock);
+ raw_spin_unlock(&pool->lock);
rcu_read_unlock();
}
@@ -1754,7 +1841,7 @@ bool queue_work_on(int cpu, struct workqueue_struct *wq,
EXPORT_SYMBOL(queue_work_on);
/**
- * workqueue_select_cpu_near - Select a CPU based on NUMA node
+ * select_numa_node_cpu - Select a CPU based on NUMA node
* @node: NUMA node ID that we want to select a CPU from
*
* This function will attempt to find a "random" cpu available on a given
@@ -1762,14 +1849,10 @@ EXPORT_SYMBOL(queue_work_on);
* WORK_CPU_UNBOUND indicating that we should just schedule to any
* available CPU if we need to schedule this work.
*/
-static int workqueue_select_cpu_near(int node)
+static int select_numa_node_cpu(int node)
{
int cpu;
- /* No point in doing this if NUMA isn't enabled for workqueues */
- if (!wq_numa_enabled)
- return WORK_CPU_UNBOUND;
-
/* Delay binding to CPU if node is not valid or online */
if (node < 0 || node >= MAX_NUMNODES || !node_online(node))
return WORK_CPU_UNBOUND;
@@ -1826,7 +1909,7 @@ bool queue_work_node(int node, struct workqueue_struct *wq,
local_irq_save(flags);
if (!test_and_set_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(work))) {
- int cpu = workqueue_select_cpu_near(node);
+ int cpu = select_numa_node_cpu(node);
__queue_work(cpu, wq, work);
ret = true;
@@ -1981,60 +2064,6 @@ bool queue_rcu_work(struct workqueue_struct *wq, struct rcu_work *rwork)
}
EXPORT_SYMBOL(queue_rcu_work);
-/**
- * worker_enter_idle - enter idle state
- * @worker: worker which is entering idle state
- *
- * @worker is entering idle state. Update stats and idle timer if
- * necessary.
- *
- * LOCKING:
- * raw_spin_lock_irq(pool->lock).
- */
-static void worker_enter_idle(struct worker *worker)
-{
- struct worker_pool *pool = worker->pool;
-
- if (WARN_ON_ONCE(worker->flags & WORKER_IDLE) ||
- WARN_ON_ONCE(!list_empty(&worker->entry) &&
- (worker->hentry.next || worker->hentry.pprev)))
- return;
-
- /* can't use worker_set_flags(), also called from create_worker() */
- worker->flags |= WORKER_IDLE;
- pool->nr_idle++;
- worker->last_active = jiffies;
-
- /* idle_list is LIFO */
- list_add(&worker->entry, &pool->idle_list);
-
- if (too_many_workers(pool) && !timer_pending(&pool->idle_timer))
- mod_timer(&pool->idle_timer, jiffies + IDLE_WORKER_TIMEOUT);
-
- /* Sanity check nr_running. */
- WARN_ON_ONCE(pool->nr_workers == pool->nr_idle && pool->nr_running);
-}
-
-/**
- * worker_leave_idle - leave idle state
- * @worker: worker which is leaving idle state
- *
- * @worker is leaving idle state. Update stats.
- *
- * LOCKING:
- * raw_spin_lock_irq(pool->lock).
- */
-static void worker_leave_idle(struct worker *worker)
-{
- struct worker_pool *pool = worker->pool;
-
- if (WARN_ON_ONCE(!(worker->flags & WORKER_IDLE)))
- return;
- worker_clr_flags(worker, WORKER_IDLE);
- pool->nr_idle--;
- list_del_init(&worker->entry);
-}
-
static struct worker *alloc_worker(int node)
{
struct worker *worker;
@@ -2050,6 +2079,14 @@ static struct worker *alloc_worker(int node)
return worker;
}
+static cpumask_t *pool_allowed_cpus(struct worker_pool *pool)
+{
+ if (pool->cpu < 0 && pool->attrs->affn_strict)
+ return pool->attrs->__pod_cpumask;
+ else
+ return pool->attrs->cpumask;
+}
+
/**
* worker_attach_to_pool() - attach a worker to a pool
* @worker: worker to be attached
@@ -2075,7 +2112,7 @@ static void worker_attach_to_pool(struct worker *worker,
kthread_set_per_cpu(worker->task, pool->cpu);
if (worker->rescue_wq)
- set_cpus_allowed_ptr(worker->task, pool->attrs->cpumask);
+ set_cpus_allowed_ptr(worker->task, pool_allowed_cpus(pool));
list_add_tail(&worker->node, &pool->workers);
worker->pool = pool;
@@ -2167,16 +2204,25 @@ static struct worker *create_worker(struct worker_pool *pool)
}
set_user_nice(worker->task, pool->attrs->nice);
- kthread_bind_mask(worker->task, pool->attrs->cpumask);
+ kthread_bind_mask(worker->task, pool_allowed_cpus(pool));
/* successful, attach the worker to the pool */
worker_attach_to_pool(worker, pool);
/* start the newly created worker */
raw_spin_lock_irq(&pool->lock);
+
worker->pool->nr_workers++;
worker_enter_idle(worker);
+ kick_pool(pool);
+
+ /*
+ * @worker is waiting on a completion in kthread() and will trigger hung
+ * check if not woken up soon. As kick_pool() might not have woken it
+ * up, wake it up explicitly once more.
+ */
wake_up_process(worker->task);
+
raw_spin_unlock_irq(&pool->lock);
return worker;
@@ -2304,9 +2350,8 @@ static void idle_worker_timeout(struct timer_list *t)
static void idle_cull_fn(struct work_struct *work)
{
struct worker_pool *pool = container_of(work, struct worker_pool, idle_cull_work);
- struct list_head cull_list;
+ LIST_HEAD(cull_list);
- INIT_LIST_HEAD(&cull_list);
/*
* Grabbing wq_pool_attach_mutex here ensures an already-running worker
* cannot proceed beyong worker_detach_from_pool() in its self-destruct
@@ -2495,7 +2540,6 @@ __acquires(&pool->lock)
struct pool_workqueue *pwq = get_work_pwq(work);
struct worker_pool *pool = worker->pool;
unsigned long work_data;
- struct worker *collision;
#ifdef CONFIG_LOCKDEP
/*
* It is permissible to free the struct work_struct from
@@ -2512,18 +2556,6 @@ __acquires(&pool->lock)
WARN_ON_ONCE(!(pool->flags & POOL_DISASSOCIATED) &&
raw_smp_processor_id() != pool->cpu);
- /*
- * A single work shouldn't be executed concurrently by
- * multiple workers on a single cpu. Check whether anyone is
- * already processing the work. If so, defer the work to the
- * currently executing one.
- */
- collision = find_worker_executing_work(pool, work);
- if (unlikely(collision)) {
- move_linked_works(work, &collision->scheduled, NULL);
- return;
- }
-
/* claim and dequeue */
debug_work_deactivate(work);
hash_add(pool->busy_hash, &worker->hentry, (unsigned long)work);
@@ -2552,14 +2584,12 @@ __acquires(&pool->lock)
worker_set_flags(worker, WORKER_CPU_INTENSIVE);
/*
- * Wake up another worker if necessary. The condition is always
- * false for normal per-cpu workers since nr_running would always
- * be >= 1 at this point. This is used to chain execution of the
- * pending work items for WORKER_NOT_RUNNING workers such as the
- * UNBOUND and CPU_INTENSIVE ones.
+ * Kick @pool if necessary. It's always noop for per-cpu worker pools
+ * since nr_running would always be >= 1 at this point. This is used to
+ * chain execution of the pending work items for WORKER_NOT_RUNNING
+ * workers such as the UNBOUND and CPU_INTENSIVE ones.
*/
- if (need_more_worker(pool))
- wake_up_worker(pool);
+ kick_pool(pool);
/*
* Record the last pool and clear PENDING which should be the last
@@ -2569,6 +2599,7 @@ __acquires(&pool->lock)
*/
set_work_pool_and_clear_pending(work, pool->id);
+ pwq->stats[PWQ_STAT_STARTED]++;
raw_spin_unlock_irq(&pool->lock);
lock_map_acquire(&pwq->wq->lockdep_map);
@@ -2595,7 +2626,6 @@ __acquires(&pool->lock)
* workqueues), so hiding them isn't a problem.
*/
lockdep_invariant_state(true);
- pwq->stats[PWQ_STAT_STARTED]++;
trace_workqueue_execute_start(work);
worker->current_func(work);
/*
@@ -2661,9 +2691,15 @@ __acquires(&pool->lock)
*/
static void process_scheduled_works(struct worker *worker)
{
- while (!list_empty(&worker->scheduled)) {
- struct work_struct *work = list_first_entry(&worker->scheduled,
- struct work_struct, entry);
+ struct work_struct *work;
+ bool first = true;
+
+ while ((work = list_first_entry_or_null(&worker->scheduled,
+ struct work_struct, entry))) {
+ if (first) {
+ worker->pool->watchdog_ts = jiffies;
+ first = false;
+ }
process_one_work(worker, work);
}
}
@@ -2744,17 +2780,8 @@ recheck:
list_first_entry(&pool->worklist,
struct work_struct, entry);
- pool->watchdog_ts = jiffies;
-
- if (likely(!(*work_data_bits(work) & WORK_STRUCT_LINKED))) {
- /* optimization path, not strictly necessary */
- process_one_work(worker, work);
- if (unlikely(!list_empty(&worker->scheduled)))
- process_scheduled_works(worker);
- } else {
- move_linked_works(work, &worker->scheduled, NULL);
+ if (assign_work(work, worker, NULL))
process_scheduled_works(worker);
- }
} while (keep_working(pool));
worker_set_flags(worker, WORKER_PREP);
@@ -2798,7 +2825,6 @@ static int rescuer_thread(void *__rescuer)
{
struct worker *rescuer = __rescuer;
struct workqueue_struct *wq = rescuer->rescue_wq;
- struct list_head *scheduled = &rescuer->scheduled;
bool should_stop;
set_user_nice(current, RESCUER_NICE_LEVEL);
@@ -2829,7 +2855,6 @@ repeat:
struct pool_workqueue, mayday_node);
struct worker_pool *pool = pwq->pool;
struct work_struct *work, *n;
- bool first = true;
__set_current_state(TASK_RUNNING);
list_del_init(&pwq->mayday_node);
@@ -2844,18 +2869,14 @@ repeat:
* Slurp in all works issued via this workqueue and
* process'em.
*/
- WARN_ON_ONCE(!list_empty(scheduled));
+ WARN_ON_ONCE(!list_empty(&rescuer->scheduled));
list_for_each_entry_safe(work, n, &pool->worklist, entry) {
- if (get_work_pwq(work) == pwq) {
- if (first)
- pool->watchdog_ts = jiffies;
- move_linked_works(work, scheduled, &n);
+ if (get_work_pwq(work) == pwq &&
+ assign_work(work, rescuer, &n))
pwq->stats[PWQ_STAT_RESCUED]++;
- }
- first = false;
}
- if (!list_empty(scheduled)) {
+ if (!list_empty(&rescuer->scheduled)) {
process_scheduled_works(rescuer);
/*
@@ -2888,12 +2909,10 @@ repeat:
put_pwq(pwq);
/*
- * Leave this pool. If need_more_worker() is %true, notify a
- * regular worker; otherwise, we end up with 0 concurrency
- * and stalling the execution.
+ * Leave this pool. Notify regular workers; otherwise, we end up
+ * with 0 concurrency and stalling the execution.
*/
- if (need_more_worker(pool))
- wake_up_worker(pool);
+ kick_pool(pool);
raw_spin_unlock_irq(&pool->lock);
@@ -3028,7 +3047,6 @@ static void insert_wq_barrier(struct pool_workqueue *pwq,
pwq->nr_in_flight[work_color]++;
work_flags |= work_color_to_flags(work_color);
- debug_work_activate(&barr->work);
insert_work(pwq, &barr->work, head, work_flags);
}
@@ -3691,6 +3709,7 @@ void free_workqueue_attrs(struct workqueue_attrs *attrs)
{
if (attrs) {
free_cpumask_var(attrs->cpumask);
+ free_cpumask_var(attrs->__pod_cpumask);
kfree(attrs);
}
}
@@ -3712,8 +3731,11 @@ struct workqueue_attrs *alloc_workqueue_attrs(void)
goto fail;
if (!alloc_cpumask_var(&attrs->cpumask, GFP_KERNEL))
goto fail;
+ if (!alloc_cpumask_var(&attrs->__pod_cpumask, GFP_KERNEL))
+ goto fail;
cpumask_copy(attrs->cpumask, cpu_possible_mask);
+ attrs->affn_scope = WQ_AFFN_DFL;
return attrs;
fail:
free_workqueue_attrs(attrs);
@@ -3725,12 +3747,26 @@ static void copy_workqueue_attrs(struct workqueue_attrs *to,
{
to->nice = from->nice;
cpumask_copy(to->cpumask, from->cpumask);
+ cpumask_copy(to->__pod_cpumask, from->__pod_cpumask);
+ to->affn_strict = from->affn_strict;
+
/*
- * Unlike hash and equality test, this function doesn't ignore
- * ->no_numa as it is used for both pool and wq attrs. Instead,
- * get_unbound_pool() explicitly clears ->no_numa after copying.
+ * Unlike hash and equality test, copying shouldn't ignore wq-only
+ * fields as copying is used for both pool and wq attrs. Instead,
+ * get_unbound_pool() explicitly clears the fields.
*/
- to->no_numa = from->no_numa;
+ to->affn_scope = from->affn_scope;
+ to->ordered = from->ordered;
+}
+
+/*
+ * Some attrs fields are workqueue-only. Clear them for worker_pool's. See the
+ * comments in 'struct workqueue_attrs' definition.
+ */
+static void wqattrs_clear_for_pool(struct workqueue_attrs *attrs)
+{
+ attrs->affn_scope = WQ_AFFN_NR_TYPES;
+ attrs->ordered = false;
}
/* hash value of the content of @attr */
@@ -3741,6 +3777,9 @@ static u32 wqattrs_hash(const struct workqueue_attrs *attrs)
hash = jhash_1word(attrs->nice, hash);
hash = jhash(cpumask_bits(attrs->cpumask),
BITS_TO_LONGS(nr_cpumask_bits) * sizeof(long), hash);
+ hash = jhash(cpumask_bits(attrs->__pod_cpumask),
+ BITS_TO_LONGS(nr_cpumask_bits) * sizeof(long), hash);
+ hash = jhash_1word(attrs->affn_strict, hash);
return hash;
}
@@ -3752,9 +3791,57 @@ static bool wqattrs_equal(const struct workqueue_attrs *a,
return false;
if (!cpumask_equal(a->cpumask, b->cpumask))
return false;
+ if (!cpumask_equal(a->__pod_cpumask, b->__pod_cpumask))
+ return false;
+ if (a->affn_strict != b->affn_strict)
+ return false;
return true;
}
+/* Update @attrs with actually available CPUs */
+static void wqattrs_actualize_cpumask(struct workqueue_attrs *attrs,
+ const cpumask_t *unbound_cpumask)
+{
+ /*
+ * Calculate the effective CPU mask of @attrs given @unbound_cpumask. If
+ * @attrs->cpumask doesn't overlap with @unbound_cpumask, we fallback to
+ * @unbound_cpumask.
+ */
+ cpumask_and(attrs->cpumask, attrs->cpumask, unbound_cpumask);
+ if (unlikely(cpumask_empty(attrs->cpumask)))
+ cpumask_copy(attrs->cpumask, unbound_cpumask);
+}
+
+/* find wq_pod_type to use for @attrs */
+static const struct wq_pod_type *
+wqattrs_pod_type(const struct workqueue_attrs *attrs)
+{
+ enum wq_affn_scope scope;
+ struct wq_pod_type *pt;
+
+ /* to synchronize access to wq_affn_dfl */
+ lockdep_assert_held(&wq_pool_mutex);
+
+ if (attrs->affn_scope == WQ_AFFN_DFL)
+ scope = wq_affn_dfl;
+ else
+ scope = attrs->affn_scope;
+
+ pt = &wq_pod_types[scope];
+
+ if (!WARN_ON_ONCE(attrs->affn_scope == WQ_AFFN_NR_TYPES) &&
+ likely(pt->nr_pods))
+ return pt;
+
+ /*
+ * Before workqueue_init_topology(), only SYSTEM is available which is
+ * initialized in workqueue_init_early().
+ */
+ pt = &wq_pod_types[WQ_AFFN_SYSTEM];
+ BUG_ON(!pt->nr_pods);
+ return pt;
+}
+
/**
* init_worker_pool - initialize a newly zalloc'd worker_pool
* @pool: worker_pool to initialize
@@ -3793,6 +3880,9 @@ static int init_worker_pool(struct worker_pool *pool)
pool->attrs = alloc_workqueue_attrs();
if (!pool->attrs)
return -ENOMEM;
+
+ wqattrs_clear_for_pool(pool->attrs);
+
return 0;
}
@@ -3840,12 +3930,8 @@ static void rcu_free_wq(struct rcu_head *rcu)
container_of(rcu, struct workqueue_struct, rcu);
wq_free_lockdep(wq);
-
- if (!(wq->flags & WQ_UNBOUND))
- free_percpu(wq->cpu_pwqs);
- else
- free_workqueue_attrs(wq->unbound_attrs);
-
+ free_percpu(wq->cpu_pwq);
+ free_workqueue_attrs(wq->unbound_attrs);
kfree(wq);
}
@@ -3872,10 +3958,8 @@ static void rcu_free_pool(struct rcu_head *rcu)
static void put_unbound_pool(struct worker_pool *pool)
{
DECLARE_COMPLETION_ONSTACK(detach_completion);
- struct list_head cull_list;
struct worker *worker;
-
- INIT_LIST_HEAD(&cull_list);
+ LIST_HEAD(cull_list);
lockdep_assert_held(&wq_pool_mutex);
@@ -3959,10 +4043,10 @@ static void put_unbound_pool(struct worker_pool *pool)
*/
static struct worker_pool *get_unbound_pool(const struct workqueue_attrs *attrs)
{
+ struct wq_pod_type *pt = &wq_pod_types[WQ_AFFN_NUMA];
u32 hash = wqattrs_hash(attrs);
struct worker_pool *pool;
- int node;
- int target_node = NUMA_NO_NODE;
+ int pod, node = NUMA_NO_NODE;
lockdep_assert_held(&wq_pool_mutex);
@@ -3974,31 +4058,22 @@ static struct worker_pool *get_unbound_pool(const struct workqueue_attrs *attrs)
}
}
- /* if cpumask is contained inside a NUMA node, we belong to that node */
- if (wq_numa_enabled) {
- for_each_node(node) {
- if (cpumask_subset(attrs->cpumask,
- wq_numa_possible_cpumask[node])) {
- target_node = node;
- break;
- }
+ /* If __pod_cpumask is contained inside a NUMA pod, that's our node */
+ for (pod = 0; pod < pt->nr_pods; pod++) {
+ if (cpumask_subset(attrs->__pod_cpumask, pt->pod_cpus[pod])) {
+ node = pt->pod_node[pod];
+ break;
}
}
/* nope, create a new one */
- pool = kzalloc_node(sizeof(*pool), GFP_KERNEL, target_node);
+ pool = kzalloc_node(sizeof(*pool), GFP_KERNEL, node);
if (!pool || init_worker_pool(pool) < 0)
goto fail;
- lockdep_set_subclass(&pool->lock, 1); /* see put_pwq() */
+ pool->node = node;
copy_workqueue_attrs(pool->attrs, attrs);
- pool->node = target_node;
-
- /*
- * no_numa isn't a worker_pool attribute, always clear it. See
- * 'struct workqueue_attrs' comments for detail.
- */
- pool->attrs->no_numa = false;
+ wqattrs_clear_for_pool(pool->attrs);
if (worker_pool_assign_id(pool) < 0)
goto fail;
@@ -4024,34 +4099,33 @@ static void rcu_free_pwq(struct rcu_head *rcu)
}
/*
- * Scheduled on system_wq by put_pwq() when an unbound pwq hits zero refcnt
- * and needs to be destroyed.
+ * Scheduled on pwq_release_worker by put_pwq() when an unbound pwq hits zero
+ * refcnt and needs to be destroyed.
*/
-static void pwq_unbound_release_workfn(struct work_struct *work)
+static void pwq_release_workfn(struct kthread_work *work)
{
struct pool_workqueue *pwq = container_of(work, struct pool_workqueue,
- unbound_release_work);
+ release_work);
struct workqueue_struct *wq = pwq->wq;
struct worker_pool *pool = pwq->pool;
bool is_last = false;
/*
- * when @pwq is not linked, it doesn't hold any reference to the
+ * When @pwq is not linked, it doesn't hold any reference to the
* @wq, and @wq is invalid to access.
*/
if (!list_empty(&pwq->pwqs_node)) {
- if (WARN_ON_ONCE(!(wq->flags & WQ_UNBOUND)))
- return;
-
mutex_lock(&wq->mutex);
list_del_rcu(&pwq->pwqs_node);
is_last = list_empty(&wq->pwqs);
mutex_unlock(&wq->mutex);
}
- mutex_lock(&wq_pool_mutex);
- put_unbound_pool(pool);
- mutex_unlock(&wq_pool_mutex);
+ if (wq->flags & WQ_UNBOUND) {
+ mutex_lock(&wq_pool_mutex);
+ put_unbound_pool(pool);
+ mutex_unlock(&wq_pool_mutex);
+ }
call_rcu(&pwq->rcu, rcu_free_pwq);
@@ -4095,24 +4169,13 @@ static void pwq_adjust_max_active(struct pool_workqueue *pwq)
* is updated and visible.
*/
if (!freezable || !workqueue_freezing) {
- bool kick = false;
-
pwq->max_active = wq->saved_max_active;
while (!list_empty(&pwq->inactive_works) &&
- pwq->nr_active < pwq->max_active) {
+ pwq->nr_active < pwq->max_active)
pwq_activate_first_inactive(pwq);
- kick = true;
- }
- /*
- * Need to kick a worker after thawed or an unbound wq's
- * max_active is bumped. In realtime scenarios, always kicking a
- * worker will cause interference on the isolated cpu cores, so
- * let's kick iff work items were activated.
- */
- if (kick)
- wake_up_worker(pwq->pool);
+ kick_pool(pwq->pool);
} else {
pwq->max_active = 0;
}
@@ -4135,7 +4198,7 @@ static void init_pwq(struct pool_workqueue *pwq, struct workqueue_struct *wq,
INIT_LIST_HEAD(&pwq->inactive_works);
INIT_LIST_HEAD(&pwq->pwqs_node);
INIT_LIST_HEAD(&pwq->mayday_node);
- INIT_WORK(&pwq->unbound_release_work, pwq_unbound_release_workfn);
+ kthread_init_work(&pwq->release_work, pwq_release_workfn);
}
/* sync @pwq with the current state of its associated wq and link it */
@@ -4183,61 +4246,49 @@ static struct pool_workqueue *alloc_unbound_pwq(struct workqueue_struct *wq,
}
/**
- * wq_calc_node_cpumask - calculate a wq_attrs' cpumask for the specified node
+ * wq_calc_pod_cpumask - calculate a wq_attrs' cpumask for a pod
* @attrs: the wq_attrs of the default pwq of the target workqueue
- * @node: the target NUMA node
+ * @cpu: the target CPU
* @cpu_going_down: if >= 0, the CPU to consider as offline
- * @cpumask: outarg, the resulting cpumask
- *
- * Calculate the cpumask a workqueue with @attrs should use on @node. If
- * @cpu_going_down is >= 0, that cpu is considered offline during
- * calculation. The result is stored in @cpumask.
*
- * If NUMA affinity is not enabled, @attrs->cpumask is always used. If
- * enabled and @node has online CPUs requested by @attrs, the returned
- * cpumask is the intersection of the possible CPUs of @node and
- * @attrs->cpumask.
+ * Calculate the cpumask a workqueue with @attrs should use on the pod that
+ * @cpu belongs to. If @cpu_going_down is >= 0, that cpu is considered offline
+ * during calculation. The result is stored in @attrs->__pod_cpumask.
*
- * The caller is responsible for ensuring that the cpumask of @node stays
- * stable.
+ * If pod affinity is not enabled, @attrs->cpumask is always used. If enabled
+ * and the pod of @cpu has online CPUs requested by @attrs, the result is the
+ * intersection of the possible CPUs of that pod and @attrs->cpumask.
*
- * Return: %true if the resulting @cpumask is different from @attrs->cpumask,
- * %false if equal.
+ * The caller is responsible for ensuring that the cpumask of @pod stays stable.
*/
-static bool wq_calc_node_cpumask(const struct workqueue_attrs *attrs, int node,
- int cpu_going_down, cpumask_t *cpumask)
+static void wq_calc_pod_cpumask(struct workqueue_attrs *attrs, int cpu,
+ int cpu_going_down)
{
- if (!wq_numa_enabled || attrs->no_numa)
- goto use_dfl;
+ const struct wq_pod_type *pt = wqattrs_pod_type(attrs);
+ int pod = pt->cpu_pod[cpu];
- /* does @node have any online CPUs @attrs wants? */
- cpumask_and(cpumask, cpumask_of_node(node), attrs->cpumask);
+ /* does @pod have any online CPUs @attrs wants? */
+ cpumask_and(attrs->__pod_cpumask, pt->pod_cpus[pod], attrs->cpumask);
+ cpumask_and(attrs->__pod_cpumask, attrs->__pod_cpumask, cpu_online_mask);
if (cpu_going_down >= 0)
- cpumask_clear_cpu(cpu_going_down, cpumask);
+ cpumask_clear_cpu(cpu_going_down, attrs->__pod_cpumask);
- if (cpumask_empty(cpumask))
- goto use_dfl;
+ if (cpumask_empty(attrs->__pod_cpumask)) {
+ cpumask_copy(attrs->__pod_cpumask, attrs->cpumask);
+ return;
+ }
- /* yeap, return possible CPUs in @node that @attrs wants */
- cpumask_and(cpumask, attrs->cpumask, wq_numa_possible_cpumask[node]);
+ /* yeap, return possible CPUs in @pod that @attrs wants */
+ cpumask_and(attrs->__pod_cpumask, attrs->cpumask, pt->pod_cpus[pod]);
- if (cpumask_empty(cpumask)) {
+ if (cpumask_empty(attrs->__pod_cpumask))
pr_warn_once("WARNING: workqueue cpumask: online intersect > "
"possible intersect\n");
- return false;
- }
-
- return !cpumask_equal(cpumask, attrs->cpumask);
-
-use_dfl:
- cpumask_copy(cpumask, attrs->cpumask);
- return false;
}
-/* install @pwq into @wq's numa_pwq_tbl[] for @node and return the old pwq */
-static struct pool_workqueue *numa_pwq_tbl_install(struct workqueue_struct *wq,
- int node,
- struct pool_workqueue *pwq)
+/* install @pwq into @wq's cpu_pwq and return the old pwq */
+static struct pool_workqueue *install_unbound_pwq(struct workqueue_struct *wq,
+ int cpu, struct pool_workqueue *pwq)
{
struct pool_workqueue *old_pwq;
@@ -4247,8 +4298,8 @@ static struct pool_workqueue *numa_pwq_tbl_install(struct workqueue_struct *wq,
/* link_pwq() can handle duplicate calls */
link_pwq(pwq);
- old_pwq = rcu_access_pointer(wq->numa_pwq_tbl[node]);
- rcu_assign_pointer(wq->numa_pwq_tbl[node], pwq);
+ old_pwq = rcu_access_pointer(*per_cpu_ptr(wq->cpu_pwq, cpu));
+ rcu_assign_pointer(*per_cpu_ptr(wq->cpu_pwq, cpu), pwq);
return old_pwq;
}
@@ -4265,10 +4316,10 @@ struct apply_wqattrs_ctx {
static void apply_wqattrs_cleanup(struct apply_wqattrs_ctx *ctx)
{
if (ctx) {
- int node;
+ int cpu;
- for_each_node(node)
- put_pwq_unlocked(ctx->pwq_tbl[node]);
+ for_each_possible_cpu(cpu)
+ put_pwq_unlocked(ctx->pwq_tbl[cpu]);
put_pwq_unlocked(ctx->dfl_pwq);
free_workqueue_attrs(ctx->attrs);
@@ -4284,76 +4335,64 @@ apply_wqattrs_prepare(struct workqueue_struct *wq,
const cpumask_var_t unbound_cpumask)
{
struct apply_wqattrs_ctx *ctx;
- struct workqueue_attrs *new_attrs, *tmp_attrs;
- int node;
+ struct workqueue_attrs *new_attrs;
+ int cpu;
lockdep_assert_held(&wq_pool_mutex);
- ctx = kzalloc(struct_size(ctx, pwq_tbl, nr_node_ids), GFP_KERNEL);
+ if (WARN_ON(attrs->affn_scope < 0 ||
+ attrs->affn_scope >= WQ_AFFN_NR_TYPES))
+ return ERR_PTR(-EINVAL);
+
+ ctx = kzalloc(struct_size(ctx, pwq_tbl, nr_cpu_ids), GFP_KERNEL);
new_attrs = alloc_workqueue_attrs();
- tmp_attrs = alloc_workqueue_attrs();
- if (!ctx || !new_attrs || !tmp_attrs)
+ if (!ctx || !new_attrs)
goto out_free;
/*
- * Calculate the attrs of the default pwq with unbound_cpumask
- * which is wq_unbound_cpumask or to set to wq_unbound_cpumask.
- * If the user configured cpumask doesn't overlap with the
- * wq_unbound_cpumask, we fallback to the wq_unbound_cpumask.
- */
- copy_workqueue_attrs(new_attrs, attrs);
- cpumask_and(new_attrs->cpumask, new_attrs->cpumask, unbound_cpumask);
- if (unlikely(cpumask_empty(new_attrs->cpumask)))
- cpumask_copy(new_attrs->cpumask, unbound_cpumask);
-
- /*
- * We may create multiple pwqs with differing cpumasks. Make a
- * copy of @new_attrs which will be modified and used to obtain
- * pools.
- */
- copy_workqueue_attrs(tmp_attrs, new_attrs);
-
- /*
* If something goes wrong during CPU up/down, we'll fall back to
* the default pwq covering whole @attrs->cpumask. Always create
* it even if we don't use it immediately.
*/
+ copy_workqueue_attrs(new_attrs, attrs);
+ wqattrs_actualize_cpumask(new_attrs, unbound_cpumask);
+ cpumask_copy(new_attrs->__pod_cpumask, new_attrs->cpumask);
ctx->dfl_pwq = alloc_unbound_pwq(wq, new_attrs);
if (!ctx->dfl_pwq)
goto out_free;
- for_each_node(node) {
- if (wq_calc_node_cpumask(new_attrs, node, -1, tmp_attrs->cpumask)) {
- ctx->pwq_tbl[node] = alloc_unbound_pwq(wq, tmp_attrs);
- if (!ctx->pwq_tbl[node])
- goto out_free;
- } else {
+ for_each_possible_cpu(cpu) {
+ if (new_attrs->ordered) {
ctx->dfl_pwq->refcnt++;
- ctx->pwq_tbl[node] = ctx->dfl_pwq;
+ ctx->pwq_tbl[cpu] = ctx->dfl_pwq;
+ } else {
+ wq_calc_pod_cpumask(new_attrs, cpu, -1);
+ ctx->pwq_tbl[cpu] = alloc_unbound_pwq(wq, new_attrs);
+ if (!ctx->pwq_tbl[cpu])
+ goto out_free;
}
}
/* save the user configured attrs and sanitize it. */
copy_workqueue_attrs(new_attrs, attrs);
cpumask_and(new_attrs->cpumask, new_attrs->cpumask, cpu_possible_mask);
+ cpumask_copy(new_attrs->__pod_cpumask, new_attrs->cpumask);
ctx->attrs = new_attrs;
ctx->wq = wq;
- free_workqueue_attrs(tmp_attrs);
return ctx;
out_free:
- free_workqueue_attrs(tmp_attrs);
free_workqueue_attrs(new_attrs);
apply_wqattrs_cleanup(ctx);
- return NULL;
+ return ERR_PTR(-ENOMEM);
}
/* set attrs and install prepared pwqs, @ctx points to old pwqs on return */
static void apply_wqattrs_commit(struct apply_wqattrs_ctx *ctx)
{
- int node;
+ int cpu;
/* all pwqs have been created successfully, let's install'em */
mutex_lock(&ctx->wq->mutex);
@@ -4361,9 +4400,9 @@ static void apply_wqattrs_commit(struct apply_wqattrs_ctx *ctx)
copy_workqueue_attrs(ctx->wq->unbound_attrs, ctx->attrs);
/* save the previous pwq and install the new one */
- for_each_node(node)
- ctx->pwq_tbl[node] = numa_pwq_tbl_install(ctx->wq, node,
- ctx->pwq_tbl[node]);
+ for_each_possible_cpu(cpu)
+ ctx->pwq_tbl[cpu] = install_unbound_pwq(ctx->wq, cpu,
+ ctx->pwq_tbl[cpu]);
/* @dfl_pwq might not have been used, ensure it's linked */
link_pwq(ctx->dfl_pwq);
@@ -4403,8 +4442,8 @@ static int apply_workqueue_attrs_locked(struct workqueue_struct *wq,
}
ctx = apply_wqattrs_prepare(wq, attrs, wq_unbound_cpumask);
- if (!ctx)
- return -ENOMEM;
+ if (IS_ERR(ctx))
+ return PTR_ERR(ctx);
/* the ctx has been prepared successfully, let's commit it */
apply_wqattrs_commit(ctx);
@@ -4418,12 +4457,11 @@ static int apply_workqueue_attrs_locked(struct workqueue_struct *wq,
* @wq: the target workqueue
* @attrs: the workqueue_attrs to apply, allocated with alloc_workqueue_attrs()
*
- * Apply @attrs to an unbound workqueue @wq. Unless disabled, on NUMA
- * machines, this function maps a separate pwq to each NUMA node with
- * possibles CPUs in @attrs->cpumask so that work items are affine to the
- * NUMA node it was issued on. Older pwqs are released as in-flight work
- * items finish. Note that a work item which repeatedly requeues itself
- * back-to-back will stay on its current pwq.
+ * Apply @attrs to an unbound workqueue @wq. Unless disabled, this function maps
+ * a separate pwq to each CPU pod with possible CPUs in @attrs->cpumask so that
+ * work items are affine to the pod it was issued on. Older pwqs are released as
+ * in-flight work items finish. Note that a work item which repeatedly requeues
+ * itself back-to-back will stay on its current pwq.
*
* Performs GFP_KERNEL allocations.
*
@@ -4446,40 +4484,37 @@ int apply_workqueue_attrs(struct workqueue_struct *wq,
}
/**
- * wq_update_unbound_numa - update NUMA affinity of a wq for CPU hot[un]plug
+ * wq_update_pod - update pod affinity of a wq for CPU hot[un]plug
* @wq: the target workqueue
- * @cpu: the CPU coming up or going down
+ * @cpu: the CPU to update pool association for
+ * @hotplug_cpu: the CPU coming up or going down
* @online: whether @cpu is coming up or going down
*
* This function is to be called from %CPU_DOWN_PREPARE, %CPU_ONLINE and
- * %CPU_DOWN_FAILED. @cpu is being hot[un]plugged, update NUMA affinity of
+ * %CPU_DOWN_FAILED. @hotplug_cpu is being hot[un]plugged, update pod affinity of
* @wq accordingly.
*
- * If NUMA affinity can't be adjusted due to memory allocation failure, it
- * falls back to @wq->dfl_pwq which may not be optimal but is always
- * correct.
- *
- * Note that when the last allowed CPU of a NUMA node goes offline for a
- * workqueue with a cpumask spanning multiple nodes, the workers which were
- * already executing the work items for the workqueue will lose their CPU
- * affinity and may execute on any CPU. This is similar to how per-cpu
- * workqueues behave on CPU_DOWN. If a workqueue user wants strict
- * affinity, it's the user's responsibility to flush the work item from
- * CPU_DOWN_PREPARE.
+ *
+ * If pod affinity can't be adjusted due to memory allocation failure, it falls
+ * back to @wq->dfl_pwq which may not be optimal but is always correct.
+ *
+ * Note that when the last allowed CPU of a pod goes offline for a workqueue
+ * with a cpumask spanning multiple pods, the workers which were already
+ * executing the work items for the workqueue will lose their CPU affinity and
+ * may execute on any CPU. This is similar to how per-cpu workqueues behave on
+ * CPU_DOWN. If a workqueue user wants strict affinity, it's the user's
+ * responsibility to flush the work item from CPU_DOWN_PREPARE.
*/
-static void wq_update_unbound_numa(struct workqueue_struct *wq, int cpu,
- bool online)
+static void wq_update_pod(struct workqueue_struct *wq, int cpu,
+ int hotplug_cpu, bool online)
{
- int node = cpu_to_node(cpu);
- int cpu_off = online ? -1 : cpu;
+ int off_cpu = online ? -1 : hotplug_cpu;
struct pool_workqueue *old_pwq = NULL, *pwq;
struct workqueue_attrs *target_attrs;
- cpumask_t *cpumask;
lockdep_assert_held(&wq_pool_mutex);
- if (!wq_numa_enabled || !(wq->flags & WQ_UNBOUND) ||
- wq->unbound_attrs->no_numa)
+ if (!(wq->flags & WQ_UNBOUND) || wq->unbound_attrs->ordered)
return;
/*
@@ -4487,36 +4522,29 @@ static void wq_update_unbound_numa(struct workqueue_struct *wq, int cpu,
* Let's use a preallocated one. The following buf is protected by
* CPU hotplug exclusion.
*/
- target_attrs = wq_update_unbound_numa_attrs_buf;
- cpumask = target_attrs->cpumask;
+ target_attrs = wq_update_pod_attrs_buf;
copy_workqueue_attrs(target_attrs, wq->unbound_attrs);
- pwq = unbound_pwq_by_node(wq, node);
+ wqattrs_actualize_cpumask(target_attrs, wq_unbound_cpumask);
- /*
- * Let's determine what needs to be done. If the target cpumask is
- * different from the default pwq's, we need to compare it to @pwq's
- * and create a new one if they don't match. If the target cpumask
- * equals the default pwq's, the default pwq should be used.
- */
- if (wq_calc_node_cpumask(wq->dfl_pwq->pool->attrs, node, cpu_off, cpumask)) {
- if (cpumask_equal(cpumask, pwq->pool->attrs->cpumask))
- return;
- } else {
- goto use_dfl_pwq;
- }
+ /* nothing to do if the target cpumask matches the current pwq */
+ wq_calc_pod_cpumask(target_attrs, cpu, off_cpu);
+ pwq = rcu_dereference_protected(*per_cpu_ptr(wq->cpu_pwq, cpu),
+ lockdep_is_held(&wq_pool_mutex));
+ if (wqattrs_equal(target_attrs, pwq->pool->attrs))
+ return;
/* create a new pwq */
pwq = alloc_unbound_pwq(wq, target_attrs);
if (!pwq) {
- pr_warn("workqueue: allocation failed while updating NUMA affinity of \"%s\"\n",
+ pr_warn("workqueue: allocation failed while updating CPU pod affinity of \"%s\"\n",
wq->name);
goto use_dfl_pwq;
}
/* Install the new pwq. */
mutex_lock(&wq->mutex);
- old_pwq = numa_pwq_tbl_install(wq, node, pwq);
+ old_pwq = install_unbound_pwq(wq, cpu, pwq);
goto out_unlock;
use_dfl_pwq:
@@ -4524,7 +4552,7 @@ use_dfl_pwq:
raw_spin_lock_irq(&wq->dfl_pwq->pool->lock);
get_pwq(wq->dfl_pwq);
raw_spin_unlock_irq(&wq->dfl_pwq->pool->lock);
- old_pwq = numa_pwq_tbl_install(wq, node, wq->dfl_pwq);
+ old_pwq = install_unbound_pwq(wq, cpu, wq->dfl_pwq);
out_unlock:
mutex_unlock(&wq->mutex);
put_pwq_unlocked(old_pwq);
@@ -4535,21 +4563,26 @@ static int alloc_and_link_pwqs(struct workqueue_struct *wq)
bool highpri = wq->flags & WQ_HIGHPRI;
int cpu, ret;
- if (!(wq->flags & WQ_UNBOUND)) {
- wq->cpu_pwqs = alloc_percpu(struct pool_workqueue);
- if (!wq->cpu_pwqs)
- return -ENOMEM;
+ wq->cpu_pwq = alloc_percpu(struct pool_workqueue *);
+ if (!wq->cpu_pwq)
+ goto enomem;
+ if (!(wq->flags & WQ_UNBOUND)) {
for_each_possible_cpu(cpu) {
- struct pool_workqueue *pwq =
- per_cpu_ptr(wq->cpu_pwqs, cpu);
- struct worker_pool *cpu_pools =
- per_cpu(cpu_worker_pools, cpu);
+ struct pool_workqueue **pwq_p =
+ per_cpu_ptr(wq->cpu_pwq, cpu);
+ struct worker_pool *pool =
+ &(per_cpu_ptr(cpu_worker_pools, cpu)[highpri]);
- init_pwq(pwq, wq, &cpu_pools[highpri]);
+ *pwq_p = kmem_cache_alloc_node(pwq_cache, GFP_KERNEL,
+ pool->node);
+ if (!*pwq_p)
+ goto enomem;
+
+ init_pwq(*pwq_p, wq, pool);
mutex_lock(&wq->mutex);
- link_pwq(pwq);
+ link_pwq(*pwq_p);
mutex_unlock(&wq->mutex);
}
return 0;
@@ -4568,18 +4601,25 @@ static int alloc_and_link_pwqs(struct workqueue_struct *wq)
cpus_read_unlock();
return ret;
+
+enomem:
+	/*
+	 * Unwind a partially populated wq->cpu_pwq. The entries filled in
+	 * above come from pwq_cache, so hand them back to that cache instead
+	 * of going through kfree(). Slots that were never filled are expected
+	 * to still be NULL (alloc_percpu() zero-fills) and are skipped.
+	 */
+	if (wq->cpu_pwq) {
+		for_each_possible_cpu(cpu) {
+			struct pool_workqueue *pwq =
+				*per_cpu_ptr(wq->cpu_pwq, cpu);
+
+			if (pwq)
+				kmem_cache_free(pwq_cache, pwq);
+		}
+		free_percpu(wq->cpu_pwq);
+		wq->cpu_pwq = NULL;
+	}
+	return -ENOMEM;
}
static int wq_clamp_max_active(int max_active, unsigned int flags,
const char *name)
{
- int lim = flags & WQ_UNBOUND ? WQ_UNBOUND_MAX_ACTIVE : WQ_MAX_ACTIVE;
-
- if (max_active < 1 || max_active > lim)
+ if (max_active < 1 || max_active > WQ_MAX_ACTIVE)
pr_warn("workqueue: max_active %d requested for %s is out of range, clamping between %d and %d\n",
- max_active, name, 1, lim);
+ max_active, name, 1, WQ_MAX_ACTIVE);
- return clamp_val(max_active, 1, lim);
+ return clamp_val(max_active, 1, WQ_MAX_ACTIVE);
}
/*
@@ -4602,7 +4642,7 @@ static int init_rescuer(struct workqueue_struct *wq)
}
rescuer->rescue_wq = wq;
- rescuer->task = kthread_create(rescuer_thread, rescuer, "%s", wq->name);
+ rescuer->task = kthread_create(rescuer_thread, rescuer, "kworker/R-%s", wq->name);
if (IS_ERR(rescuer->task)) {
ret = PTR_ERR(rescuer->task);
pr_err("workqueue: Failed to create a rescuer kthread for wq \"%s\": %pe",
@@ -4623,17 +4663,15 @@ struct workqueue_struct *alloc_workqueue(const char *fmt,
unsigned int flags,
int max_active, ...)
{
- size_t tbl_size = 0;
va_list args;
struct workqueue_struct *wq;
struct pool_workqueue *pwq;
/*
- * Unbound && max_active == 1 used to imply ordered, which is no
- * longer the case on NUMA machines due to per-node pools. While
+ * Unbound && max_active == 1 used to imply ordered, which is no longer
+ * the case on many machines due to per-pod pools. While
* alloc_ordered_workqueue() is the right way to create an ordered
- * workqueue, keep the previous behavior to avoid subtle breakages
- * on NUMA.
+ * workqueue, keep the previous behavior to avoid subtle breakages.
*/
if ((flags & WQ_UNBOUND) && max_active == 1)
flags |= __WQ_ORDERED;
@@ -4643,10 +4681,7 @@ struct workqueue_struct *alloc_workqueue(const char *fmt,
flags |= WQ_UNBOUND;
/* allocate wq and format name */
- if (flags & WQ_UNBOUND)
- tbl_size = nr_node_ids * sizeof(wq->numa_pwq_tbl[0]);
-
- wq = kzalloc(sizeof(*wq) + tbl_size, GFP_KERNEL);
+ wq = kzalloc(sizeof(*wq), GFP_KERNEL);
if (!wq)
return NULL;
@@ -4741,7 +4776,7 @@ static bool pwq_busy(struct pool_workqueue *pwq)
void destroy_workqueue(struct workqueue_struct *wq)
{
struct pool_workqueue *pwq;
- int node;
+ int cpu;
/*
* Remove it from sysfs first so that sanity check failure doesn't
@@ -4800,33 +4835,23 @@ void destroy_workqueue(struct workqueue_struct *wq)
list_del_rcu(&wq->list);
mutex_unlock(&wq_pool_mutex);
- if (!(wq->flags & WQ_UNBOUND)) {
- wq_unregister_lockdep(wq);
- /*
- * The base ref is never dropped on per-cpu pwqs. Directly
- * schedule RCU free.
- */
- call_rcu(&wq->rcu, rcu_free_wq);
- } else {
- /*
- * We're the sole accessor of @wq at this point. Directly
- * access numa_pwq_tbl[] and dfl_pwq to put the base refs.
- * @wq will be freed when the last pwq is released.
- */
- for_each_node(node) {
- pwq = rcu_access_pointer(wq->numa_pwq_tbl[node]);
- RCU_INIT_POINTER(wq->numa_pwq_tbl[node], NULL);
- put_pwq_unlocked(pwq);
- }
+ /*
+ * We're the sole accessor of @wq. Directly access cpu_pwq and dfl_pwq
+ * to put the base refs. @wq will be auto-destroyed from the last
+ * pwq_put. RCU read lock prevents @wq from going away from under us.
+ */
+ rcu_read_lock();
- /*
- * Put dfl_pwq. @wq may be freed any time after dfl_pwq is
- * put. Don't access it afterwards.
- */
- pwq = wq->dfl_pwq;
- wq->dfl_pwq = NULL;
+ for_each_possible_cpu(cpu) {
+ pwq = rcu_access_pointer(*per_cpu_ptr(wq->cpu_pwq, cpu));
+ RCU_INIT_POINTER(*per_cpu_ptr(wq->cpu_pwq, cpu), NULL);
put_pwq_unlocked(pwq);
}
+
+ put_pwq_unlocked(wq->dfl_pwq);
+ wq->dfl_pwq = NULL;
+
+ rcu_read_unlock();
}
EXPORT_SYMBOL_GPL(destroy_workqueue);
@@ -4903,10 +4928,11 @@ bool current_is_workqueue_rescuer(void)
* unreliable and only useful as advisory hints or for debugging.
*
* If @cpu is WORK_CPU_UNBOUND, the test is performed on the local CPU.
- * Note that both per-cpu and unbound workqueues may be associated with
- * multiple pool_workqueues which have separate congested states. A
- * workqueue being congested on one CPU doesn't mean the workqueue is also
- * contested on other CPUs / NUMA nodes.
+ *
+ * With the exception of ordered workqueues, all workqueues have per-cpu
+ * pool_workqueues, each with its own congested state. A workqueue being
+ * congested on one CPU doesn't mean that the workqueue is congested on any
+ * other CPUs.
*
* Return:
* %true if congested, %false otherwise.
@@ -4922,12 +4948,9 @@ bool workqueue_congested(int cpu, struct workqueue_struct *wq)
if (cpu == WORK_CPU_UNBOUND)
cpu = smp_processor_id();
- if (!(wq->flags & WQ_UNBOUND))
- pwq = per_cpu_ptr(wq->cpu_pwqs, cpu);
- else
- pwq = unbound_pwq_by_node(wq, cpu_to_node(cpu));
-
+ pwq = *per_cpu_ptr(wq->cpu_pwq, cpu);
ret = !list_empty(&pwq->inactive_works);
+
preempt_enable();
rcu_read_unlock();
@@ -5402,7 +5425,7 @@ static void unbind_workers(int cpu)
* worker blocking could lead to lengthy stalls. Kick off
* unbound chain execution of currently pending work items.
*/
- wake_up_worker(pool);
+ kick_pool(pool);
raw_spin_unlock_irq(&pool->lock);
@@ -5435,7 +5458,7 @@ static void rebind_workers(struct worker_pool *pool)
for_each_pool_worker(worker, pool) {
kthread_set_per_cpu(worker->task, pool->cpu);
WARN_ON_ONCE(set_cpus_allowed_ptr(worker->task,
- pool->attrs->cpumask) < 0);
+ pool_allowed_cpus(pool)) < 0);
}
raw_spin_lock_irq(&pool->lock);
@@ -5529,9 +5552,18 @@ int workqueue_online_cpu(unsigned int cpu)
mutex_unlock(&wq_pool_attach_mutex);
}
- /* update NUMA affinity of unbound workqueues */
- list_for_each_entry(wq, &workqueues, list)
- wq_update_unbound_numa(wq, cpu, true);
+ /* update pod affinity of unbound workqueues */
+ list_for_each_entry(wq, &workqueues, list) {
+ struct workqueue_attrs *attrs = wq->unbound_attrs;
+
+ if (attrs) {
+ const struct wq_pod_type *pt = wqattrs_pod_type(attrs);
+ int tcpu;
+
+ for_each_cpu(tcpu, pt->pod_cpus[pt->cpu_pod[cpu]])
+ wq_update_pod(wq, tcpu, cpu, true);
+ }
+ }
mutex_unlock(&wq_pool_mutex);
return 0;
@@ -5547,10 +5579,19 @@ int workqueue_offline_cpu(unsigned int cpu)
unbind_workers(cpu);
- /* update NUMA affinity of unbound workqueues */
+ /* update pod affinity of unbound workqueues */
mutex_lock(&wq_pool_mutex);
- list_for_each_entry(wq, &workqueues, list)
- wq_update_unbound_numa(wq, cpu, false);
+ list_for_each_entry(wq, &workqueues, list) {
+ struct workqueue_attrs *attrs = wq->unbound_attrs;
+
+ if (attrs) {
+ const struct wq_pod_type *pt = wqattrs_pod_type(attrs);
+ int tcpu;
+
+ for_each_cpu(tcpu, pt->pod_cpus[pt->cpu_pod[cpu]])
+ wq_update_pod(wq, tcpu, cpu, false);
+ }
+ }
mutex_unlock(&wq_pool_mutex);
return 0;
@@ -5746,8 +5787,8 @@ static int workqueue_apply_unbound_cpumask(const cpumask_var_t unbound_cpumask)
continue;
ctx = apply_wqattrs_prepare(wq, wq->unbound_attrs, unbound_cpumask);
- if (!ctx) {
- ret = -ENOMEM;
+ if (IS_ERR(ctx)) {
+ ret = PTR_ERR(ctx);
break;
}
@@ -5805,21 +5846,72 @@ out_unlock:
return ret;
}
+/*
+ * Translate an affinity scope name into its index in wq_affn_names[].
+ *
+ * The comparison is case-insensitive and only covers the length of each
+ * table entry, so @val matches as soon as it starts with a known name.
+ * Returns the matching index, or -EINVAL if no entry matches.
+ */
+static int parse_affn_scope(const char *val)
+{
+	int scope = 0;
+
+	while (scope < ARRAY_SIZE(wq_affn_names)) {
+		const char *name = wq_affn_names[scope];
+
+		if (strncasecmp(val, name, strlen(name)) == 0)
+			return scope;
+		scope++;
+	}
+	return -EINVAL;
+}
+
+/*
+ * kernel_param "set" handler: parse @val as an affinity scope name, store it
+ * in wq_affn_dfl and call wq_update_pod() for every workqueue on every online
+ * CPU. Returns 0 on success, the parse error if @val is not a known scope, or
+ * -EINVAL if @val names WQ_AFFN_DFL.
+ */
+static int wq_affn_dfl_set(const char *val, const struct kernel_param *kp)
+{
+	const int scope = parse_affn_scope(val);
+	struct workqueue_struct *wq;
+	int cpu;
+
+	/* parse failures are passed through; WQ_AFFN_DFL itself is rejected */
+	if (scope < 0)
+		return scope;
+	if (scope == WQ_AFFN_DFL)
+		return -EINVAL;
+
+	cpus_read_lock();
+	mutex_lock(&wq_pool_mutex);
+
+	wq_affn_dfl = scope;
+
+	/* visit every (workqueue, online CPU) pair with the new default */
+	list_for_each_entry(wq, &workqueues, list)
+		for_each_online_cpu(cpu)
+			wq_update_pod(wq, cpu, cpu, true);
+
+	mutex_unlock(&wq_pool_mutex);
+	cpus_read_unlock();
+
+	return 0;
+}
+
+/*
+ * kernel_param "get" handler: print the name of the current default affinity
+ * scope followed by a newline into @buffer and return scnprintf()'s result.
+ */
+static int wq_affn_dfl_get(char *buffer, const struct kernel_param *kp)
+{
+	const char *name = wq_affn_names[wq_affn_dfl];
+
+	return scnprintf(buffer, PAGE_SIZE, "%s\n", name);
+}
+
+/*
+ * Set/get callbacks for the "default_affinity_scope" parameter registered
+ * right below with mode 0644 and no backing argument (NULL).
+ */
+static const struct kernel_param_ops wq_affn_dfl_ops = {
+ .set = wq_affn_dfl_set,
+ .get = wq_affn_dfl_get,
+};
+
+module_param_cb(default_affinity_scope, &wq_affn_dfl_ops, NULL, 0644);
+
#ifdef CONFIG_SYSFS
/*
* Workqueues with WQ_SYSFS flag set is visible to userland via
* /sys/bus/workqueue/devices/WQ_NAME. All visible workqueues have the
* following attributes.
*
- * per_cpu RO bool : whether the workqueue is per-cpu or unbound
- * max_active RW int : maximum number of in-flight work items
+ * per_cpu RO bool : whether the workqueue is per-cpu or unbound
+ * max_active RW int : maximum number of in-flight work items
*
* Unbound workqueues have the following extra attributes.
*
- * pool_ids RO int : the associated pool IDs for each node
- * nice RW int : nice value of the workers
- * cpumask RW mask : bitmask of allowed CPUs for the workers
- * numa RW bool : whether enable NUMA affinity
+ * nice RW int : nice value of the workers
+ * cpumask RW mask : bitmask of allowed CPUs for the workers
+ * affinity_scope RW str : worker CPU affinity scope (cache, numa, none)
+ * affinity_strict RW bool : worker CPU affinity is strict
*/
struct wq_device {
struct workqueue_struct *wq;
@@ -5872,28 +5964,6 @@ static struct attribute *wq_sysfs_attrs[] = {
};
ATTRIBUTE_GROUPS(wq_sysfs);
-static ssize_t wq_pool_ids_show(struct device *dev,
- struct device_attribute *attr, char *buf)
-{
- struct workqueue_struct *wq = dev_to_wq(dev);
- const char *delim = "";
- int node, written = 0;
-
- cpus_read_lock();
- rcu_read_lock();
- for_each_node(node) {
- written += scnprintf(buf + written, PAGE_SIZE - written,
- "%s%d:%d", delim, node,
- unbound_pwq_by_node(wq, node)->pool->id);
- delim = " ";
- }
- written += scnprintf(buf + written, PAGE_SIZE - written, "\n");
- rcu_read_unlock();
- cpus_read_unlock();
-
- return written;
-}
-
static ssize_t wq_nice_show(struct device *dev, struct device_attribute *attr,
char *buf)
{
@@ -5984,50 +6054,84 @@ out_unlock:
return ret ?: count;
}
-static ssize_t wq_numa_show(struct device *dev, struct device_attribute *attr,
- char *buf)
+static ssize_t wq_affn_scope_show(struct device *dev,
+ struct device_attribute *attr, char *buf)
{
struct workqueue_struct *wq = dev_to_wq(dev);
int written;
mutex_lock(&wq->mutex);
- written = scnprintf(buf, PAGE_SIZE, "%d\n",
- !wq->unbound_attrs->no_numa);
+ if (wq->unbound_attrs->affn_scope == WQ_AFFN_DFL)
+ written = scnprintf(buf, PAGE_SIZE, "%s (%s)\n",
+ wq_affn_names[WQ_AFFN_DFL],
+ wq_affn_names[wq_affn_dfl]);
+ else
+ written = scnprintf(buf, PAGE_SIZE, "%s\n",
+ wq_affn_names[wq->unbound_attrs->affn_scope]);
mutex_unlock(&wq->mutex);
return written;
}
-static ssize_t wq_numa_store(struct device *dev, struct device_attribute *attr,
- const char *buf, size_t count)
+static ssize_t wq_affn_scope_store(struct device *dev,
+ struct device_attribute *attr,
+ const char *buf, size_t count)
{
struct workqueue_struct *wq = dev_to_wq(dev);
struct workqueue_attrs *attrs;
- int v, ret = -ENOMEM;
+ int affn, ret = -ENOMEM;
- apply_wqattrs_lock();
+ affn = parse_affn_scope(buf);
+ if (affn < 0)
+ return affn;
+ apply_wqattrs_lock();
attrs = wq_sysfs_prep_attrs(wq);
- if (!attrs)
- goto out_unlock;
-
- ret = -EINVAL;
- if (sscanf(buf, "%d", &v) == 1) {
- attrs->no_numa = !v;
+ if (attrs) {
+ attrs->affn_scope = affn;
ret = apply_workqueue_attrs_locked(wq, attrs);
}
+ apply_wqattrs_unlock();
+ free_workqueue_attrs(attrs);
+ return ret ?: count;
+}
-out_unlock:
+/* sysfs "affinity_strict" show: print the workqueue's affn_strict as "%d\n". */
+static ssize_t wq_affinity_strict_show(struct device *dev,
+				       struct device_attribute *attr, char *buf)
+{
+	struct workqueue_struct *wq = dev_to_wq(dev);
+	const struct workqueue_attrs *attrs = wq->unbound_attrs;
+
+	return scnprintf(buf, PAGE_SIZE, "%d\n", attrs->affn_strict);
+}
+
+/*
+ * sysfs "affinity_strict" store: parse @buf as a decimal integer and apply it
+ * as the workqueue's affn_strict setting.
+ *
+ * Returns @count on success, -EINVAL if @buf does not start with an integer,
+ * -ENOMEM if wq_sysfs_prep_attrs() returns NULL, or the non-zero result of
+ * apply_workqueue_attrs_locked().
+ */
+static ssize_t wq_affinity_strict_store(struct device *dev,
+ struct device_attribute *attr,
+ const char *buf, size_t count)
+{
+ struct workqueue_struct *wq = dev_to_wq(dev);
+ struct workqueue_attrs *attrs;
+ int v, ret = -ENOMEM;
+
+ /* validate the input before taking any lock */
+ if (sscanf(buf, "%d", &v) != 1)
+ return -EINVAL;
+
+ apply_wqattrs_lock();
+ attrs = wq_sysfs_prep_attrs(wq);
+ if (attrs) {
+ /* any non-zero value enables strict affinity */
+ attrs->affn_strict = (bool)v;
+ ret = apply_workqueue_attrs_locked(wq, attrs);
+ }
apply_wqattrs_unlock();
/* NOTE(review): reached with attrs == NULL on allocation failure; presumably NULL-tolerant - confirm */
free_workqueue_attrs(attrs);
return ret ?: count;
}
static struct device_attribute wq_sysfs_unbound_attrs[] = {
- __ATTR(pool_ids, 0444, wq_pool_ids_show, NULL),
__ATTR(nice, 0644, wq_nice_show, wq_nice_store),
__ATTR(cpumask, 0644, wq_cpumask_show, wq_cpumask_store),
- __ATTR(numa, 0644, wq_numa_show, wq_numa_store),
+ __ATTR(affinity_scope, 0644, wq_affn_scope_show, wq_affn_scope_store),
+ __ATTR(affinity_strict, 0644, wq_affinity_strict_show, wq_affinity_strict_store),
__ATTR_NULL,
};
@@ -6393,62 +6497,19 @@ static inline void wq_watchdog_init(void) { }
#endif /* CONFIG_WQ_WATCHDOG */
-static void __init wq_numa_init(void)
-{
- cpumask_var_t *tbl;
- int node, cpu;
-
- if (num_possible_nodes() <= 1)
- return;
-
- if (wq_disable_numa) {
- pr_info("workqueue: NUMA affinity support disabled\n");
- return;
- }
-
- for_each_possible_cpu(cpu) {
- if (WARN_ON(cpu_to_node(cpu) == NUMA_NO_NODE)) {
- pr_warn("workqueue: NUMA node mapping not available for cpu%d, disabling NUMA support\n", cpu);
- return;
- }
- }
-
- wq_update_unbound_numa_attrs_buf = alloc_workqueue_attrs();
- BUG_ON(!wq_update_unbound_numa_attrs_buf);
-
- /*
- * We want masks of possible CPUs of each node which isn't readily
- * available. Build one from cpu_to_node() which should have been
- * fully initialized by now.
- */
- tbl = kcalloc(nr_node_ids, sizeof(tbl[0]), GFP_KERNEL);
- BUG_ON(!tbl);
-
- for_each_node(node)
- BUG_ON(!zalloc_cpumask_var_node(&tbl[node], GFP_KERNEL,
- node_online(node) ? node : NUMA_NO_NODE));
-
- for_each_possible_cpu(cpu) {
- node = cpu_to_node(cpu);
- cpumask_set_cpu(cpu, tbl[node]);
- }
-
- wq_numa_possible_cpumask = tbl;
- wq_numa_enabled = true;
-}
-
/**
* workqueue_init_early - early init for workqueue subsystem
*
- * This is the first half of two-staged workqueue subsystem initialization
- * and invoked as soon as the bare basics - memory allocation, cpumasks and
- * idr are up. It sets up all the data structures and system workqueues
- * and allows early boot code to create workqueues and queue/cancel work
- * items. Actual work item execution starts only after kthreads can be
- * created and scheduled right before early initcalls.
+ * This is the first step of three-staged workqueue subsystem initialization and
+ * invoked as soon as the bare basics - memory allocation, cpumasks and idr are
+ * up. It sets up all the data structures and system workqueues and allows early
+ * boot code to create workqueues and queue/cancel work items. Actual work item
+ * execution starts only after kthreads can be created and scheduled right
+ * before early initcalls.
*/
void __init workqueue_init_early(void)
{
+ struct wq_pod_type *pt = &wq_pod_types[WQ_AFFN_SYSTEM];
int std_nice[NR_STD_WORKER_POOLS] = { 0, HIGHPRI_NICE_LEVEL };
int i, cpu;
@@ -6458,8 +6519,30 @@ void __init workqueue_init_early(void)
cpumask_copy(wq_unbound_cpumask, housekeeping_cpumask(HK_TYPE_WQ));
cpumask_and(wq_unbound_cpumask, wq_unbound_cpumask, housekeeping_cpumask(HK_TYPE_DOMAIN));
+ if (!cpumask_empty(&wq_cmdline_cpumask))
+ cpumask_and(wq_unbound_cpumask, wq_unbound_cpumask, &wq_cmdline_cpumask);
+
pwq_cache = KMEM_CACHE(pool_workqueue, SLAB_PANIC);
+	/*
+	 * Scratch attrs used by wq_update_pod(). Allocate it exactly once: a
+	 * second assignment would overwrite the pointer and leak the first
+	 * buffer.
+	 */
+	wq_update_pod_attrs_buf = alloc_workqueue_attrs();
+	BUG_ON(!wq_update_pod_attrs_buf);
+
+	/* initialize WQ_AFFN_SYSTEM pods: a single pod spanning all possible CPUs */
+	pt->pod_cpus = kcalloc(1, sizeof(pt->pod_cpus[0]), GFP_KERNEL);
+	pt->pod_node = kcalloc(1, sizeof(pt->pod_node[0]), GFP_KERNEL);
+	pt->cpu_pod = kcalloc(nr_cpu_ids, sizeof(pt->cpu_pod[0]), GFP_KERNEL);
+	BUG_ON(!pt->pod_cpus || !pt->pod_node || !pt->cpu_pod);
+
+	BUG_ON(!zalloc_cpumask_var_node(&pt->pod_cpus[0], GFP_KERNEL, NUMA_NO_NODE));
+
+	pt->nr_pods = 1;
+	cpumask_copy(pt->pod_cpus[0], cpu_possible_mask);
+	pt->pod_node[0] = NUMA_NO_NODE;
+	pt->cpu_pod[0] = 0;
+
/* initialize CPU pools */
for_each_possible_cpu(cpu) {
struct worker_pool *pool;
@@ -6469,7 +6552,9 @@ void __init workqueue_init_early(void)
BUG_ON(init_worker_pool(pool));
pool->cpu = cpu;
cpumask_copy(pool->attrs->cpumask, cpumask_of(cpu));
+ cpumask_copy(pool->attrs->__pod_cpumask, cpumask_of(cpu));
pool->attrs->nice = std_nice[i++];
+ pool->attrs->affn_strict = true;
pool->node = cpu_to_node(cpu);
/* alloc pool ID */
@@ -6490,11 +6575,10 @@ void __init workqueue_init_early(void)
/*
* An ordered wq should have only one pwq as ordering is
* guaranteed by max_active which is enforced by pwqs.
- * Turn off NUMA so that dfl_pwq is used for all nodes.
*/
BUG_ON(!(attrs = alloc_workqueue_attrs()));
attrs->nice = std_nice[i];
- attrs->no_numa = true;
+ attrs->ordered = true;
ordered_wq_attrs[i] = attrs;
}
@@ -6502,7 +6586,7 @@ void __init workqueue_init_early(void)
system_highpri_wq = alloc_workqueue("events_highpri", WQ_HIGHPRI, 0);
system_long_wq = alloc_workqueue("events_long", 0, 0);
system_unbound_wq = alloc_workqueue("events_unbound", WQ_UNBOUND,
- WQ_UNBOUND_MAX_ACTIVE);
+ WQ_MAX_ACTIVE);
system_freezable_wq = alloc_workqueue("events_freezable",
WQ_FREEZABLE, 0);
system_power_efficient_wq = alloc_workqueue("events_power_efficient",
@@ -6525,6 +6609,9 @@ static void __init wq_cpu_intensive_thresh_init(void)
if (wq_cpu_intensive_thresh_us != ULONG_MAX)
return;
+ pwq_release_worker = kthread_create_worker(0, "pool_workqueue_release");
+ BUG_ON(IS_ERR(pwq_release_worker));
+
/*
* The default of 10ms is derived from the fact that most modern (as of
* 2023) processors can do a lot in 10ms and that it's just below what
@@ -6555,11 +6642,11 @@ static void __init wq_cpu_intensive_thresh_init(void)
/**
* workqueue_init - bring workqueue subsystem fully online
*
- * This is the latter half of two-staged workqueue subsystem initialization
- * and invoked as soon as kthreads can be created and scheduled.
- * Workqueues have been created and work items queued on them, but there
- * are no kworkers executing the work items yet. Populate the worker pools
- * with the initial workers and enable future kworker creations.
+ * This is the second step of three-staged workqueue subsystem initialization
+ * and invoked as soon as kthreads can be created and scheduled. Workqueues have
+ * been created and work items queued on them, but there are no kworkers
+ * executing the work items yet. Populate the worker pools with the initial
+ * workers and enable future kworker creations.
*/
void __init workqueue_init(void)
{
@@ -6569,19 +6656,12 @@ void __init workqueue_init(void)
wq_cpu_intensive_thresh_init();
- /*
- * It'd be simpler to initialize NUMA in workqueue_init_early() but
- * CPU to node mapping may not be available that early on some
- * archs such as power and arm64. As per-cpu pools created
- * previously could be missing node hint and unbound pools NUMA
- * affinity, fix them up.
- *
- * Also, while iterating workqueues, create rescuers if requested.
- */
- wq_numa_init();
-
mutex_lock(&wq_pool_mutex);
+ /*
+ * Per-cpu pools created earlier could be missing node hint. Fix them
+ * up. Also, create a rescuer for workqueues that requested it.
+ */
for_each_possible_cpu(cpu) {
for_each_cpu_worker_pool(pool, cpu) {
pool->node = cpu_to_node(cpu);
@@ -6589,7 +6669,6 @@ void __init workqueue_init(void)
}
list_for_each_entry(wq, &workqueues, list) {
- wq_update_unbound_numa(wq, smp_processor_id(), true);
WARN(init_rescuer(wq),
"workqueue: failed to create early rescuer for %s",
wq->name);
@@ -6613,9 +6692,114 @@ void __init workqueue_init(void)
}
/*
- * Despite the naming, this is a no-op function which is here only for avoiding
- * link error. Since compile-time warning may fail to catch, we will need to
- * emit run-time warning from __flush_workqueue().
+ * Initialize @pt by first initializing @pt->cpu_pod[] with pod IDs according to
+ * @cpus_share_pod(). Each subset of CPUs that share a pod is assigned a unique
+ * and consecutive pod ID. The rest of @pt is initialized accordingly.
+ */
+static void __init init_pod_type(struct wq_pod_type *pt,
+ bool (*cpus_share_pod)(int, int))
+{
+ int cur, pre, cpu, pod;
+
+ pt->nr_pods = 0;
+
+ /* init @pt->cpu_pod[] according to @cpus_share_pod() */
+ pt->cpu_pod = kcalloc(nr_cpu_ids, sizeof(pt->cpu_pod[0]), GFP_KERNEL);
+ BUG_ON(!pt->cpu_pod);
+
+ /*
+ * For every CPU @cur, scan the possible CPUs in iteration order for an
+ * earlier CPU @pre that shares a pod with it.
+ */
+ for_each_possible_cpu(cur) {
+ for_each_possible_cpu(pre) {
+ /* reached @cur without a match: @cur opens a new pod */
+ if (pre >= cur) {
+ pt->cpu_pod[cur] = pt->nr_pods++;
+ break;
+ }
+ /* @cur joins the pod already assigned to @pre */
+ if (cpus_share_pod(cur, pre)) {
+ pt->cpu_pod[cur] = pt->cpu_pod[pre];
+ break;
+ }
+ }
+ }
+
+ /* init the rest to match @pt->cpu_pod[] */
+ pt->pod_cpus = kcalloc(pt->nr_pods, sizeof(pt->pod_cpus[0]), GFP_KERNEL);
+ pt->pod_node = kcalloc(pt->nr_pods, sizeof(pt->pod_node[0]), GFP_KERNEL);
+ BUG_ON(!pt->pod_cpus || !pt->pod_node);
+
+ for (pod = 0; pod < pt->nr_pods; pod++)
+ BUG_ON(!zalloc_cpumask_var(&pt->pod_cpus[pod], GFP_KERNEL));
+
+ /*
+ * Add each CPU to its pod's mask. pod_node[] is overwritten by every
+ * member, so it ends up holding the node of the last CPU visited in
+ * each pod.
+ */
+ for_each_possible_cpu(cpu) {
+ cpumask_set_cpu(cpu, pt->pod_cpus[pt->cpu_pod[cpu]]);
+ pt->pod_node[pt->cpu_pod[cpu]] = cpu_to_node(cpu);
+ }
+}
+
+/*
+ * Pod predicate that never matches. Passed to init_pod_type() for
+ * WQ_AFFN_CPU, where it makes every CPU open its own pod.
+ */
+static bool __init cpus_dont_share(int cpu0, int cpu1)
+{
+ return false;
+}
+
+/*
+ * Pod predicate for WQ_AFFN_SMT: true if @cpu0 is set in cpu_smt_mask(@cpu1).
+ * Without CONFIG_SCHED_SMT it always returns false.
+ */
+static bool __init cpus_share_smt(int cpu0, int cpu1)
+{
+#ifdef CONFIG_SCHED_SMT
+ return cpumask_test_cpu(cpu0, cpu_smt_mask(cpu1));
+#else
+ return false;
+#endif
+}
+
+/* Pod predicate for WQ_AFFN_NUMA: true if both CPUs map to the same node. */
+static bool __init cpus_share_numa(int cpu0, int cpu1)
+{
+	const int node0 = cpu_to_node(cpu0);
+	const int node1 = cpu_to_node(cpu1);
+
+	return node0 == node1;
+}
+
+/**
+ * workqueue_init_topology - initialize CPU pods for unbound workqueues
+ *
+ * This is the third step of three-staged workqueue subsystem initialization and
+ * invoked after SMP and topology information are fully initialized. It
+ * initializes the unbound CPU pods accordingly.
*/
-void __warn_flushing_systemwide_wq(void) { }
+void __init workqueue_init_topology(void)
+{
+	struct workqueue_struct *wq;
+	int cpu;
+
+	/* build one pod table per affinity scope from its sharing predicate */
+	init_pod_type(&wq_pod_types[WQ_AFFN_CPU], cpus_dont_share);
+	init_pod_type(&wq_pod_types[WQ_AFFN_SMT], cpus_share_smt);
+	init_pod_type(&wq_pod_types[WQ_AFFN_CACHE], cpus_share_cache);
+	init_pod_type(&wq_pod_types[WQ_AFFN_NUMA], cpus_share_numa);
+
+	mutex_lock(&wq_pool_mutex);
+
+	/*
+	 * Workqueues created before this point had every CPU share the
+	 * default worker pool. Run wq_update_pod() over each workqueue and
+	 * online CPU pair so that per-pod sharing takes effect.
+	 */
+	list_for_each_entry(wq, &workqueues, list)
+		for_each_online_cpu(cpu)
+			wq_update_pod(wq, cpu, cpu, true);
+
+	mutex_unlock(&wq_pool_mutex);
+}
+
+/*
+ * Log a warning that flushing system-wide workqueues is going away, followed
+ * by a stack dump of the current context.
+ */
+void __warn_flushing_systemwide_wq(void)
+{
+ pr_warn("WARNING: Flushing system-wide workqueues will be prohibited in near future.\n");
+ dump_stack();
+}
EXPORT_SYMBOL(__warn_flushing_systemwide_wq);
+
+/*
+ * Boot-time handler for "workqueue.unbound_cpus=": parse @str as a CPU list
+ * into wq_cmdline_cpumask. On a parse error the mask is cleared and a warning
+ * is logged. Always returns 1.
+ */
+static int __init workqueue_unbound_cpus_setup(char *str)
+{
+	int err = cpulist_parse(str, &wq_cmdline_cpumask);
+
+	if (err >= 0)
+		return 1;
+
+	/* discard whatever was parsed so far and fall back to the default */
+	cpumask_clear(&wq_cmdline_cpumask);
+	pr_warn("workqueue.unbound_cpus: incorrect CPU range, using default\n");
+
+	return 1;
+}
+__setup("workqueue.unbound_cpus=", workqueue_unbound_cpus_setup);
diff --git a/kernel/workqueue_internal.h b/kernel/workqueue_internal.h
index 6b1d66e28269..f6275944ada7 100644
--- a/kernel/workqueue_internal.h
+++ b/kernel/workqueue_internal.h
@@ -48,7 +48,7 @@ struct worker {
/* A: runs through worker->node */
unsigned long last_active; /* K: last active timestamp */
- unsigned int flags; /* X: flags */
+ unsigned int flags; /* L: flags */
int id; /* I: worker id */
/*