summaryrefslogtreecommitdiffstats
path: root/kernel
diff options
context:
space:
mode:
Diffstat (limited to 'kernel')
-rw-r--r--kernel/auditsc.c6
-rw-r--r--kernel/bpf/core.c1
-rw-r--r--kernel/bpf/ringbuf.c4
-rw-r--r--kernel/bpf/syscall.c4
-rw-r--r--kernel/capability.c104
-rw-r--r--kernel/crash_core.c4
-rw-r--r--kernel/dma/swiotlb.c11
-rw-r--r--kernel/events/core.c3
-rw-r--r--kernel/events/uprobes.c5
-rw-r--r--kernel/fail_function.c5
-rw-r--r--kernel/fork.c23
-rwxr-xr-xkernel/gen_kheaders.sh2
-rw-r--r--kernel/hung_task.c2
-rw-r--r--kernel/irq/irqdomain.c39
-rw-r--r--kernel/irq/msi.c27
-rw-r--r--kernel/kcov.c2
-rw-r--r--kernel/kexec.c4
-rw-r--r--kernel/kexec_core.c94
-rw-r--r--kernel/kexec_file.c11
-rw-r--r--kernel/kprobes.c27
-rw-r--r--kernel/ksysfs.c9
-rw-r--r--kernel/kthread.c5
-rw-r--r--kernel/livepatch/core.c62
-rw-r--r--kernel/module/main.c3
-rw-r--r--kernel/params.c3
-rw-r--r--kernel/pid_namespace.c5
-rw-r--r--kernel/pid_sysctl.h60
-rw-r--r--kernel/printk/index.c2
-rw-r--r--kernel/printk/internal.h45
-rw-r--r--kernel/printk/printk.c308
-rw-r--r--kernel/relay.c2
-rw-r--r--kernel/resource.c14
-rw-r--r--kernel/sched/fair.c14
-rw-r--r--kernel/sys.c33
-rw-r--r--kernel/sysctl.c43
-rw-r--r--kernel/trace/Kconfig20
-rw-r--r--kernel/trace/blktrace.c6
-rw-r--r--kernel/trace/kprobe_event_gen_test.c2
-rw-r--r--kernel/trace/ring_buffer.c51
-rw-r--r--kernel/trace/synth_event_gen_test.c2
-rw-r--r--kernel/trace/trace.c168
-rw-r--r--kernel/trace/trace.h8
-rw-r--r--kernel/trace/trace_eprobe.c95
-rw-r--r--kernel/trace/trace_events.c13
-rw-r--r--kernel/trace/trace_events_filter.c93
-rw-r--r--kernel/trace/trace_events_hist.c126
-rw-r--r--kernel/trace/trace_events_synth.c94
-rw-r--r--kernel/trace/trace_kprobe.c72
-rw-r--r--kernel/trace/trace_osnoise.c2
-rw-r--r--kernel/trace/trace_probe.c29
-rw-r--r--kernel/trace/trace_probe.h3
-rw-r--r--kernel/trace/trace_probe_kernel.h30
-rw-r--r--kernel/trace/trace_probe_tmpl.h48
-rw-r--r--kernel/trace/trace_seq.c23
-rw-r--r--kernel/trace/trace_synth.h1
-rw-r--r--kernel/trace/trace_uprobe.c13
-rw-r--r--kernel/tracepoint.c4
-rw-r--r--kernel/umh.c41
-rw-r--r--kernel/user_namespace.c2
59 files changed, 1256 insertions, 676 deletions
diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index 93d0b87f3283..addeed3df15d 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
@@ -1295,15 +1295,11 @@ out:
static void audit_log_cap(struct audit_buffer *ab, char *prefix,
kernel_cap_t *cap)
{
- int i;
-
if (cap_isclear(*cap)) {
audit_log_format(ab, " %s=0", prefix);
return;
}
- audit_log_format(ab, " %s=", prefix);
- CAP_FOR_EACH_U32(i)
- audit_log_format(ab, "%08x", cap->cap[CAP_LAST_U32 - i]);
+ audit_log_format(ab, " %s=%016llx", prefix, cap->val);
}
static void audit_log_fcaps(struct audit_buffer *ab, struct audit_names *name)
diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index 933869983e2a..b297e9f60ca1 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -34,6 +34,7 @@
#include <linux/log2.h>
#include <linux/bpf_verifier.h>
#include <linux/nodemask.h>
+#include <linux/nospec.h>
#include <linux/bpf_mem_alloc.h>
#include <linux/memcontrol.h>
diff --git a/kernel/bpf/ringbuf.c b/kernel/bpf/ringbuf.c
index 80f4b4d88aaf..8732e0aadf36 100644
--- a/kernel/bpf/ringbuf.c
+++ b/kernel/bpf/ringbuf.c
@@ -269,7 +269,7 @@ static int ringbuf_map_mmap_kern(struct bpf_map *map, struct vm_area_struct *vma
if (vma->vm_pgoff != 0 || vma->vm_end - vma->vm_start != PAGE_SIZE)
return -EPERM;
} else {
- vma->vm_flags &= ~VM_MAYWRITE;
+ vm_flags_clear(vma, VM_MAYWRITE);
}
/* remap_vmalloc_range() checks size and offset constraints */
return remap_vmalloc_range(vma, rb_map->rb,
@@ -290,7 +290,7 @@ static int ringbuf_map_mmap_user(struct bpf_map *map, struct vm_area_struct *vma
*/
return -EPERM;
} else {
- vma->vm_flags &= ~VM_MAYWRITE;
+ vm_flags_clear(vma, VM_MAYWRITE);
}
/* remap_vmalloc_range() checks size and offset constraints */
return remap_vmalloc_range(vma, rb_map->rb, vma->vm_pgoff + RINGBUF_PGOFF);
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index e3fcdc9836a6..adc83cb82f37 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -895,10 +895,10 @@ static int bpf_map_mmap(struct file *filp, struct vm_area_struct *vma)
/* set default open/close callbacks */
vma->vm_ops = &bpf_map_default_vmops;
vma->vm_private_data = map;
- vma->vm_flags &= ~VM_MAYEXEC;
+ vm_flags_clear(vma, VM_MAYEXEC);
if (!(vma->vm_flags & VM_WRITE))
/* disallow re-mapping with PROT_WRITE */
- vma->vm_flags &= ~VM_MAYWRITE;
+ vm_flags_clear(vma, VM_MAYWRITE);
err = map->ops->map_mmap(map, vma);
if (err)
diff --git a/kernel/capability.c b/kernel/capability.c
index 339a44dfe2f4..3e058f41df32 100644
--- a/kernel/capability.c
+++ b/kernel/capability.c
@@ -20,13 +20,6 @@
#include <linux/user_namespace.h>
#include <linux/uaccess.h>
-/*
- * Leveraged for setting/resetting capabilities
- */
-
-const kernel_cap_t __cap_empty_set = CAP_EMPTY_SET;
-EXPORT_SYMBOL(__cap_empty_set);
-
int file_caps_enabled = 1;
static int __init file_caps_disable(char *str)
@@ -151,6 +144,7 @@ SYSCALL_DEFINE2(capget, cap_user_header_t, header, cap_user_data_t, dataptr)
pid_t pid;
unsigned tocopy;
kernel_cap_t pE, pI, pP;
+ struct __user_cap_data_struct kdata[2];
ret = cap_validate_magic(header, &tocopy);
if ((dataptr == NULL) || (ret != 0))
@@ -163,42 +157,46 @@ SYSCALL_DEFINE2(capget, cap_user_header_t, header, cap_user_data_t, dataptr)
return -EINVAL;
ret = cap_get_target_pid(pid, &pE, &pI, &pP);
- if (!ret) {
- struct __user_cap_data_struct kdata[_KERNEL_CAPABILITY_U32S];
- unsigned i;
-
- for (i = 0; i < tocopy; i++) {
- kdata[i].effective = pE.cap[i];
- kdata[i].permitted = pP.cap[i];
- kdata[i].inheritable = pI.cap[i];
- }
-
- /*
- * Note, in the case, tocopy < _KERNEL_CAPABILITY_U32S,
- * we silently drop the upper capabilities here. This
- * has the effect of making older libcap
- * implementations implicitly drop upper capability
- * bits when they perform a: capget/modify/capset
- * sequence.
- *
- * This behavior is considered fail-safe
- * behavior. Upgrading the application to a newer
- * version of libcap will enable access to the newer
- * capabilities.
- *
- * An alternative would be to return an error here
- * (-ERANGE), but that causes legacy applications to
- * unexpectedly fail; the capget/modify/capset aborts
- * before modification is attempted and the application
- * fails.
- */
- if (copy_to_user(dataptr, kdata, tocopy
- * sizeof(struct __user_cap_data_struct))) {
- return -EFAULT;
- }
- }
+ if (ret)
+ return ret;
- return ret;
+ /*
+ * Annoying legacy format with 64-bit capabilities exposed
+ * as two sets of 32-bit fields, so we need to split the
+ * capability values up.
+ */
+ kdata[0].effective = pE.val; kdata[1].effective = pE.val >> 32;
+ kdata[0].permitted = pP.val; kdata[1].permitted = pP.val >> 32;
+ kdata[0].inheritable = pI.val; kdata[1].inheritable = pI.val >> 32;
+
+ /*
+ * Note, in the case, tocopy < _KERNEL_CAPABILITY_U32S,
+ * we silently drop the upper capabilities here. This
+ * has the effect of making older libcap
+ * implementations implicitly drop upper capability
+ * bits when they perform a: capget/modify/capset
+ * sequence.
+ *
+ * This behavior is considered fail-safe
+ * behavior. Upgrading the application to a newer
+ * version of libcap will enable access to the newer
+ * capabilities.
+ *
+ * An alternative would be to return an error here
+ * (-ERANGE), but that causes legacy applications to
+ * unexpectedly fail; the capget/modify/capset aborts
+ * before modification is attempted and the application
+ * fails.
+ */
+ if (copy_to_user(dataptr, kdata, tocopy * sizeof(kdata[0])))
+ return -EFAULT;
+
+ return 0;
+}
+
+static kernel_cap_t mk_kernel_cap(u32 low, u32 high)
+{
+ return (kernel_cap_t) { (low | ((u64)high << 32)) & CAP_VALID_MASK };
}
/**
@@ -221,8 +219,8 @@ SYSCALL_DEFINE2(capget, cap_user_header_t, header, cap_user_data_t, dataptr)
*/
SYSCALL_DEFINE2(capset, cap_user_header_t, header, const cap_user_data_t, data)
{
- struct __user_cap_data_struct kdata[_KERNEL_CAPABILITY_U32S];
- unsigned i, tocopy, copybytes;
+ struct __user_cap_data_struct kdata[2] = { { 0, }, };
+ unsigned tocopy, copybytes;
kernel_cap_t inheritable, permitted, effective;
struct cred *new;
int ret;
@@ -246,21 +244,9 @@ SYSCALL_DEFINE2(capset, cap_user_header_t, header, const cap_user_data_t, data)
if (copy_from_user(&kdata, data, copybytes))
return -EFAULT;
- for (i = 0; i < tocopy; i++) {
- effective.cap[i] = kdata[i].effective;
- permitted.cap[i] = kdata[i].permitted;
- inheritable.cap[i] = kdata[i].inheritable;
- }
- while (i < _KERNEL_CAPABILITY_U32S) {
- effective.cap[i] = 0;
- permitted.cap[i] = 0;
- inheritable.cap[i] = 0;
- i++;
- }
-
- effective.cap[CAP_LAST_U32] &= CAP_LAST_U32_VALID_MASK;
- permitted.cap[CAP_LAST_U32] &= CAP_LAST_U32_VALID_MASK;
- inheritable.cap[CAP_LAST_U32] &= CAP_LAST_U32_VALID_MASK;
+ effective = mk_kernel_cap(kdata[0].effective, kdata[1].effective);
+ permitted = mk_kernel_cap(kdata[0].permitted, kdata[1].permitted);
+ inheritable = mk_kernel_cap(kdata[0].inheritable, kdata[1].inheritable);
new = prepare_creds();
if (!new)
diff --git a/kernel/crash_core.c b/kernel/crash_core.c
index 87ef6096823f..755f5f08ab38 100644
--- a/kernel/crash_core.c
+++ b/kernel/crash_core.c
@@ -455,8 +455,8 @@ static int __init crash_save_vmcoreinfo_init(void)
VMCOREINFO_OFFSET(page, lru);
VMCOREINFO_OFFSET(page, _mapcount);
VMCOREINFO_OFFSET(page, private);
- VMCOREINFO_OFFSET(page, compound_dtor);
- VMCOREINFO_OFFSET(page, compound_order);
+ VMCOREINFO_OFFSET(folio, _folio_dtor);
+ VMCOREINFO_OFFSET(folio, _folio_order);
VMCOREINFO_OFFSET(page, compound_head);
VMCOREINFO_OFFSET(pglist_data, node_zones);
VMCOREINFO_OFFSET(pglist_data, nr_zones);
diff --git a/kernel/dma/swiotlb.c b/kernel/dma/swiotlb.c
index a34c38bbe28f..03e3251cd9d2 100644
--- a/kernel/dma/swiotlb.c
+++ b/kernel/dma/swiotlb.c
@@ -156,14 +156,6 @@ setup_io_tlb_npages(char *str)
}
early_param("swiotlb", setup_io_tlb_npages);
-unsigned int swiotlb_max_segment(void)
-{
- if (!io_tlb_default_mem.nslabs)
- return 0;
- return rounddown(io_tlb_default_mem.nslabs << IO_TLB_SHIFT, PAGE_SIZE);
-}
-EXPORT_SYMBOL_GPL(swiotlb_max_segment);
-
unsigned long swiotlb_size_or_default(void)
{
return default_nslabs << IO_TLB_SHIFT;
@@ -300,7 +292,8 @@ static void swiotlb_init_io_tlb_mem(struct io_tlb_mem *mem, phys_addr_t start,
return;
}
-static void *swiotlb_memblock_alloc(unsigned long nslabs, unsigned int flags,
+static void __init *swiotlb_memblock_alloc(unsigned long nslabs,
+ unsigned int flags,
int (*remap)(void *tlb, unsigned long nslabs))
{
size_t bytes = PAGE_ALIGN(nslabs << IO_TLB_SHIFT);
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 7099c77bc53b..f79fd8b87f75 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -6568,7 +6568,7 @@ aux_unlock:
* Since pinned accounting is per vm we cannot allow fork() to copy our
* vma.
*/
- vma->vm_flags |= VM_DONTCOPY | VM_DONTEXPAND | VM_DONTDUMP;
+ vm_flags_set(vma, VM_DONTCOPY | VM_DONTEXPAND | VM_DONTDUMP);
vma->vm_ops = &perf_mmap_vmops;
if (event->pmu->event_mapped)
@@ -9418,6 +9418,7 @@ void perf_report_aux_output_id(struct perf_event *event, u64 hw_id)
perf_output_end(&handle);
}
+EXPORT_SYMBOL_GPL(perf_report_aux_output_id);
static int
__perf_event_account_interrupt(struct perf_event *event, int throttle)
diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c
index d9e357b7e17c..59887c69d54c 100644
--- a/kernel/events/uprobes.c
+++ b/kernel/events/uprobes.c
@@ -22,7 +22,6 @@
#include <linux/swap.h> /* folio_free_swap */
#include <linux/ptrace.h> /* user_enable_single_step */
#include <linux/kdebug.h> /* notifier mechanism */
-#include "../../mm/internal.h" /* munlock_vma_page */
#include <linux/percpu-rwsem.h>
#include <linux/task_work.h>
#include <linux/shmem_fs.h>
@@ -161,7 +160,7 @@ static int __replace_page(struct vm_area_struct *vma, unsigned long addr,
int err;
struct mmu_notifier_range range;
- mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, mm, addr,
+ mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, mm, addr,
addr + PAGE_SIZE);
if (new_page) {
@@ -1352,7 +1351,7 @@ static int delayed_ref_ctr_inc(struct vm_area_struct *vma)
}
/*
- * Called from mmap_region/vma_adjust with mm->mmap_lock acquired.
+ * Called from mmap_region/vma_merge with mm->mmap_lock acquired.
*
* Currently we ignore all errors and always return 0, the callers
* can't handle the failure anyway.
diff --git a/kernel/fail_function.c b/kernel/fail_function.c
index a7ccd2930c5f..d971a0189319 100644
--- a/kernel/fail_function.c
+++ b/kernel/fail_function.c
@@ -163,10 +163,7 @@ static void fei_debugfs_add_attr(struct fei_attr *attr)
static void fei_debugfs_remove_attr(struct fei_attr *attr)
{
- struct dentry *dir;
-
- dir = debugfs_lookup(attr->kp.symbol_name, fei_debugfs_dir);
- debugfs_remove_recursive(dir);
+ debugfs_lookup_and_remove(attr->kp.symbol_name, fei_debugfs_dir);
}
static int fei_kprobe_handler(struct kprobe *kp, struct pt_regs *regs)
diff --git a/kernel/fork.c b/kernel/fork.c
index 038b898dad52..f68954d05e89 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -472,7 +472,7 @@ struct vm_area_struct *vm_area_dup(struct vm_area_struct *orig)
* orig->shared.rb may be modified concurrently, but the clone
* will be reinitialized.
*/
- *new = data_race(*orig);
+ data_race(memcpy(new, orig, sizeof(*new)));
INIT_LIST_HEAD(&new->anon_vma_chain);
dup_anon_vma_name(orig, new);
}
@@ -585,8 +585,8 @@ static __latent_entropy int dup_mmap(struct mm_struct *mm,
int retval;
unsigned long charge = 0;
LIST_HEAD(uf);
- MA_STATE(old_mas, &oldmm->mm_mt, 0, 0);
- MA_STATE(mas, &mm->mm_mt, 0, 0);
+ VMA_ITERATOR(old_vmi, oldmm, 0);
+ VMA_ITERATOR(vmi, mm, 0);
uprobe_start_dup_mmap();
if (mmap_write_lock_killable(oldmm)) {
@@ -613,11 +613,11 @@ static __latent_entropy int dup_mmap(struct mm_struct *mm,
goto out;
khugepaged_fork(mm, oldmm);
- retval = mas_expected_entries(&mas, oldmm->map_count);
+ retval = vma_iter_bulk_alloc(&vmi, oldmm->map_count);
if (retval)
goto out;
- mas_for_each(&old_mas, mpnt, ULONG_MAX) {
+ for_each_vma(old_vmi, mpnt) {
struct file *file;
if (mpnt->vm_flags & VM_DONTCOPY) {
@@ -659,7 +659,7 @@ static __latent_entropy int dup_mmap(struct mm_struct *mm,
tmp->anon_vma = NULL;
} else if (anon_vma_fork(tmp, mpnt))
goto fail_nomem_anon_vma_fork;
- tmp->vm_flags &= ~(VM_LOCKED | VM_LOCKONFAULT);
+ vm_flags_clear(tmp, VM_LOCKED_MASK);
file = tmp->vm_file;
if (file) {
struct address_space *mapping = file->f_mapping;
@@ -683,11 +683,8 @@ static __latent_entropy int dup_mmap(struct mm_struct *mm,
hugetlb_dup_vma_private(tmp);
/* Link the vma into the MT */
- mas.index = tmp->vm_start;
- mas.last = tmp->vm_end - 1;
- mas_store(&mas, tmp);
- if (mas_is_err(&mas))
- goto fail_nomem_mas_store;
+ if (vma_iter_bulk_store(&vmi, tmp))
+ goto fail_nomem_vmi_store;
mm->map_count++;
if (!(tmp->vm_flags & VM_WIPEONFORK))
@@ -702,7 +699,7 @@ static __latent_entropy int dup_mmap(struct mm_struct *mm,
/* a new mm has just been created */
retval = arch_dup_mmap(oldmm, mm);
loop_out:
- mas_destroy(&mas);
+ vma_iter_free(&vmi);
out:
mmap_write_unlock(mm);
flush_tlb_mm(oldmm);
@@ -712,7 +709,7 @@ fail_uprobe_end:
uprobe_end_dup_mmap();
return retval;
-fail_nomem_mas_store:
+fail_nomem_vmi_store:
unlink_anon_vmas(tmp);
fail_nomem_anon_vma_fork:
mpol_put(vma_policy(tmp));
diff --git a/kernel/gen_kheaders.sh b/kernel/gen_kheaders.sh
index 81b97f0f6556..1ef9a87511f5 100755
--- a/kernel/gen_kheaders.sh
+++ b/kernel/gen_kheaders.sh
@@ -7,7 +7,7 @@ set -e
sfile="$(readlink -f "$0")"
outdir="$(pwd)"
tarfile=$1
-cpio_dir=$outdir/$tarfile.tmp
+cpio_dir=$outdir/${tarfile%/*}/.tmp_cpio_dir
dir_list="
include/
diff --git a/kernel/hung_task.c b/kernel/hung_task.c
index c71889f3f3fc..322813366c6c 100644
--- a/kernel/hung_task.c
+++ b/kernel/hung_task.c
@@ -142,6 +142,8 @@ static void check_hung_task(struct task_struct *t, unsigned long timeout)
if (sysctl_hung_task_all_cpu_backtrace)
hung_task_show_all_bt = true;
+ if (!sysctl_hung_task_warnings)
+ pr_info("Future hung task reports are suppressed, see sysctl kernel.hung_task_warnings\n");
}
touch_nmi_watchdog();
diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c
index aa5b7eeeceb8..7d4fc6479062 100644
--- a/kernel/irq/irqdomain.c
+++ b/kernel/irq/irqdomain.c
@@ -470,31 +470,6 @@ struct irq_domain *irq_find_matching_fwspec(struct irq_fwspec *fwspec,
EXPORT_SYMBOL_GPL(irq_find_matching_fwspec);
/**
- * irq_domain_check_msi_remap - Check whether all MSI irq domains implement
- * IRQ remapping
- *
- * Return: false if any MSI irq domain does not support IRQ remapping,
- * true otherwise (including if there is no MSI irq domain)
- */
-bool irq_domain_check_msi_remap(void)
-{
- struct irq_domain *h;
- bool ret = true;
-
- mutex_lock(&irq_domain_mutex);
- list_for_each_entry(h, &irq_domain_list, link) {
- if (irq_domain_is_msi(h) &&
- !irq_domain_hierarchical_is_msi_remap(h)) {
- ret = false;
- break;
- }
- }
- mutex_unlock(&irq_domain_mutex);
- return ret;
-}
-EXPORT_SYMBOL_GPL(irq_domain_check_msi_remap);
-
-/**
* irq_set_default_host() - Set a "default" irq domain
* @domain: default domain pointer
*
@@ -1890,20 +1865,6 @@ static void irq_domain_check_hierarchy(struct irq_domain *domain)
if (domain->ops->alloc)
domain->flags |= IRQ_DOMAIN_FLAG_HIERARCHY;
}
-
-/**
- * irq_domain_hierarchical_is_msi_remap - Check if the domain or any
- * parent has MSI remapping support
- * @domain: domain pointer
- */
-bool irq_domain_hierarchical_is_msi_remap(struct irq_domain *domain)
-{
- for (; domain; domain = domain->parent) {
- if (irq_domain_is_msi_remap(domain))
- return true;
- }
- return false;
-}
#else /* CONFIG_IRQ_DOMAIN_HIERARCHY */
/**
* irq_domain_get_irq_data - Get irq_data associated with @virq and @domain
diff --git a/kernel/irq/msi.c b/kernel/irq/msi.c
index 783a3e6a0b10..d0f0389920c1 100644
--- a/kernel/irq/msi.c
+++ b/kernel/irq/msi.c
@@ -1627,3 +1627,30 @@ struct msi_domain_info *msi_get_domain_info(struct irq_domain *domain)
{
return (struct msi_domain_info *)domain->host_data;
}
+
+/**
+ * msi_device_has_isolated_msi - True if the device has isolated MSI
+ * @dev: The device to check
+ *
+ * Isolated MSI means that HW modeled by an irq_domain on the path from the
+ * initiating device to the CPU will validate that the MSI message specifies an
+ * interrupt number that the device is authorized to trigger. This must block
+ * devices from triggering interrupts they are not authorized to trigger.
+ * Currently authorization means the MSI vector is one assigned to the device.
+ *
+ * This is interesting for securing VFIO use cases where a rouge MSI (eg created
+ * by abusing a normal PCI MemWr DMA) must not allow the VFIO userspace to
+ * impact outside its security domain, eg userspace triggering interrupts on
+ * kernel drivers, a VM triggering interrupts on the hypervisor, or a VM
+ * triggering interrupts on another VM.
+ */
+bool msi_device_has_isolated_msi(struct device *dev)
+{
+ struct irq_domain *domain = dev_get_msi_domain(dev);
+
+ for (; domain; domain = domain->parent)
+ if (domain->flags & IRQ_DOMAIN_FLAG_ISOLATED_MSI)
+ return true;
+ return arch_is_isolated_msi();
+}
+EXPORT_SYMBOL_GPL(msi_device_has_isolated_msi);
diff --git a/kernel/kcov.c b/kernel/kcov.c
index e5cd09fd8a05..84c717337df0 100644
--- a/kernel/kcov.c
+++ b/kernel/kcov.c
@@ -489,7 +489,7 @@ static int kcov_mmap(struct file *filep, struct vm_area_struct *vma)
goto exit;
}
spin_unlock_irqrestore(&kcov->lock, flags);
- vma->vm_flags |= VM_DONTEXPAND;
+ vm_flags_set(vma, VM_DONTEXPAND);
for (off = 0; off < size; off += PAGE_SIZE) {
page = vmalloc_to_page(kcov->area + off);
res = vm_insert_page(vma, vma->vm_start + off, page);
diff --git a/kernel/kexec.c b/kernel/kexec.c
index cb8e6e6f983c..92d301f98776 100644
--- a/kernel/kexec.c
+++ b/kernel/kexec.c
@@ -190,10 +190,12 @@ out_unlock:
static inline int kexec_load_check(unsigned long nr_segments,
unsigned long flags)
{
+ int image_type = (flags & KEXEC_ON_CRASH) ?
+ KEXEC_TYPE_CRASH : KEXEC_TYPE_DEFAULT;
int result;
/* We only trust the superuser with rebooting the system. */
- if (!capable(CAP_SYS_BOOT) || kexec_load_disabled)
+ if (!kexec_load_permitted(image_type))
return -EPERM;
/* Permit LSMs and IMA to fail the kexec */
diff --git a/kernel/kexec_core.c b/kernel/kexec_core.c
index b1cf259854ca..3d578c6fefee 100644
--- a/kernel/kexec_core.c
+++ b/kernel/kexec_core.c
@@ -921,10 +921,64 @@ int kimage_load_segment(struct kimage *image,
return result;
}
+struct kexec_load_limit {
+ /* Mutex protects the limit count. */
+ struct mutex mutex;
+ int limit;
+};
+
+static struct kexec_load_limit load_limit_reboot = {
+ .mutex = __MUTEX_INITIALIZER(load_limit_reboot.mutex),
+ .limit = -1,
+};
+
+static struct kexec_load_limit load_limit_panic = {
+ .mutex = __MUTEX_INITIALIZER(load_limit_panic.mutex),
+ .limit = -1,
+};
+
struct kimage *kexec_image;
struct kimage *kexec_crash_image;
-int kexec_load_disabled;
+static int kexec_load_disabled;
+
#ifdef CONFIG_SYSCTL
+static int kexec_limit_handler(struct ctl_table *table, int write,
+ void *buffer, size_t *lenp, loff_t *ppos)
+{
+ struct kexec_load_limit *limit = table->data;
+ int val;
+ struct ctl_table tmp = {
+ .data = &val,
+ .maxlen = sizeof(val),
+ .mode = table->mode,
+ };
+ int ret;
+
+ if (write) {
+ ret = proc_dointvec(&tmp, write, buffer, lenp, ppos);
+ if (ret)
+ return ret;
+
+ if (val < 0)
+ return -EINVAL;
+
+ mutex_lock(&limit->mutex);
+ if (limit->limit != -1 && val >= limit->limit)
+ ret = -EINVAL;
+ else
+ limit->limit = val;
+ mutex_unlock(&limit->mutex);
+
+ return ret;
+ }
+
+ mutex_lock(&limit->mutex);
+ val = limit->limit;
+ mutex_unlock(&limit->mutex);
+
+ return proc_dointvec(&tmp, write, buffer, lenp, ppos);
+}
+
static struct ctl_table kexec_core_sysctls[] = {
{
.procname = "kexec_load_disabled",
@@ -936,6 +990,18 @@ static struct ctl_table kexec_core_sysctls[] = {
.extra1 = SYSCTL_ONE,
.extra2 = SYSCTL_ONE,
},
+ {
+ .procname = "kexec_load_limit_panic",
+ .data = &load_limit_panic,
+ .mode = 0644,
+ .proc_handler = kexec_limit_handler,
+ },
+ {
+ .procname = "kexec_load_limit_reboot",
+ .data = &load_limit_reboot,
+ .mode = 0644,
+ .proc_handler = kexec_limit_handler,
+ },
{ }
};
@@ -947,6 +1013,32 @@ static int __init kexec_core_sysctl_init(void)
late_initcall(kexec_core_sysctl_init);
#endif
+bool kexec_load_permitted(int kexec_image_type)
+{
+ struct kexec_load_limit *limit;
+
+ /*
+ * Only the superuser can use the kexec syscall and if it has not
+ * been disabled.
+ */
+ if (!capable(CAP_SYS_BOOT) || kexec_load_disabled)
+ return false;
+
+ /* Check limit counter and decrease it.*/
+ limit = (kexec_image_type == KEXEC_TYPE_CRASH) ?
+ &load_limit_panic : &load_limit_reboot;
+ mutex_lock(&limit->mutex);
+ if (!limit->limit) {
+ mutex_unlock(&limit->mutex);
+ return false;
+ }
+ if (limit->limit != -1)
+ limit->limit--;
+ mutex_unlock(&limit->mutex);
+
+ return true;
+}
+
/*
* No panic_cpu check version of crash_kexec(). This function is called
* only when panic_cpu holds the current CPU number; this is the only CPU
diff --git a/kernel/kexec_file.c b/kernel/kexec_file.c
index dd5983010b7b..f1a0e4e3fb5c 100644
--- a/kernel/kexec_file.c
+++ b/kernel/kexec_file.c
@@ -326,11 +326,13 @@ SYSCALL_DEFINE5(kexec_file_load, int, kernel_fd, int, initrd_fd,
unsigned long, cmdline_len, const char __user *, cmdline_ptr,
unsigned long, flags)
{
- int ret = 0, i;
+ int image_type = (flags & KEXEC_FILE_ON_CRASH) ?
+ KEXEC_TYPE_CRASH : KEXEC_TYPE_DEFAULT;
struct kimage **dest_image, *image;
+ int ret = 0, i;
/* We only trust the superuser with rebooting the system. */
- if (!capable(CAP_SYS_BOOT) || kexec_load_disabled)
+ if (!kexec_load_permitted(image_type))
return -EPERM;
/* Make sure we have a legal set of flags */
@@ -342,11 +344,12 @@ SYSCALL_DEFINE5(kexec_file_load, int, kernel_fd, int, initrd_fd,
if (!kexec_trylock())
return -EBUSY;
- dest_image = &kexec_image;
- if (flags & KEXEC_FILE_ON_CRASH) {
+ if (image_type == KEXEC_TYPE_CRASH) {
dest_image = &kexec_crash_image;
if (kexec_crash_image)
arch_kexec_unprotect_crashkres();
+ } else {
+ dest_image = &kexec_image;
}
if (flags & KEXEC_FILE_UNLOAD)
diff --git a/kernel/kprobes.c b/kernel/kprobes.c
index 1c18ecf9f98b..00e177de91cc 100644
--- a/kernel/kprobes.c
+++ b/kernel/kprobes.c
@@ -458,7 +458,7 @@ static inline int kprobe_optready(struct kprobe *p)
}
/* Return true if the kprobe is disarmed. Note: p must be on hash list */
-static inline bool kprobe_disarmed(struct kprobe *p)
+bool kprobe_disarmed(struct kprobe *p)
{
struct optimized_kprobe *op;
@@ -555,17 +555,15 @@ static void do_unoptimize_kprobes(void)
/* See comment in do_optimize_kprobes() */
lockdep_assert_cpus_held();
- /* Unoptimization must be done anytime */
- if (list_empty(&unoptimizing_list))
- return;
+ if (!list_empty(&unoptimizing_list))
+ arch_unoptimize_kprobes(&unoptimizing_list, &freeing_list);
- arch_unoptimize_kprobes(&unoptimizing_list, &freeing_list);
- /* Loop on 'freeing_list' for disarming */
+ /* Loop on 'freeing_list' for disarming and removing from kprobe hash list */
list_for_each_entry_safe(op, tmp, &freeing_list, list) {
/* Switching from detour code to origin */
op->kp.flags &= ~KPROBE_FLAG_OPTIMIZED;
- /* Disarm probes if marked disabled */
- if (kprobe_disabled(&op->kp))
+ /* Disarm probes if marked disabled and not gone */
+ if (kprobe_disabled(&op->kp) && !kprobe_gone(&op->kp))
arch_disarm_kprobe(&op->kp);
if (kprobe_unused(&op->kp)) {
/*
@@ -662,7 +660,7 @@ void wait_for_kprobe_optimizer(void)
mutex_unlock(&kprobe_mutex);
}
-static bool optprobe_queued_unopt(struct optimized_kprobe *op)
+bool optprobe_queued_unopt(struct optimized_kprobe *op)
{
struct optimized_kprobe *_op;
@@ -797,14 +795,13 @@ static void kill_optimized_kprobe(struct kprobe *p)
op->kp.flags &= ~KPROBE_FLAG_OPTIMIZED;
if (kprobe_unused(p)) {
- /* Enqueue if it is unused */
- list_add(&op->list, &freeing_list);
/*
- * Remove unused probes from the hash list. After waiting
- * for synchronization, this probe is reclaimed.
- * (reclaiming is done by do_free_cleaned_kprobes().)
+ * Unused kprobe is on unoptimizing or freeing list. We move it
+ * to freeing_list and let the kprobe_optimizer() remove it from
+ * the kprobe hash list and free it.
*/
- hlist_del_rcu(&op->kp.hlist);
+ if (optprobe_queued_unopt(op))
+ list_move(&op->list, &freeing_list);
}
/* Don't touch the code, because it is already freed. */
diff --git a/kernel/ksysfs.c b/kernel/ksysfs.c
index 2df00b789b90..0408aab80941 100644
--- a/kernel/ksysfs.c
+++ b/kernel/ksysfs.c
@@ -51,6 +51,14 @@ static ssize_t cpu_byteorder_show(struct kobject *kobj,
}
KERNEL_ATTR_RO(cpu_byteorder);
+/* address bits */
+static ssize_t address_bits_show(struct kobject *kobj,
+ struct kobj_attribute *attr, char *buf)
+{
+ return sysfs_emit(buf, "%zu\n", sizeof(void *) * 8 /* CHAR_BIT */);
+}
+KERNEL_ATTR_RO(address_bits);
+
#ifdef CONFIG_UEVENT_HELPER
/* uevent helper program, used during early boot */
static ssize_t uevent_helper_show(struct kobject *kobj,
@@ -233,6 +241,7 @@ static struct attribute * kernel_attrs[] = {
&fscaps_attr.attr,
&uevent_seqnum_attr.attr,
&cpu_byteorder_attr.attr,
+ &address_bits_attr.attr,
#ifdef CONFIG_UEVENT_HELPER
&uevent_helper_attr.attr,
#endif
diff --git a/kernel/kthread.c b/kernel/kthread.c
index f97fd01a2932..7e6751b29101 100644
--- a/kernel/kthread.c
+++ b/kernel/kthread.c
@@ -1382,6 +1382,10 @@ EXPORT_SYMBOL_GPL(kthread_flush_worker);
* Flush and destroy @worker. The simple flush is enough because the kthread
* worker API is used only in trivial scenarios. There are no multi-step state
* machines needed.
+ *
+ * Note that this function is not responsible for handling delayed work, so
+ * caller should be responsible for queuing or canceling all delayed work items
+ * before invoke this function.
*/
void kthread_destroy_worker(struct kthread_worker *worker)
{
@@ -1393,6 +1397,7 @@ void kthread_destroy_worker(struct kthread_worker *worker)
kthread_flush_worker(worker);
kthread_stop(task);
+ WARN_ON(!list_empty(&worker->delayed_work_list));
WARN_ON(!list_empty(&worker->work_list));
kfree(worker);
}
diff --git a/kernel/livepatch/core.c b/kernel/livepatch/core.c
index c973ed9e42f8..4bd2d5e10f20 100644
--- a/kernel/livepatch/core.c
+++ b/kernel/livepatch/core.c
@@ -260,6 +260,14 @@ static int klp_resolve_symbols(Elf_Shdr *sechdrs, const char *strtab,
return 0;
}
+void __weak clear_relocate_add(Elf_Shdr *sechdrs,
+ const char *strtab,
+ unsigned int symindex,
+ unsigned int relsec,
+ struct module *me)
+{
+}
+
/*
* At a high-level, there are two types of klp relocation sections: those which
* reference symbols which live in vmlinux; and those which reference symbols
@@ -283,10 +291,10 @@ static int klp_resolve_symbols(Elf_Shdr *sechdrs, const char *strtab,
* the to-be-patched module to be loaded and patched sometime *after* the
* klp module is loaded.
*/
-int klp_apply_section_relocs(struct module *pmod, Elf_Shdr *sechdrs,
- const char *shstrtab, const char *strtab,
- unsigned int symndx, unsigned int secndx,
- const char *objname)
+static int klp_write_section_relocs(struct module *pmod, Elf_Shdr *sechdrs,
+ const char *shstrtab, const char *strtab,
+ unsigned int symndx, unsigned int secndx,
+ const char *objname, bool apply)
{
int cnt, ret;
char sec_objname[MODULE_NAME_LEN];
@@ -308,11 +316,26 @@ int klp_apply_section_relocs(struct module *pmod, Elf_Shdr *sechdrs,
if (strcmp(objname ? objname : "vmlinux", sec_objname))
return 0;
- ret = klp_resolve_symbols(sechdrs, strtab, symndx, sec, sec_objname);
- if (ret)
- return ret;
+ if (apply) {
+ ret = klp_resolve_symbols(sechdrs, strtab, symndx,
+ sec, sec_objname);
+ if (ret)
+ return ret;
+
+ return apply_relocate_add(sechdrs, strtab, symndx, secndx, pmod);
+ }
+
+ clear_relocate_add(sechdrs, strtab, symndx, secndx, pmod);
+ return 0;
+}
- return apply_relocate_add(sechdrs, strtab, symndx, secndx, pmod);
+int klp_apply_section_relocs(struct module *pmod, Elf_Shdr *sechdrs,
+ const char *shstrtab, const char *strtab,
+ unsigned int symndx, unsigned int secndx,
+ const char *objname)
+{
+ return klp_write_section_relocs(pmod, sechdrs, shstrtab, strtab, symndx,
+ secndx, objname, true);
}
/*
@@ -761,8 +784,9 @@ static int klp_init_func(struct klp_object *obj, struct klp_func *func)
func->old_sympos ? func->old_sympos : 1);
}
-static int klp_apply_object_relocs(struct klp_patch *patch,
- struct klp_object *obj)
+static int klp_write_object_relocs(struct klp_patch *patch,
+ struct klp_object *obj,
+ bool apply)
{
int i, ret;
struct klp_modinfo *info = patch->mod->klp_info;
@@ -773,10 +797,10 @@ static int klp_apply_object_relocs(struct klp_patch *patch,
if (!(sec->sh_flags & SHF_RELA_LIVEPATCH))
continue;
- ret = klp_apply_section_relocs(patch->mod, info->sechdrs,
+ ret = klp_write_section_relocs(patch->mod, info->sechdrs,
info->secstrings,
patch->mod->core_kallsyms.strtab,
- info->symndx, i, obj->name);
+ info->symndx, i, obj->name, apply);
if (ret)
return ret;
}
@@ -784,6 +808,18 @@ static int klp_apply_object_relocs(struct klp_patch *patch,
return 0;
}
+static int klp_apply_object_relocs(struct klp_patch *patch,
+ struct klp_object *obj)
+{
+ return klp_write_object_relocs(patch, obj, true);
+}
+
+static void klp_clear_object_relocs(struct klp_patch *patch,
+ struct klp_object *obj)
+{
+ klp_write_object_relocs(patch, obj, false);
+}
+
/* parts of the initialization that is done only when the object is loaded */
static int klp_init_object_loaded(struct klp_patch *patch,
struct klp_object *obj)
@@ -1171,7 +1207,7 @@ static void klp_cleanup_module_patches_limited(struct module *mod,
klp_unpatch_object(obj);
klp_post_unpatch_callback(obj);
-
+ klp_clear_object_relocs(patch, obj);
klp_free_object_loaded(obj);
break;
}
diff --git a/kernel/module/main.c b/kernel/module/main.c
index 4ac3fe43e6c8..d3be89de706d 100644
--- a/kernel/module/main.c
+++ b/kernel/module/main.c
@@ -17,6 +17,7 @@
#include <linux/fs.h>
#include <linux/kernel.h>
#include <linux/kernel_read_file.h>
+#include <linux/kstrtox.h>
#include <linux/slab.h>
#include <linux/vmalloc.h>
#include <linux/elf.h>
@@ -2675,7 +2676,7 @@ static int unknown_module_param_cb(char *param, char *val, const char *modname,
int ret;
if (strcmp(param, "async_probe") == 0) {
- if (strtobool(val, &mod->async_probe_requested))
+ if (kstrtobool(val, &mod->async_probe_requested))
mod->async_probe_requested = true;
return 0;
}
diff --git a/kernel/params.c b/kernel/params.c
index 14d66070757b..6e34ca89ebae 100644
--- a/kernel/params.c
+++ b/kernel/params.c
@@ -4,6 +4,7 @@
*/
#include <linux/kernel.h>
+#include <linux/kstrtox.h>
#include <linux/string.h>
#include <linux/errno.h>
#include <linux/module.h>
@@ -310,7 +311,7 @@ int param_set_bool(const char *val, const struct kernel_param *kp)
if (!val) val = "1";
/* One of =[yYnN01] */
- return strtobool(val, kp->arg);
+ return kstrtobool(val, kp->arg);
}
EXPORT_SYMBOL(param_set_bool);
diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c
index fc21c5d5fd5d..46e0d5a3f91f 100644
--- a/kernel/pid_namespace.c
+++ b/kernel/pid_namespace.c
@@ -23,6 +23,7 @@
#include <linux/sched/task.h>
#include <linux/sched/signal.h>
#include <linux/idr.h>
+#include "pid_sysctl.h"
static DEFINE_MUTEX(pid_caches_mutex);
static struct kmem_cache *pid_ns_cachep;
@@ -110,6 +111,8 @@ static struct pid_namespace *create_pid_namespace(struct user_namespace *user_ns
ns->ucounts = ucounts;
ns->pid_allocated = PIDNS_ADDING;
+ initialize_memfd_noexec_scope(ns);
+
return ns;
out_free_idr:
@@ -472,6 +475,8 @@ static __init int pid_namespaces_init(void)
#ifdef CONFIG_CHECKPOINT_RESTORE
register_sysctl_paths(kern_path, pid_ns_ctl_table);
#endif
+
+ register_pid_ns_sysctl_table_vm();
return 0;
}
diff --git a/kernel/pid_sysctl.h b/kernel/pid_sysctl.h
new file mode 100644
index 000000000000..e22d072e1e24
--- /dev/null
+++ b/kernel/pid_sysctl.h
@@ -0,0 +1,60 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef LINUX_PID_SYSCTL_H
+#define LINUX_PID_SYSCTL_H
+
+#include <linux/pid_namespace.h>
+
+#if defined(CONFIG_SYSCTL) && defined(CONFIG_MEMFD_CREATE)
+static inline void initialize_memfd_noexec_scope(struct pid_namespace *ns)
+{
+ ns->memfd_noexec_scope =
+ task_active_pid_ns(current)->memfd_noexec_scope;
+}
+
+static int pid_mfd_noexec_dointvec_minmax(struct ctl_table *table,
+ int write, void *buf, size_t *lenp, loff_t *ppos)
+{
+ struct pid_namespace *ns = task_active_pid_ns(current);
+ struct ctl_table table_copy;
+
+ if (write && !ns_capable(ns->user_ns, CAP_SYS_ADMIN))
+ return -EPERM;
+
+ table_copy = *table;
+ if (ns != &init_pid_ns)
+ table_copy.data = &ns->memfd_noexec_scope;
+
+ /*
+ * set minimum to current value, the effect is only bigger
+ * value is accepted.
+ */
+ if (*(int *)table_copy.data > *(int *)table_copy.extra1)
+ table_copy.extra1 = table_copy.data;
+
+ return proc_dointvec_minmax(&table_copy, write, buf, lenp, ppos);
+}
+
+static struct ctl_table pid_ns_ctl_table_vm[] = {
+ {
+ .procname = "memfd_noexec",
+ .data = &init_pid_ns.memfd_noexec_scope,
+ .maxlen = sizeof(init_pid_ns.memfd_noexec_scope),
+ .mode = 0644,
+ .proc_handler = pid_mfd_noexec_dointvec_minmax,
+ .extra1 = SYSCTL_ZERO,
+ .extra2 = SYSCTL_TWO,
+ },
+ { }
+};
+static struct ctl_path vm_path[] = { { .procname = "vm", }, { } };
+static inline void register_pid_ns_sysctl_table_vm(void)
+{
+ register_sysctl_paths(vm_path, pid_ns_ctl_table_vm);
+}
+#else
+static inline void initialize_memfd_noexec_scope(struct pid_namespace *ns) {}
+static inline void set_memfd_noexec_scope(struct pid_namespace *ns) {}
+static inline void register_pid_ns_sysctl_table_vm(void) {}
+#endif
+
+#endif /* LINUX_PID_SYSCTL_H */
diff --git a/kernel/printk/index.c b/kernel/printk/index.c
index c85be186a783..a6b27526baaf 100644
--- a/kernel/printk/index.c
+++ b/kernel/printk/index.c
@@ -145,7 +145,7 @@ static void pi_create_file(struct module *mod)
#ifdef CONFIG_MODULES
static void pi_remove_file(struct module *mod)
{
- debugfs_remove(debugfs_lookup(pi_get_module_name(mod), dfs_index));
+ debugfs_lookup_and_remove(pi_get_module_name(mod), dfs_index);
}
static int pi_module_notify(struct notifier_block *nb, unsigned long op,
diff --git a/kernel/printk/internal.h b/kernel/printk/internal.h
index d947ca6c84f9..2a17704136f1 100644
--- a/kernel/printk/internal.h
+++ b/kernel/printk/internal.h
@@ -14,6 +14,21 @@ int devkmsg_sysctl_set_loglvl(struct ctl_table *table, int write,
#ifdef CONFIG_PRINTK
+#ifdef CONFIG_PRINTK_CALLER
+#define PRINTK_PREFIX_MAX 48
+#else
+#define PRINTK_PREFIX_MAX 32
+#endif
+
+/*
+ * the maximum size of a formatted record (i.e. with prefix added
+ * per line and dropped messages or in extended message format)
+ */
+#define PRINTK_MESSAGE_MAX 2048
+
+/* the maximum size allowed to be reserved for a record */
+#define PRINTKRB_RECORD_MAX 1024
+
/* Flags for a single printk record. */
enum printk_info_flags {
LOG_NEWLINE = 2, /* text ended with a newline */
@@ -48,6 +63,10 @@ u16 printk_parse_prefix(const char *text, int *level,
enum printk_info_flags *flags);
#else
+#define PRINTK_PREFIX_MAX 0
+#define PRINTK_MESSAGE_MAX 0
+#define PRINTKRB_RECORD_MAX 0
+
/*
* In !PRINTK builds we still export console_sem
* semaphore and some of console functions (console_unlock()/etc.), so
@@ -58,3 +77,29 @@ u16 printk_parse_prefix(const char *text, int *level,
static inline bool printk_percpu_data_ready(void) { return false; }
#endif /* CONFIG_PRINTK */
+
+/**
+ * struct printk_buffers - Buffers to read/format/output printk messages.
+ * @outbuf: After formatting, contains text to output.
+ * @scratchbuf: Used as temporary ringbuffer reading and string-print space.
+ */
+struct printk_buffers {
+ char outbuf[PRINTK_MESSAGE_MAX];
+ char scratchbuf[PRINTKRB_RECORD_MAX];
+};
+
+/**
+ * struct printk_message - Container for a prepared printk message.
+ * @pbufs: printk buffers used to prepare the message.
+ * @outbuf_len: The length of prepared text in @pbufs->outbuf to output. This
+ * does not count the terminator. A value of 0 means there is
+ * nothing to output and this record should be skipped.
+ * @seq: The sequence number of the record used for @pbufs->outbuf.
+ * @dropped: The number of dropped records from reading @seq.
+ */
+struct printk_message {
+ struct printk_buffers *pbufs;
+ unsigned int outbuf_len;
+ u64 seq;
+ unsigned long dropped;
+};
diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c
index 94f136b25f6a..fd0c9f913940 100644
--- a/kernel/printk/printk.c
+++ b/kernel/printk/printk.c
@@ -466,21 +466,6 @@ static struct latched_seq clear_seq = {
.val[1] = 0,
};
-#ifdef CONFIG_PRINTK_CALLER
-#define PREFIX_MAX 48
-#else
-#define PREFIX_MAX 32
-#endif
-
-/* the maximum size of a formatted record (i.e. with prefix added per line) */
-#define CONSOLE_LOG_MAX 1024
-
-/* the maximum size for a dropped text message */
-#define DROPPED_TEXT_MAX 64
-
-/* the maximum size allowed to be reserved for a record */
-#define LOG_LINE_MAX (CONSOLE_LOG_MAX - PREFIX_MAX)
-
#define LOG_LEVEL(v) ((v) & 0x07)
#define LOG_FACILITY(v) ((v) >> 3 & 0xff)
@@ -711,16 +696,15 @@ out:
return len;
}
+static bool printk_get_next_message(struct printk_message *pmsg, u64 seq,
+ bool is_extended, bool may_supress);
+
/* /dev/kmsg - userspace message inject/listen interface */
struct devkmsg_user {
atomic64_t seq;
struct ratelimit_state rs;
struct mutex lock;
- char buf[CONSOLE_EXT_LOG_MAX];
-
- struct printk_info info;
- char text_buf[CONSOLE_EXT_LOG_MAX];
- struct printk_record record;
+ struct printk_buffers pbufs;
};
static __printf(3, 4) __cold
@@ -746,7 +730,7 @@ static ssize_t devkmsg_write(struct kiocb *iocb, struct iov_iter *from)
size_t len = iov_iter_count(from);
ssize_t ret = len;
- if (!user || len > LOG_LINE_MAX)
+ if (!user || len > PRINTKRB_RECORD_MAX)
return -EINVAL;
/* Ignore when user logging is disabled. */
@@ -802,8 +786,10 @@ static ssize_t devkmsg_read(struct file *file, char __user *buf,
size_t count, loff_t *ppos)
{
struct devkmsg_user *user = file->private_data;
- struct printk_record *r = &user->record;
- size_t len;
+ char *outbuf = &user->pbufs.outbuf[0];
+ struct printk_message pmsg = {
+ .pbufs = &user->pbufs,
+ };
ssize_t ret;
if (!user)
@@ -813,7 +799,7 @@ static ssize_t devkmsg_read(struct file *file, char __user *buf,
if (ret)
return ret;
- if (!prb_read_valid(prb, atomic64_read(&user->seq), r)) {
+ if (!printk_get_next_message(&pmsg, atomic64_read(&user->seq), true, false)) {
if (file->f_flags & O_NONBLOCK) {
ret = -EAGAIN;
goto out;
@@ -830,36 +816,31 @@ static ssize_t devkmsg_read(struct file *file, char __user *buf,
* This pairs with __wake_up_klogd:A.
*/
ret = wait_event_interruptible(log_wait,
- prb_read_valid(prb,
- atomic64_read(&user->seq), r)); /* LMM(devkmsg_read:A) */
+ printk_get_next_message(&pmsg, atomic64_read(&user->seq), true,
+ false)); /* LMM(devkmsg_read:A) */
if (ret)
goto out;
}
- if (r->info->seq != atomic64_read(&user->seq)) {
+ if (pmsg.dropped) {
/* our last seen message is gone, return error and reset */
- atomic64_set(&user->seq, r->info->seq);
+ atomic64_set(&user->seq, pmsg.seq);
ret = -EPIPE;
goto out;
}
- len = info_print_ext_header(user->buf, sizeof(user->buf), r->info);
- len += msg_print_ext_body(user->buf + len, sizeof(user->buf) - len,
- &r->text_buf[0], r->info->text_len,
- &r->info->dev_info);
-
- atomic64_set(&user->seq, r->info->seq + 1);
+ atomic64_set(&user->seq, pmsg.seq + 1);
- if (len > count) {
+ if (pmsg.outbuf_len > count) {
ret = -EINVAL;
goto out;
}
- if (copy_to_user(buf, user->buf, len)) {
+ if (copy_to_user(buf, outbuf, pmsg.outbuf_len)) {
ret = -EFAULT;
goto out;
}
- ret = len;
+ ret = pmsg.outbuf_len;
out:
mutex_unlock(&user->lock);
return ret;
@@ -953,9 +934,6 @@ static int devkmsg_open(struct inode *inode, struct file *file)
mutex_init(&user->lock);
- prb_rec_init_rd(&user->record, &user->info,
- &user->text_buf[0], sizeof(user->text_buf));
-
atomic64_set(&user->seq, prb_first_valid_seq(prb));
file->private_data = user;
@@ -1150,7 +1128,7 @@ static unsigned int __init add_to_rb(struct printk_ringbuffer *rb,
return prb_record_text_space(&e);
}
-static char setup_text_buf[LOG_LINE_MAX] __initdata;
+static char setup_text_buf[PRINTKRB_RECORD_MAX] __initdata;
void __init setup_log_buf(int early)
{
@@ -1416,7 +1394,7 @@ static size_t record_print_text(struct printk_record *r, bool syslog,
size_t text_len = r->info->text_len;
size_t buf_size = r->text_buf_size;
char *text = r->text_buf;
- char prefix[PREFIX_MAX];
+ char prefix[PRINTK_PREFIX_MAX];
bool truncated = false;
size_t prefix_len;
size_t line_len;
@@ -1515,7 +1493,7 @@ static size_t get_record_print_text_size(struct printk_info *info,
unsigned int line_count,
bool syslog, bool time)
{
- char prefix[PREFIX_MAX];
+ char prefix[PRINTK_PREFIX_MAX];
size_t prefix_len;
prefix_len = info_print_prefix(info, syslog, time, prefix);
@@ -1581,11 +1559,11 @@ static int syslog_print(char __user *buf, int size)
int len = 0;
u64 seq;
- text = kmalloc(CONSOLE_LOG_MAX, GFP_KERNEL);
+ text = kmalloc(PRINTK_MESSAGE_MAX, GFP_KERNEL);
if (!text)
return -ENOMEM;
- prb_rec_init_rd(&r, &info, text, CONSOLE_LOG_MAX);
+ prb_rec_init_rd(&r, &info, text, PRINTK_MESSAGE_MAX);
mutex_lock(&syslog_lock);
@@ -1686,7 +1664,7 @@ static int syslog_print_all(char __user *buf, int size, bool clear)
u64 seq;
bool time;
- text = kmalloc(CONSOLE_LOG_MAX, GFP_KERNEL);
+ text = kmalloc(PRINTK_MESSAGE_MAX, GFP_KERNEL);
if (!text)
return -ENOMEM;
@@ -1698,7 +1676,7 @@ static int syslog_print_all(char __user *buf, int size, bool clear)
seq = find_first_fitting_seq(latched_seq_read_nolock(&clear_seq), -1,
size, true, time);
- prb_rec_init_rd(&r, &info, text, CONSOLE_LOG_MAX);
+ prb_rec_init_rd(&r, &info, text, PRINTK_MESSAGE_MAX);
len = 0;
prb_for_each_record(seq, prb, seq, &r) {
@@ -2013,27 +1991,6 @@ static int console_trylock_spinning(void)
}
/*
- * Call the specified console driver, asking it to write out the specified
- * text and length. If @dropped_text is non-NULL and any records have been
- * dropped, a dropped message will be written out first.
- */
-static void call_console_driver(struct console *con, const char *text, size_t len,
- char *dropped_text)
-{
- size_t dropped_len;
-
- if (con->dropped && dropped_text) {
- dropped_len = snprintf(dropped_text, DROPPED_TEXT_MAX,
- "** %lu printk messages dropped **\n",
- con->dropped);
- con->dropped = 0;
- con->write(con, dropped_text, dropped_len);
- }
-
- con->write(con, text, len);
-}
-
-/*
* Recursion is tracked separately on each CPU. If NMIs are supported, an
* additional NMI context per CPU is also separately tracked. Until per-CPU
* is available, a separate "early tracking" is performed.
@@ -2243,8 +2200,8 @@ int vprintk_store(int facility, int level,
reserve_size = vsnprintf(&prefix_buf[0], sizeof(prefix_buf), fmt, args2) + 1;
va_end(args2);
- if (reserve_size > LOG_LINE_MAX)
- reserve_size = LOG_LINE_MAX;
+ if (reserve_size > PRINTKRB_RECORD_MAX)
+ reserve_size = PRINTKRB_RECORD_MAX;
/* Extract log level or control flags. */
if (facility == 0)
@@ -2258,7 +2215,7 @@ int vprintk_store(int facility, int level,
if (flags & LOG_CONT) {
prb_rec_init_wr(&r, reserve_size);
- if (prb_reserve_in_last(&e, prb, &r, caller_id, LOG_LINE_MAX)) {
+ if (prb_reserve_in_last(&e, prb, &r, caller_id, PRINTKRB_RECORD_MAX)) {
text_len = printk_sprint(&r.text_buf[r.info->text_len], reserve_size,
facility, &flags, fmt, args);
r.info->text_len += text_len;
@@ -2389,8 +2346,6 @@ static bool __pr_flush(struct console *con, int timeout_ms, bool reset_on_progre
#else /* CONFIG_PRINTK */
-#define CONSOLE_LOG_MAX 0
-#define DROPPED_TEXT_MAX 0
#define printk_time false
#define prb_read_valid(rb, seq, r) false
@@ -2414,10 +2369,6 @@ static ssize_t msg_print_ext_body(char *buf, size_t size,
struct dev_printk_info *dev_info) { return 0; }
static void console_lock_spinning_enable(void) { }
static int console_lock_spinning_disable_and_check(int cookie) { return 0; }
-static void call_console_driver(struct console *con, const char *text, size_t len,
- char *dropped_text)
-{
-}
static bool suppress_message_printing(int level) { return false; }
static bool pr_flush(int timeout_ms, bool reset_on_progress) { return true; }
static bool __pr_flush(struct console *con, int timeout_ms, bool reset_on_progress) { return true; }
@@ -2744,16 +2695,136 @@ static void __console_unlock(void)
}
/*
- * Print one record for the given console. The record printed is whatever
- * record is the next available record for the given console.
+ * Prepend the message in @pmsg->pbufs->outbuf with a "dropped message". This
+ * is achieved by shifting the existing message over and inserting the dropped
+ * message.
+ *
+ * @pmsg is the printk message to prepend.
*
- * @text is a buffer of size CONSOLE_LOG_MAX.
+ * @dropped is the dropped count to report in the dropped message.
*
- * If extended messages should be printed, @ext_text is a buffer of size
- * CONSOLE_EXT_LOG_MAX. Otherwise @ext_text must be NULL.
+ * If the message text in @pmsg->pbufs->outbuf does not have enough space for
+ * the dropped message, the message text will be sufficiently truncated.
*
- * If dropped messages should be printed, @dropped_text is a buffer of size
- * DROPPED_TEXT_MAX. Otherwise @dropped_text must be NULL.
+ * If @pmsg->pbufs->outbuf is modified, @pmsg->outbuf_len is updated.
+ */
+#ifdef CONFIG_PRINTK
+static void console_prepend_dropped(struct printk_message *pmsg, unsigned long dropped)
+{
+ struct printk_buffers *pbufs = pmsg->pbufs;
+ const size_t scratchbuf_sz = sizeof(pbufs->scratchbuf);
+ const size_t outbuf_sz = sizeof(pbufs->outbuf);
+ char *scratchbuf = &pbufs->scratchbuf[0];
+ char *outbuf = &pbufs->outbuf[0];
+ size_t len;
+
+ len = scnprintf(scratchbuf, scratchbuf_sz,
+ "** %lu printk messages dropped **\n", dropped);
+
+ /*
+ * Make sure outbuf is sufficiently large before prepending.
+ * Keep at least the prefix when the message must be truncated.
+ * It is a rather theoretical problem when someone tries to
+ * use a minimalist buffer.
+ */
+ if (WARN_ON_ONCE(len + PRINTK_PREFIX_MAX >= outbuf_sz))
+ return;
+
+ if (pmsg->outbuf_len + len >= outbuf_sz) {
+ /* Truncate the message, but keep it terminated. */
+ pmsg->outbuf_len = outbuf_sz - (len + 1);
+ outbuf[pmsg->outbuf_len] = 0;
+ }
+
+ memmove(outbuf + len, outbuf, pmsg->outbuf_len + 1);
+ memcpy(outbuf, scratchbuf, len);
+ pmsg->outbuf_len += len;
+}
+#else
+#define console_prepend_dropped(pmsg, dropped)
+#endif /* CONFIG_PRINTK */
+
+/*
+ * Read and format the specified record (or a later record if the specified
+ * record is not available).
+ *
+ * @pmsg will contain the formatted result. @pmsg->pbufs must point to a
+ * struct printk_buffers.
+ *
+ * @seq is the record to read and format. If it is not available, the next
+ * valid record is read.
+ *
+ * @is_extended specifies if the message should be formatted for extended
+ * console output.
+ *
+ * @may_supress specifies if records may be skipped based on loglevel.
+ *
+ * Returns false if no record is available. Otherwise true and all fields
+ * of @pmsg are valid. (See the documentation of struct printk_message
+ * for information about the @pmsg fields.)
+ */
+static bool printk_get_next_message(struct printk_message *pmsg, u64 seq,
+ bool is_extended, bool may_suppress)
+{
+ static int panic_console_dropped;
+
+ struct printk_buffers *pbufs = pmsg->pbufs;
+ const size_t scratchbuf_sz = sizeof(pbufs->scratchbuf);
+ const size_t outbuf_sz = sizeof(pbufs->outbuf);
+ char *scratchbuf = &pbufs->scratchbuf[0];
+ char *outbuf = &pbufs->outbuf[0];
+ struct printk_info info;
+ struct printk_record r;
+ size_t len = 0;
+
+ /*
+ * Formatting extended messages requires a separate buffer, so use the
+ * scratch buffer to read in the ringbuffer text.
+ *
+ * Formatting normal messages is done in-place, so read the ringbuffer
+ * text directly into the output buffer.
+ */
+ if (is_extended)
+ prb_rec_init_rd(&r, &info, scratchbuf, scratchbuf_sz);
+ else
+ prb_rec_init_rd(&r, &info, outbuf, outbuf_sz);
+
+ if (!prb_read_valid(prb, seq, &r))
+ return false;
+
+ pmsg->seq = r.info->seq;
+ pmsg->dropped = r.info->seq - seq;
+
+ /*
+ * Check for dropped messages in panic here so that printk
+ * suppression can occur as early as possible if necessary.
+ */
+ if (pmsg->dropped &&
+ panic_in_progress() &&
+ panic_console_dropped++ > 10) {
+ suppress_panic_printk = 1;
+ pr_warn_once("Too many dropped messages. Suppress messages on non-panic CPUs to prevent livelock.\n");
+ }
+
+ /* Skip record that has level above the console loglevel. */
+ if (may_suppress && suppress_message_printing(r.info->level))
+ goto out;
+
+ if (is_extended) {
+ len = info_print_ext_header(outbuf, outbuf_sz, r.info);
+ len += msg_print_ext_body(outbuf + len, outbuf_sz - len,
+ &r.text_buf[0], r.info->text_len, &r.info->dev_info);
+ } else {
+ len = record_print_text(&r, console_msg_format & MSG_FORMAT_SYSLOG, printk_time);
+ }
+out:
+ pmsg->outbuf_len = len;
+ return true;
+}
+
+/*
+ * Print one record for the given console. The record printed is whatever
+ * record is the next available record for the given console.
*
* @handover will be set to true if a printk waiter has taken over the
* console_lock, in which case the caller is no longer holding both the
@@ -2766,46 +2837,33 @@ static void __console_unlock(void)
*
* Requires the console_lock and the SRCU read lock.
*/
-static bool console_emit_next_record(struct console *con, char *text, char *ext_text,
- char *dropped_text, bool *handover, int cookie)
+static bool console_emit_next_record(struct console *con, bool *handover, int cookie)
{
- static int panic_console_dropped;
- struct printk_info info;
- struct printk_record r;
- unsigned long flags;
- char *write_text;
- size_t len;
+ static struct printk_buffers pbufs;
- prb_rec_init_rd(&r, &info, text, CONSOLE_LOG_MAX);
+ bool is_extended = console_srcu_read_flags(con) & CON_EXTENDED;
+ char *outbuf = &pbufs.outbuf[0];
+ struct printk_message pmsg = {
+ .pbufs = &pbufs,
+ };
+ unsigned long flags;
*handover = false;
- if (!prb_read_valid(prb, con->seq, &r))
+ if (!printk_get_next_message(&pmsg, con->seq, is_extended, true))
return false;
- if (con->seq != r.info->seq) {
- con->dropped += r.info->seq - con->seq;
- con->seq = r.info->seq;
- if (panic_in_progress() && panic_console_dropped++ > 10) {
- suppress_panic_printk = 1;
- pr_warn_once("Too many dropped messages. Suppress messages on non-panic CPUs to prevent livelock.\n");
- }
- }
+ con->dropped += pmsg.dropped;
- /* Skip record that has level above the console loglevel. */
- if (suppress_message_printing(r.info->level)) {
- con->seq++;
+ /* Skip messages of formatted length 0. */
+ if (pmsg.outbuf_len == 0) {
+ con->seq = pmsg.seq + 1;
goto skip;
}
- if (ext_text) {
- write_text = ext_text;
- len = info_print_ext_header(ext_text, CONSOLE_EXT_LOG_MAX, r.info);
- len += msg_print_ext_body(ext_text + len, CONSOLE_EXT_LOG_MAX - len,
- &r.text_buf[0], r.info->text_len, &r.info->dev_info);
- } else {
- write_text = text;
- len = record_print_text(&r, console_msg_format & MSG_FORMAT_SYSLOG, printk_time);
+ if (con->dropped && !is_extended) {
+ console_prepend_dropped(&pmsg, con->dropped);
+ con->dropped = 0;
}
/*
@@ -2821,11 +2879,15 @@ static bool console_emit_next_record(struct console *con, char *text, char *ext_
printk_safe_enter_irqsave(flags);
console_lock_spinning_enable();
- stop_critical_timings(); /* don't trace print latency */
- call_console_driver(con, write_text, len, dropped_text);
+ /* Do not trace print latency. */
+ stop_critical_timings();
+
+ /* Write everything out to the hardware. */
+ con->write(con, outbuf, pmsg.outbuf_len);
+
start_critical_timings();
- con->seq++;
+ con->seq = pmsg.seq + 1;
*handover = console_lock_spinning_disable_and_check(cookie);
printk_safe_exit_irqrestore(flags);
@@ -2858,9 +2920,6 @@ skip:
*/
static bool console_flush_all(bool do_cond_resched, u64 *next_seq, bool *handover)
{
- static char dropped_text[DROPPED_TEXT_MAX];
- static char ext_text[CONSOLE_EXT_LOG_MAX];
- static char text[CONSOLE_LOG_MAX];
bool any_usable = false;
struct console *con;
bool any_progress;
@@ -2880,16 +2939,7 @@ static bool console_flush_all(bool do_cond_resched, u64 *next_seq, bool *handove
continue;
any_usable = true;
- if (console_srcu_read_flags(con) & CON_EXTENDED) {
- /* Extended consoles do not print "dropped messages". */
- progress = console_emit_next_record(con, &text[0],
- &ext_text[0], NULL,
- handover, cookie);
- } else {
- progress = console_emit_next_record(con, &text[0],
- NULL, &dropped_text[0],
- handover, cookie);
- }
+ progress = console_emit_next_record(con, handover, cookie);
/*
* If a handover has occurred, the SRCU read lock
diff --git a/kernel/relay.c b/kernel/relay.c
index ef12532168d9..9aa70ae53d24 100644
--- a/kernel/relay.c
+++ b/kernel/relay.c
@@ -91,7 +91,7 @@ static int relay_mmap_buf(struct rchan_buf *buf, struct vm_area_struct *vma)
return -EINVAL;
vma->vm_ops = &relay_file_mmap_ops;
- vma->vm_flags |= VM_DONTEXPAND;
+ vm_flags_set(vma, VM_DONTEXPAND);
vma->vm_private_data = buf;
return 0;
diff --git a/kernel/resource.c b/kernel/resource.c
index ddbbacb9fb50..b1763b2fd7ef 100644
--- a/kernel/resource.c
+++ b/kernel/resource.c
@@ -1343,20 +1343,6 @@ retry:
continue;
}
- /*
- * All memory regions added from memory-hotplug path have the
- * flag IORESOURCE_SYSTEM_RAM. If the resource does not have
- * this flag, we know that we are dealing with a resource coming
- * from HMM/devm. HMM/devm use another mechanism to add/release
- * a resource. This goes via devm_request_mem_region and
- * devm_release_mem_region.
- * HMM/devm take care to release their resources when they want,
- * so if we are dealing with them, let us just back off here.
- */
- if (!(res->flags & IORESOURCE_SYSRAM)) {
- break;
- }
-
if (!(res->flags & IORESOURCE_MEM))
break;
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index ff4dbbae3b10..7a1b1f855b96 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -2938,11 +2938,11 @@ static void task_numa_work(struct callback_head *work)
struct task_struct *p = current;
struct mm_struct *mm = p->mm;
u64 runtime = p->se.sum_exec_runtime;
- MA_STATE(mas, &mm->mm_mt, 0, 0);
struct vm_area_struct *vma;
unsigned long start, end;
unsigned long nr_pte_updates = 0;
long pages, virtpages;
+ struct vma_iterator vmi;
SCHED_WARN_ON(p != container_of(work, struct task_struct, numa_work));
@@ -2995,16 +2995,16 @@ static void task_numa_work(struct callback_head *work)
if (!mmap_read_trylock(mm))
return;
- mas_set(&mas, start);
- vma = mas_find(&mas, ULONG_MAX);
+ vma_iter_init(&vmi, mm, start);
+ vma = vma_next(&vmi);
if (!vma) {
reset_ptenuma_scan(p);
start = 0;
- mas_set(&mas, start);
- vma = mas_find(&mas, ULONG_MAX);
+ vma_iter_set(&vmi, start);
+ vma = vma_next(&vmi);
}
- for (; vma; vma = mas_find(&mas, ULONG_MAX)) {
+ do {
if (!vma_migratable(vma) || !vma_policy_mof(vma) ||
is_vm_hugetlb_page(vma) || (vma->vm_flags & VM_MIXEDMAP)) {
continue;
@@ -3051,7 +3051,7 @@ static void task_numa_work(struct callback_head *work)
cond_resched();
} while (end != vma->vm_end);
- }
+ } for_each_vma(vmi, vma);
out:
/*
diff --git a/kernel/sys.c b/kernel/sys.c
index 88b31f096fb2..495cd87d9bf4 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -2350,6 +2350,33 @@ static int prctl_set_vma(unsigned long opt, unsigned long start,
}
#endif /* CONFIG_ANON_VMA_NAME */
+static inline int prctl_set_mdwe(unsigned long bits, unsigned long arg3,
+ unsigned long arg4, unsigned long arg5)
+{
+ if (arg3 || arg4 || arg5)
+ return -EINVAL;
+
+ if (bits & ~(PR_MDWE_REFUSE_EXEC_GAIN))
+ return -EINVAL;
+
+ if (bits & PR_MDWE_REFUSE_EXEC_GAIN)
+ set_bit(MMF_HAS_MDWE, &current->mm->flags);
+ else if (test_bit(MMF_HAS_MDWE, &current->mm->flags))
+ return -EPERM; /* Cannot unset the flag */
+
+ return 0;
+}
+
+static inline int prctl_get_mdwe(unsigned long arg2, unsigned long arg3,
+ unsigned long arg4, unsigned long arg5)
+{
+ if (arg2 || arg3 || arg4 || arg5)
+ return -EINVAL;
+
+ return test_bit(MMF_HAS_MDWE, &current->mm->flags) ?
+ PR_MDWE_REFUSE_EXEC_GAIN : 0;
+}
+
SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3,
unsigned long, arg4, unsigned long, arg5)
{
@@ -2625,6 +2652,12 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3,
error = sched_core_share_pid(arg2, arg3, arg4, arg5);
break;
#endif
+ case PR_SET_MDWE:
+ error = prctl_set_mdwe(arg2, arg3, arg4, arg5);
+ break;
+ case PR_GET_MDWE:
+ error = prctl_get_mdwe(arg2, arg3, arg4, arg5);
+ break;
case PR_SET_VMA:
error = prctl_set_vma(arg2, arg3, arg4, arg5);
break;
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 137d4abe3eda..1c240d2c99bc 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -425,21 +425,6 @@ static void proc_put_char(void **buf, size_t *size, char c)
}
}
-static int do_proc_dobool_conv(bool *negp, unsigned long *lvalp,
- int *valp,
- int write, void *data)
-{
- if (write) {
- *(bool *)valp = *lvalp;
- } else {
- int val = *(bool *)valp;
-
- *lvalp = (unsigned long)val;
- *negp = false;
- }
- return 0;
-}
-
static int do_proc_dointvec_conv(bool *negp, unsigned long *lvalp,
int *valp,
int write, void *data)
@@ -710,16 +695,36 @@ int do_proc_douintvec(struct ctl_table *table, int write,
* @lenp: the size of the user buffer
* @ppos: file position
*
- * Reads/writes up to table->maxlen/sizeof(unsigned int) integer
- * values from/to the user buffer, treated as an ASCII string.
+ * Reads/writes one integer value from/to the user buffer,
+ * treated as an ASCII string.
+ *
+ * table->data must point to a bool variable and table->maxlen must
+ * be sizeof(bool).
*
* Returns 0 on success.
*/
int proc_dobool(struct ctl_table *table, int write, void *buffer,
size_t *lenp, loff_t *ppos)
{
- return do_proc_dointvec(table, write, buffer, lenp, ppos,
- do_proc_dobool_conv, NULL);
+ struct ctl_table tmp;
+ bool *data = table->data;
+ int res, val;
+
+ /* Do not support arrays yet. */
+ if (table->maxlen != sizeof(bool))
+ return -EINVAL;
+
+ tmp = *table;
+ tmp.maxlen = sizeof(val);
+ tmp.data = &val;
+
+ val = READ_ONCE(*data);
+ res = proc_dointvec(&tmp, write, buffer, lenp, ppos);
+ if (res)
+ return res;
+ if (write)
+ WRITE_ONCE(*data, val);
+ return 0;
}
/**
diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig
index caf32389faf3..a856d4a34c67 100644
--- a/kernel/trace/Kconfig
+++ b/kernel/trace/Kconfig
@@ -242,7 +242,7 @@ config DYNAMIC_FTRACE
enabled, and the functions not enabled will not affect
performance of the system.
- See the files in /sys/kernel/debug/tracing:
+ See the files in /sys/kernel/tracing:
available_filter_functions
set_ftrace_filter
set_ftrace_notrace
@@ -306,7 +306,7 @@ config STACK_TRACER
select KALLSYMS
help
This special tracer records the maximum stack footprint of the
- kernel and displays it in /sys/kernel/debug/tracing/stack_trace.
+ kernel and displays it in /sys/kernel/tracing/stack_trace.
This tracer works by hooking into every function call that the
kernel executes, and keeping a maximum stack depth value and
@@ -346,7 +346,7 @@ config IRQSOFF_TRACER
disabled by default and can be runtime (re-)started
via:
- echo 0 > /sys/kernel/debug/tracing/tracing_max_latency
+ echo 0 > /sys/kernel/tracing/tracing_max_latency
(Note that kernel size and overhead increase with this option
enabled. This option and the preempt-off timing option can be
@@ -370,7 +370,7 @@ config PREEMPT_TRACER
disabled by default and can be runtime (re-)started
via:
- echo 0 > /sys/kernel/debug/tracing/tracing_max_latency
+ echo 0 > /sys/kernel/tracing/tracing_max_latency
(Note that kernel size and overhead increase with this option
enabled. This option and the irqs-off timing option can be
@@ -522,7 +522,7 @@ config TRACER_SNAPSHOT
Allow tracing users to take snapshot of the current buffer using the
ftrace interface, e.g.:
- echo 1 > /sys/kernel/debug/tracing/snapshot
+ echo 1 > /sys/kernel/tracing/snapshot
cat snapshot
config TRACER_SNAPSHOT_PER_CPU_SWAP
@@ -534,7 +534,7 @@ config TRACER_SNAPSHOT_PER_CPU_SWAP
full swap (all buffers). If this is set, then the following is
allowed:
- echo 1 > /sys/kernel/debug/tracing/per_cpu/cpu2/snapshot
+ echo 1 > /sys/kernel/tracing/per_cpu/cpu2/snapshot
After which, only the tracing buffer for CPU 2 was swapped with
the main tracing buffer, and the other CPU buffers remain the same.
@@ -581,7 +581,7 @@ config PROFILE_ANNOTATED_BRANCHES
This tracer profiles all likely and unlikely macros
in the kernel. It will display the results in:
- /sys/kernel/debug/tracing/trace_stat/branch_annotated
+ /sys/kernel/tracing/trace_stat/branch_annotated
Note: this will add a significant overhead; only turn this
on if you need to profile the system's use of these macros.
@@ -594,7 +594,7 @@ config PROFILE_ALL_BRANCHES
taken in the kernel is recorded whether it hit or miss.
The results will be displayed in:
- /sys/kernel/debug/tracing/trace_stat/branch_all
+ /sys/kernel/tracing/trace_stat/branch_all
This option also enables the likely/unlikely profiler.
@@ -645,8 +645,8 @@ config BLK_DEV_IO_TRACE
Tracing also is possible using the ftrace interface, e.g.:
echo 1 > /sys/block/sda/sda1/trace/enable
- echo blk > /sys/kernel/debug/tracing/current_tracer
- cat /sys/kernel/debug/tracing/trace_pipe
+ echo blk > /sys/kernel/tracing/current_tracer
+ cat /sys/kernel/tracing/trace_pipe
If unsure, say N.
diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c
index 5743be559415..d5d94510afd3 100644
--- a/kernel/trace/blktrace.c
+++ b/kernel/trace/blktrace.c
@@ -729,14 +729,10 @@ EXPORT_SYMBOL_GPL(blk_trace_startstop);
**/
int blk_trace_ioctl(struct block_device *bdev, unsigned cmd, char __user *arg)
{
- struct request_queue *q;
+ struct request_queue *q = bdev_get_queue(bdev);
int ret, start = 0;
char b[BDEVNAME_SIZE];
- q = bdev_get_queue(bdev);
- if (!q)
- return -ENXIO;
-
mutex_lock(&q->debugfs_mutex);
switch (cmd) {
diff --git a/kernel/trace/kprobe_event_gen_test.c b/kernel/trace/kprobe_event_gen_test.c
index c736487fc0e4..4850fdfe27f1 100644
--- a/kernel/trace/kprobe_event_gen_test.c
+++ b/kernel/trace/kprobe_event_gen_test.c
@@ -21,7 +21,7 @@
* Then:
*
* # insmod kernel/trace/kprobe_event_gen_test.ko
- * # cat /sys/kernel/debug/tracing/trace
+ * # cat /sys/kernel/tracing/trace
*
* You should see many instances of the "gen_kprobe_test" and
* "gen_kretprobe_test" events in the trace buffer.
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index c366a0a9ddba..af50d931b020 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -1581,19 +1581,6 @@ static int rb_check_bpage(struct ring_buffer_per_cpu *cpu_buffer,
}
/**
- * rb_check_list - make sure a pointer to a list has the last bits zero
- */
-static int rb_check_list(struct ring_buffer_per_cpu *cpu_buffer,
- struct list_head *list)
-{
- if (RB_WARN_ON(cpu_buffer, rb_list_head(list->prev) != list->prev))
- return 1;
- if (RB_WARN_ON(cpu_buffer, rb_list_head(list->next) != list->next))
- return 1;
- return 0;
-}
-
-/**
* rb_check_pages - integrity check of buffer pages
* @cpu_buffer: CPU buffer with pages to test
*
@@ -1602,36 +1589,27 @@ static int rb_check_list(struct ring_buffer_per_cpu *cpu_buffer,
*/
static int rb_check_pages(struct ring_buffer_per_cpu *cpu_buffer)
{
- struct list_head *head = cpu_buffer->pages;
- struct buffer_page *bpage, *tmp;
-
- /* Reset the head page if it exists */
- if (cpu_buffer->head_page)
- rb_set_head_page(cpu_buffer);
-
- rb_head_page_deactivate(cpu_buffer);
+ struct list_head *head = rb_list_head(cpu_buffer->pages);
+ struct list_head *tmp;
- if (RB_WARN_ON(cpu_buffer, head->next->prev != head))
- return -1;
- if (RB_WARN_ON(cpu_buffer, head->prev->next != head))
+ if (RB_WARN_ON(cpu_buffer,
+ rb_list_head(rb_list_head(head->next)->prev) != head))
return -1;
- if (rb_check_list(cpu_buffer, head))
+ if (RB_WARN_ON(cpu_buffer,
+ rb_list_head(rb_list_head(head->prev)->next) != head))
return -1;
- list_for_each_entry_safe(bpage, tmp, head, list) {
+ for (tmp = rb_list_head(head->next); tmp != head; tmp = rb_list_head(tmp->next)) {
if (RB_WARN_ON(cpu_buffer,
- bpage->list.next->prev != &bpage->list))
+ rb_list_head(rb_list_head(tmp->next)->prev) != tmp))
return -1;
+
if (RB_WARN_ON(cpu_buffer,
- bpage->list.prev->next != &bpage->list))
- return -1;
- if (rb_check_list(cpu_buffer, &bpage->list))
+ rb_list_head(rb_list_head(tmp->prev)->next) != tmp))
return -1;
}
- rb_head_page_activate(cpu_buffer);
-
return 0;
}
@@ -2886,7 +2864,7 @@ rb_check_timestamp(struct ring_buffer_per_cpu *cpu_buffer,
sched_clock_stable() ? "" :
"If you just came from a suspend/resume,\n"
"please switch to the trace global clock:\n"
- " echo global > /sys/kernel/debug/tracing/trace_clock\n"
+ " echo global > /sys/kernel/tracing/trace_clock\n"
"or add trace_clock=global to the kernel command line\n");
}
@@ -5626,11 +5604,16 @@ EXPORT_SYMBOL_GPL(ring_buffer_alloc_read_page);
*/
void ring_buffer_free_read_page(struct trace_buffer *buffer, int cpu, void *data)
{
- struct ring_buffer_per_cpu *cpu_buffer = buffer->buffers[cpu];
+ struct ring_buffer_per_cpu *cpu_buffer;
struct buffer_data_page *bpage = data;
struct page *page = virt_to_page(bpage);
unsigned long flags;
+ if (!buffer || !buffer->buffers || !buffer->buffers[cpu])
+ return;
+
+ cpu_buffer = buffer->buffers[cpu];
+
/* If the page is still in use someplace else, we can't reuse it */
if (page_ref_count(page) > 1)
goto out;
diff --git a/kernel/trace/synth_event_gen_test.c b/kernel/trace/synth_event_gen_test.c
index 8d77526892f4..8dfe85499d4a 100644
--- a/kernel/trace/synth_event_gen_test.c
+++ b/kernel/trace/synth_event_gen_test.c
@@ -22,7 +22,7 @@
* Then:
*
* # insmod kernel/trace/synth_event_gen_test.ko
- * # cat /sys/kernel/debug/tracing/trace
+ * # cat /sys/kernel/tracing/trace
*
* You should see several events in the trace buffer -
* "create_synth_test", "empty_synth_test", and several instances of
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 54a163ae4815..45551c7b4c36 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -49,6 +49,8 @@
#include <linux/irq_work.h>
#include <linux/workqueue.h>
+#include <asm/setup.h> /* COMMAND_LINE_SIZE */
+
#include "trace.h"
#include "trace_output.h"
@@ -186,6 +188,12 @@ static char *default_bootup_tracer;
static bool allocate_snapshot;
static bool snapshot_at_boot;
+static char boot_instance_info[COMMAND_LINE_SIZE] __initdata;
+static int boot_instance_index;
+
+static char boot_snapshot_info[COMMAND_LINE_SIZE] __initdata;
+static int boot_snapshot_index;
+
static int __init set_cmdline_ftrace(char *str)
{
strlcpy(bootup_tracer_buf, str, MAX_TRACER_SIZE);
@@ -222,9 +230,22 @@ __setup("traceoff_on_warning", stop_trace_on_warning);
static int __init boot_alloc_snapshot(char *str)
{
- allocate_snapshot = true;
- /* We also need the main ring buffer expanded */
- ring_buffer_expanded = true;
+ char *slot = boot_snapshot_info + boot_snapshot_index;
+ int left = sizeof(boot_snapshot_info) - boot_snapshot_index;
+ int ret;
+
+ if (str[0] == '=') {
+ str++;
+ if (strlen(str) >= left)
+ return -1;
+
+ ret = snprintf(slot, left, "%s\t", str);
+ boot_snapshot_index += ret;
+ } else {
+ allocate_snapshot = true;
+ /* We also need the main ring buffer expanded */
+ ring_buffer_expanded = true;
+ }
return 1;
}
__setup("alloc_snapshot", boot_alloc_snapshot);
@@ -239,6 +260,23 @@ static int __init boot_snapshot(char *str)
__setup("ftrace_boot_snapshot", boot_snapshot);
+static int __init boot_instance(char *str)
+{
+ char *slot = boot_instance_info + boot_instance_index;
+ int left = sizeof(boot_instance_info) - boot_instance_index;
+ int ret;
+
+ if (strlen(str) >= left)
+ return -1;
+
+ ret = snprintf(slot, left, "%s\t", str);
+ boot_instance_index += ret;
+
+ return 1;
+}
+__setup("trace_instance=", boot_instance);
+
+
static char trace_boot_options_buf[MAX_TRACER_SIZE] __initdata;
static int __init set_trace_boot_options(char *str)
@@ -1001,13 +1039,8 @@ __buffer_unlock_commit(struct trace_buffer *buffer, struct ring_buffer_event *ev
ring_buffer_unlock_commit(buffer);
}
-/**
- * __trace_puts - write a constant string into the trace buffer.
- * @ip: The address of the caller
- * @str: The constant string to write
- * @size: The size of the string.
- */
-int __trace_puts(unsigned long ip, const char *str, int size)
+int __trace_array_puts(struct trace_array *tr, unsigned long ip,
+ const char *str, int size)
{
struct ring_buffer_event *event;
struct trace_buffer *buffer;
@@ -1015,7 +1048,7 @@ int __trace_puts(unsigned long ip, const char *str, int size)
unsigned int trace_ctx;
int alloc;
- if (!(global_trace.trace_flags & TRACE_ITER_PRINTK))
+ if (!(tr->trace_flags & TRACE_ITER_PRINTK))
return 0;
if (unlikely(tracing_selftest_running || tracing_disabled))
@@ -1024,7 +1057,7 @@ int __trace_puts(unsigned long ip, const char *str, int size)
alloc = sizeof(*entry) + size + 2; /* possible \n added */
trace_ctx = tracing_gen_ctx();
- buffer = global_trace.array_buffer.buffer;
+ buffer = tr->array_buffer.buffer;
ring_buffer_nest_start(buffer);
event = __trace_buffer_lock_reserve(buffer, TRACE_PRINT, alloc,
trace_ctx);
@@ -1046,11 +1079,23 @@ int __trace_puts(unsigned long ip, const char *str, int size)
entry->buf[size] = '\0';
__buffer_unlock_commit(buffer, event);
- ftrace_trace_stack(&global_trace, buffer, trace_ctx, 4, NULL);
+ ftrace_trace_stack(tr, buffer, trace_ctx, 4, NULL);
out:
ring_buffer_nest_end(buffer);
return size;
}
+EXPORT_SYMBOL_GPL(__trace_array_puts);
+
+/**
+ * __trace_puts - write a constant string into the trace buffer.
+ * @ip: The address of the caller
+ * @str: The constant string to write
+ * @size: The size of the string.
+ */
+int __trace_puts(unsigned long ip, const char *str, int size)
+{
+ return __trace_array_puts(&global_trace, ip, str, size);
+}
EXPORT_SYMBOL_GPL(__trace_puts);
/**
@@ -1142,7 +1187,7 @@ void tracing_snapshot_instance(struct trace_array *tr)
*
* Note, make sure to allocate the snapshot with either
* a tracing_snapshot_alloc(), or by doing it manually
- * with: echo 1 > /sys/kernel/debug/tracing/snapshot
+ * with: echo 1 > /sys/kernel/tracing/snapshot
*
* If the snapshot buffer is not allocated, it will stop tracing.
* Basically making a permanent snapshot.
@@ -5601,7 +5646,7 @@ static const char readme_msg[] =
#ifdef CONFIG_HIST_TRIGGERS
"\t s:[synthetic/]<event> <field> [<field>]\n"
#endif
- "\t e[:[<group>/][<event>]] <attached-group>.<attached-event> [<args>]\n"
+ "\t e[:[<group>/][<event>]] <attached-group>.<attached-event> [<args>] [if <filter>]\n"
"\t -:[<group>/][<event>]\n"
#ifdef CONFIG_KPROBE_EVENTS
"\t place: [<module>:]<symbol>[+<offset>]|<memaddr>\n"
@@ -5618,7 +5663,7 @@ static const char readme_msg[] =
"\t $stack<index>, $stack, $retval, $comm,\n"
#endif
"\t +|-[u]<offset>(<fetcharg>), \\imm-value, \\\"imm-string\"\n"
- "\t type: s8/16/32/64, u8/16/32/64, x8/16/32/64, string, symbol,\n"
+ "\t type: s8/16/32/64, u8/16/32/64, x8/16/32/64, char, string, symbol,\n"
"\t b<bit-width>@<bit-offset>/<container-size>, ustring,\n"
"\t symstr, <type>\\[<array-size>\\]\n"
#ifdef CONFIG_HIST_TRIGGERS
@@ -5760,7 +5805,7 @@ static const char readme_msg[] =
#ifdef CONFIG_SYNTH_EVENTS
" events/synthetic_events\t- Create/append/remove/show synthetic events\n"
"\t Write into this file to define/undefine new synthetic events.\n"
- "\t example: echo 'myevent u64 lat; char name[]' >> synthetic_events\n"
+ "\t example: echo 'myevent u64 lat; char name[]; long[] stack' >> synthetic_events\n"
#endif
#endif
;
@@ -9225,10 +9270,6 @@ static int allocate_trace_buffers(struct trace_array *tr, int size)
}
tr->allocated_snapshot = allocate_snapshot;
- /*
- * Only the top level trace array gets its snapshot allocated
- * from the kernel command line.
- */
allocate_snapshot = false;
#endif
@@ -10144,6 +10185,79 @@ out:
return ret;
}
+#ifdef CONFIG_TRACER_MAX_TRACE
+__init static bool tr_needs_alloc_snapshot(const char *name)
+{
+ char *test;
+ int len = strlen(name);
+ bool ret;
+
+ if (!boot_snapshot_index)
+ return false;
+
+ if (strncmp(name, boot_snapshot_info, len) == 0 &&
+ boot_snapshot_info[len] == '\t')
+ return true;
+
+ test = kmalloc(strlen(name) + 3, GFP_KERNEL);
+ if (!test)
+ return false;
+
+ sprintf(test, "\t%s\t", name);
+ ret = strstr(boot_snapshot_info, test) == NULL;
+ kfree(test);
+ return ret;
+}
+
+__init static void do_allocate_snapshot(const char *name)
+{
+ if (!tr_needs_alloc_snapshot(name))
+ return;
+
+ /*
+ * When allocate_snapshot is set, the next call to
+ * allocate_trace_buffers() (called by trace_array_get_by_name())
+ * will allocate the snapshot buffer. That will alse clear
+ * this flag.
+ */
+ allocate_snapshot = true;
+}
+#else
+static inline void do_allocate_snapshot(const char *name) { }
+#endif
+
+__init static void enable_instances(void)
+{
+ struct trace_array *tr;
+ char *curr_str;
+ char *str;
+ char *tok;
+
+ /* A tab is always appended */
+ boot_instance_info[boot_instance_index - 1] = '\0';
+ str = boot_instance_info;
+
+ while ((curr_str = strsep(&str, "\t"))) {
+
+ tok = strsep(&curr_str, ",");
+
+ if (IS_ENABLED(CONFIG_TRACER_MAX_TRACE))
+ do_allocate_snapshot(tok);
+
+ tr = trace_array_get_by_name(tok);
+ if (!tr) {
+ pr_warn("Failed to create instance buffer %s\n", curr_str);
+ continue;
+ }
+ /* Allow user space to delete it */
+ trace_array_put(tr);
+
+ while ((tok = strsep(&curr_str, ","))) {
+ early_enable_events(tr, tok, true);
+ }
+ }
+}
+
__init static int tracer_alloc_buffers(void)
{
int ring_buf_size;
@@ -10277,10 +10391,19 @@ out:
void __init ftrace_boot_snapshot(void)
{
+ struct trace_array *tr;
+
if (snapshot_at_boot) {
tracing_snapshot();
internal_trace_puts("** Boot snapshot taken **\n");
}
+
+ list_for_each_entry(tr, &ftrace_trace_arrays, list) {
+ if (tr == &global_trace)
+ continue;
+ trace_array_puts(tr, "** Boot snapshot taken **\n");
+ tracing_snapshot_instance(tr);
+ }
}
void __init early_trace_init(void)
@@ -10302,6 +10425,9 @@ void __init early_trace_init(void)
void __init trace_init(void)
{
trace_event_init();
+
+ if (boot_instance_index)
+ enable_instances();
}
__init static void clear_boot_tracer(void)
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index 085a31b978a5..616e1aa1c4da 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -25,7 +25,7 @@
#include "pid_list.h"
#ifdef CONFIG_FTRACE_SYSCALLS
-#include <asm/unistd.h> /* For NR_SYSCALLS */
+#include <asm/unistd.h> /* For NR_syscalls */
#include <asm/syscall.h> /* some archs define it here */
#endif
@@ -113,6 +113,10 @@ enum trace_type {
#define MEM_FAIL(condition, fmt, ...) \
DO_ONCE_LITE_IF(condition, pr_err, "ERROR: " fmt, ##__VA_ARGS__)
+#define HIST_STACKTRACE_DEPTH 16
+#define HIST_STACKTRACE_SIZE (HIST_STACKTRACE_DEPTH * sizeof(unsigned long))
+#define HIST_STACKTRACE_SKIP 5
+
/*
* syscalls are special, and need special handling, this is why
* they are not included in trace_entries.h
@@ -1331,6 +1335,8 @@ DECLARE_PER_CPU(int, trace_buffered_event_cnt);
void trace_buffered_event_disable(void);
void trace_buffered_event_enable(void);
+void early_enable_events(struct trace_array *tr, char *buf, bool disable_first);
+
static inline void
__trace_event_discard_commit(struct trace_buffer *buffer,
struct ring_buffer_event *event)
diff --git a/kernel/trace/trace_eprobe.c b/kernel/trace/trace_eprobe.c
index 352b65e2b910..67e854979d53 100644
--- a/kernel/trace/trace_eprobe.c
+++ b/kernel/trace/trace_eprobe.c
@@ -311,7 +311,7 @@ print_eprobe_event(struct trace_iterator *iter, int flags,
trace_seq_putc(s, ')');
- if (print_probe_args(s, tp->args, tp->nr_args,
+ if (trace_probe_print_args(s, tp->args, tp->nr_args,
(u8 *)&field[1], field) < 0)
goto out;
@@ -320,7 +320,8 @@ print_eprobe_event(struct trace_iterator *iter, int flags,
return trace_handle_return(s);
}
-static unsigned long get_event_field(struct fetch_insn *code, void *rec)
+static nokprobe_inline unsigned long
+get_event_field(struct fetch_insn *code, void *rec)
{
struct ftrace_event_field *field = code->data;
unsigned long val;
@@ -395,20 +396,12 @@ static int get_eprobe_size(struct trace_probe *tp, void *rec)
case FETCH_OP_TP_ARG:
val = get_event_field(code, rec);
break;
- case FETCH_OP_IMM:
- val = code->immediate;
- break;
- case FETCH_OP_COMM:
- val = (unsigned long)current->comm;
- break;
- case FETCH_OP_DATA:
- val = (unsigned long)code->data;
- break;
case FETCH_NOP_SYMBOL: /* Ignore a place holder */
code++;
goto retry;
default:
- continue;
+ if (process_common_fetch_insn(code, &val) < 0)
+ continue;
}
code++;
len = process_fetch_insn_bottom(code, val, NULL, NULL);
@@ -428,84 +421,26 @@ process_fetch_insn(struct fetch_insn *code, void *rec, void *dest,
void *base)
{
unsigned long val;
+ int ret;
retry:
switch (code->op) {
case FETCH_OP_TP_ARG:
val = get_event_field(code, rec);
break;
- case FETCH_OP_IMM:
- val = code->immediate;
- break;
- case FETCH_OP_COMM:
- val = (unsigned long)current->comm;
- break;
- case FETCH_OP_DATA:
- val = (unsigned long)code->data;
- break;
case FETCH_NOP_SYMBOL: /* Ignore a place holder */
code++;
goto retry;
default:
- return -EILSEQ;
+ ret = process_common_fetch_insn(code, &val);
+ if (ret < 0)
+ return ret;
}
code++;
return process_fetch_insn_bottom(code, val, dest, base);
}
NOKPROBE_SYMBOL(process_fetch_insn)
-/* Return the length of string -- including null terminal byte */
-static nokprobe_inline int
-fetch_store_strlen_user(unsigned long addr)
-{
- return kern_fetch_store_strlen_user(addr);
-}
-
-/* Return the length of string -- including null terminal byte */
-static nokprobe_inline int
-fetch_store_strlen(unsigned long addr)
-{
- return kern_fetch_store_strlen(addr);
-}
-
-/*
- * Fetch a null-terminated string from user. Caller MUST set *(u32 *)buf
- * with max length and relative data location.
- */
-static nokprobe_inline int
-fetch_store_string_user(unsigned long addr, void *dest, void *base)
-{
- return kern_fetch_store_string_user(addr, dest, base);
-}
-
-/*
- * Fetch a null-terminated string. Caller MUST set *(u32 *)buf with max
- * length and relative data location.
- */
-static nokprobe_inline int
-fetch_store_string(unsigned long addr, void *dest, void *base)
-{
- return kern_fetch_store_string(addr, dest, base);
-}
-
-static nokprobe_inline int
-probe_mem_read_user(void *dest, void *src, size_t size)
-{
- const void __user *uaddr = (__force const void __user *)src;
-
- return copy_from_user_nofault(dest, uaddr, size);
-}
-
-static nokprobe_inline int
-probe_mem_read(void *dest, void *src, size_t size)
-{
-#ifdef CONFIG_ARCH_HAS_NON_OVERLAPPING_ADDRESS_SPACE
- if ((unsigned long)src < TASK_SIZE)
- return probe_mem_read_user(dest, src, size);
-#endif
- return copy_from_kernel_nofault(dest, src, size);
-}
-
/* eprobe handler */
static inline void
__eprobe_trace_func(struct eprobe_data *edata, void *rec)
@@ -923,17 +858,13 @@ static int trace_eprobe_parse_filter(struct trace_eprobe *ep, int argc, const ch
p = ep->filter_str;
for (i = 0; i < argc; i++) {
- ret = snprintf(p, len, "%s ", argv[i]);
- if (ret < 0)
- goto error;
- if (ret > len) {
- ret = -E2BIG;
- goto error;
- }
+ if (i)
+ ret = snprintf(p, len, " %s", argv[i]);
+ else
+ ret = snprintf(p, len, "%s", argv[i]);
p += ret;
len -= ret;
}
- p[-1] = '\0';
/*
* Ensure the filter string can be parsed correctly. Note, this
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index 6a942fa275c7..654ffa40457a 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -2281,8 +2281,6 @@ create_new_subsystem(const char *name)
if (!system->name)
goto out_free;
- system->filter = NULL;
-
system->filter = kzalloc(sizeof(struct event_filter), GFP_KERNEL);
if (!system->filter)
goto out_free;
@@ -2843,7 +2841,7 @@ static __init int setup_trace_triggers(char *str)
if (!trigger)
break;
bootup_triggers[i].event = strsep(&trigger, ".");
- bootup_triggers[i].trigger = strsep(&trigger, ".");
+ bootup_triggers[i].trigger = trigger;
if (!bootup_triggers[i].trigger)
break;
}
@@ -3771,10 +3769,9 @@ static __init int event_trace_memsetup(void)
return 0;
}
-static __init void
-early_enable_events(struct trace_array *tr, bool disable_first)
+__init void
+early_enable_events(struct trace_array *tr, char *buf, bool disable_first)
{
- char *buf = bootup_event_buf;
char *token;
int ret;
@@ -3827,7 +3824,7 @@ static __init int event_trace_enable(void)
*/
__trace_early_add_events(tr);
- early_enable_events(tr, false);
+ early_enable_events(tr, bootup_event_buf, false);
trace_printk_start_comm();
@@ -3855,7 +3852,7 @@ static __init int event_trace_enable_again(void)
if (!tr)
return -ENODEV;
- early_enable_events(tr, true);
+ early_enable_events(tr, bootup_event_buf, true);
return 0;
}
diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c
index e095c3b3a50d..1dad64267878 100644
--- a/kernel/trace/trace_events_filter.c
+++ b/kernel/trace/trace_events_filter.c
@@ -64,6 +64,7 @@ enum filter_pred_fn {
FILTER_PRED_FN_PCHAR_USER,
FILTER_PRED_FN_PCHAR,
FILTER_PRED_FN_CPU,
+ FILTER_PRED_FN_FUNCTION,
FILTER_PRED_FN_,
FILTER_PRED_TEST_VISITED,
};
@@ -71,6 +72,7 @@ enum filter_pred_fn {
struct filter_pred {
enum filter_pred_fn fn_num;
u64 val;
+ u64 val2;
struct regex regex;
unsigned short *ops;
struct ftrace_event_field *field;
@@ -103,6 +105,7 @@ struct filter_pred {
C(INVALID_FILTER, "Meaningless filter expression"), \
C(IP_FIELD_ONLY, "Only 'ip' field is supported for function trace"), \
C(INVALID_VALUE, "Invalid value (did you forget quotes)?"), \
+ C(NO_FUNCTION, "Function not found"), \
C(ERRNO, "Error"), \
C(NO_FILTER, "No filter found")
@@ -876,6 +879,17 @@ static int filter_pred_comm(struct filter_pred *pred, void *event)
return cmp ^ pred->not;
}
+/* Filter predicate for functions. */
+static int filter_pred_function(struct filter_pred *pred, void *event)
+{
+ unsigned long *addr = (unsigned long *)(event + pred->offset);
+ unsigned long start = (unsigned long)pred->val;
+ unsigned long end = (unsigned long)pred->val2;
+ int ret = *addr >= start && *addr < end;
+
+ return pred->op == OP_EQ ? ret : !ret;
+}
+
/*
* regex_match_foo - Basic regex callbacks
*
@@ -1335,6 +1349,8 @@ static int filter_pred_fn_call(struct filter_pred *pred, void *event)
return filter_pred_pchar(pred, event);
case FILTER_PRED_FN_CPU:
return filter_pred_cpu(pred, event);
+ case FILTER_PRED_FN_FUNCTION:
+ return filter_pred_function(pred, event);
case FILTER_PRED_TEST_VISITED:
return test_pred_visited_fn(pred, event);
default:
@@ -1350,8 +1366,13 @@ static int parse_pred(const char *str, void *data,
struct trace_event_call *call = data;
struct ftrace_event_field *field;
struct filter_pred *pred = NULL;
+ unsigned long offset;
+ unsigned long size;
+ unsigned long ip;
char num_buf[24]; /* Big enough to hold an address */
char *field_name;
+ char *name;
+ bool function = false;
bool ustring = false;
char q;
u64 val;
@@ -1393,6 +1414,12 @@ static int parse_pred(const char *str, void *data,
i += len;
}
+ /* See if the field is a kernel function name */
+ if ((len = str_has_prefix(str + i, ".function"))) {
+ function = true;
+ i += len;
+ }
+
while (isspace(str[i]))
i++;
@@ -1423,7 +1450,71 @@ static int parse_pred(const char *str, void *data,
pred->offset = field->offset;
pred->op = op;
- if (ftrace_event_is_function(call)) {
+ if (function) {
+ /* The field must be the same size as long */
+ if (field->size != sizeof(long)) {
+ parse_error(pe, FILT_ERR_ILLEGAL_FIELD_OP, pos + i);
+ goto err_free;
+ }
+
+ /* Function only works with '==' or '!=' and an unquoted string */
+ switch (op) {
+ case OP_NE:
+ case OP_EQ:
+ break;
+ default:
+ parse_error(pe, FILT_ERR_INVALID_OP, pos + i);
+ goto err_free;
+ }
+
+ if (isdigit(str[i])) {
+ /* We allow 0xDEADBEEF */
+ while (isalnum(str[i]))
+ i++;
+
+ len = i - s;
+ /* 0xfeedfacedeadbeef is 18 chars max */
+ if (len >= sizeof(num_buf)) {
+ parse_error(pe, FILT_ERR_OPERAND_TOO_LONG, pos + i);
+ goto err_free;
+ }
+
+ strncpy(num_buf, str + s, len);
+ num_buf[len] = 0;
+
+ ret = kstrtoul(num_buf, 0, &ip);
+ if (ret) {
+ parse_error(pe, FILT_ERR_INVALID_VALUE, pos + i);
+ goto err_free;
+ }
+ } else {
+ s = i;
+ for (; str[i] && !isspace(str[i]); i++)
+ ;
+
+ len = i - s;
+ name = kmemdup_nul(str + s, len, GFP_KERNEL);
+ if (!name)
+ goto err_mem;
+ ip = kallsyms_lookup_name(name);
+ kfree(name);
+ if (!ip) {
+ parse_error(pe, FILT_ERR_NO_FUNCTION, pos + i);
+ goto err_free;
+ }
+ }
+
+ /* Now find the function start and end address */
+ if (!kallsyms_lookup_size_offset(ip, &size, &offset)) {
+ parse_error(pe, FILT_ERR_NO_FUNCTION, pos + i);
+ goto err_free;
+ }
+
+ pred->fn_num = FILTER_PRED_FN_FUNCTION;
+ pred->val = ip - offset;
+ pred->val2 = pred->val + size;
+
+ } else if (ftrace_event_is_function(call)) {
/*
* Perf does things different with function events.
* It only allows an "ip" field, and expects a string.
diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c
index 5edbf6b1da3f..89877a18f933 100644
--- a/kernel/trace/trace_events_hist.c
+++ b/kernel/trace/trace_events_hist.c
@@ -135,6 +135,7 @@ enum hist_field_fn {
HIST_FIELD_FN_DIV_NOT_POWER2,
HIST_FIELD_FN_DIV_MULT_SHIFT,
HIST_FIELD_FN_EXECNAME,
+ HIST_FIELD_FN_STACK,
};
/*
@@ -480,10 +481,6 @@ DEFINE_HIST_FIELD_FN(u8);
#define for_each_hist_key_field(i, hist_data) \
for ((i) = (hist_data)->n_vals; (i) < (hist_data)->n_fields; (i)++)
-#define HIST_STACKTRACE_DEPTH 16
-#define HIST_STACKTRACE_SIZE (HIST_STACKTRACE_DEPTH * sizeof(unsigned long))
-#define HIST_STACKTRACE_SKIP 5
-
#define HITCOUNT_IDX 0
#define HIST_KEY_SIZE_MAX (MAX_FILTER_STR_VAL + HIST_STACKTRACE_SIZE)
@@ -1360,7 +1357,12 @@ static const char *hist_field_name(struct hist_field *field,
field_name = field->name;
} else if (field->flags & HIST_FIELD_FL_TIMESTAMP)
field_name = "common_timestamp";
- else if (field->flags & HIST_FIELD_FL_HITCOUNT)
+ else if (field->flags & HIST_FIELD_FL_STACKTRACE) {
+ if (field->field)
+ field_name = field->field->name;
+ else
+ field_name = "stacktrace";
+ } else if (field->flags & HIST_FIELD_FL_HITCOUNT)
field_name = "hitcount";
if (field_name == NULL)
@@ -1718,6 +1720,8 @@ static const char *get_hist_field_flags(struct hist_field *hist_field)
flags_str = "percent";
else if (hist_field->flags & HIST_FIELD_FL_GRAPH)
flags_str = "graph";
+ else if (hist_field->flags & HIST_FIELD_FL_STACKTRACE)
+ flags_str = "stacktrace";
return flags_str;
}
@@ -1979,7 +1983,14 @@ static struct hist_field *create_hist_field(struct hist_trigger_data *hist_data,
}
if (flags & HIST_FIELD_FL_STACKTRACE) {
- hist_field->fn_num = HIST_FIELD_FN_NOP;
+ if (field)
+ hist_field->fn_num = HIST_FIELD_FN_STACK;
+ else
+ hist_field->fn_num = HIST_FIELD_FN_NOP;
+ hist_field->size = HIST_STACKTRACE_SIZE;
+ hist_field->type = kstrdup_const("unsigned long[]", GFP_KERNEL);
+ if (!hist_field->type)
+ goto free;
goto out;
}
@@ -2312,6 +2323,8 @@ parse_field(struct hist_trigger_data *hist_data, struct trace_event_file *file,
*flags |= HIST_FIELD_FL_EXECNAME;
else if (strcmp(modifier, "syscall") == 0)
*flags |= HIST_FIELD_FL_SYSCALL;
+ else if (strcmp(modifier, "stacktrace") == 0)
+ *flags |= HIST_FIELD_FL_STACKTRACE;
else if (strcmp(modifier, "log2") == 0)
*flags |= HIST_FIELD_FL_LOG2;
else if (strcmp(modifier, "usecs") == 0)
@@ -2351,6 +2364,8 @@ parse_field(struct hist_trigger_data *hist_data, struct trace_event_file *file,
hist_data->enable_timestamps = true;
if (*flags & HIST_FIELD_FL_TIMESTAMP_USECS)
hist_data->attrs->ts_in_usecs = true;
+ } else if (strcmp(field_name, "stacktrace") == 0) {
+ *flags |= HIST_FIELD_FL_STACKTRACE;
} else if (strcmp(field_name, "common_cpu") == 0)
*flags |= HIST_FIELD_FL_CPU;
else if (strcmp(field_name, "hitcount") == 0)
@@ -3111,6 +3126,9 @@ static inline void __update_field_vars(struct tracing_map_elt *elt,
unsigned int i, j, var_idx;
u64 var_val;
+ /* Make sure stacktrace can fit in the string variable length */
+ BUILD_BUG_ON((HIST_STACKTRACE_DEPTH + 1) * sizeof(long) >= STR_VAR_LEN_MAX);
+
for (i = 0, j = field_var_str_start; i < n_field_vars; i++) {
struct field_var *field_var = field_vars[i];
struct hist_field *var = field_var->var;
@@ -3119,13 +3137,26 @@ static inline void __update_field_vars(struct tracing_map_elt *elt,
var_val = hist_fn_call(val, elt, buffer, rbe, rec);
var_idx = var->var.idx;
- if (val->flags & HIST_FIELD_FL_STRING) {
+ if (val->flags & (HIST_FIELD_FL_STRING |
+ HIST_FIELD_FL_STACKTRACE)) {
char *str = elt_data->field_var_str[j++];
char *val_str = (char *)(uintptr_t)var_val;
unsigned int size;
- size = min(val->size, STR_VAR_LEN_MAX);
- strscpy(str, val_str, size);
+ if (val->flags & HIST_FIELD_FL_STRING) {
+ size = min(val->size, STR_VAR_LEN_MAX);
+ strscpy(str, val_str, size);
+ } else {
+ char *stack_start = str + sizeof(unsigned long);
+ int e;
+
+ e = stack_trace_save((void *)stack_start,
+ HIST_STACKTRACE_DEPTH,
+ HIST_STACKTRACE_SKIP);
+ if (e < HIST_STACKTRACE_DEPTH - 1)
+ ((unsigned long *)stack_start)[e] = 0;
+ *((unsigned long *)str) = e;
+ }
var_val = (u64)(uintptr_t)str;
}
tracing_map_set_var(elt, var_idx, var_val);
@@ -3824,7 +3855,8 @@ static void save_field_var(struct hist_trigger_data *hist_data,
{
hist_data->field_vars[hist_data->n_field_vars++] = field_var;
- if (field_var->val->flags & HIST_FIELD_FL_STRING)
+ /* Stack traces are saved in the string storage too */
+ if (field_var->val->flags & (HIST_FIELD_FL_STRING | HIST_FIELD_FL_STACKTRACE))
hist_data->n_field_var_str++;
}
@@ -3849,6 +3881,9 @@ static int check_synth_field(struct synth_event *event,
&& field->is_dynamic)
return 0;
+ if (strstr(hist_field->type, "long[") && field->is_stack)
+ return 0;
+
if (strcmp(field->type, hist_field->type) != 0) {
if (field->size != hist_field->size ||
(!field->is_string && field->is_signed != hist_field->is_signed))
@@ -4103,7 +4138,8 @@ static int action_create(struct hist_trigger_data *hist_data,
}
hist_data->save_vars[hist_data->n_save_vars++] = field_var;
- if (field_var->val->flags & HIST_FIELD_FL_STRING)
+ if (field_var->val->flags &
+ (HIST_FIELD_FL_STRING | HIST_FIELD_FL_STACKTRACE))
hist_data->n_save_var_str++;
kfree(param);
}
@@ -4242,6 +4278,19 @@ static u64 hist_field_execname(struct hist_field *hist_field,
return (u64)(unsigned long)(elt_data->comm);
}
+static u64 hist_field_stack(struct hist_field *hist_field,
+ struct tracing_map_elt *elt,
+ struct trace_buffer *buffer,
+ struct ring_buffer_event *rbe,
+ void *event)
+{
+ u32 str_item = *(u32 *)(event + hist_field->field->offset);
+ int str_loc = str_item & 0xffff;
+ char *addr = (char *)(event + str_loc);
+
+ return (u64)(unsigned long)addr;
+}
+
static u64 hist_fn_call(struct hist_field *hist_field,
struct tracing_map_elt *elt,
struct trace_buffer *buffer,
@@ -4305,6 +4354,8 @@ static u64 hist_fn_call(struct hist_field *hist_field,
return div_by_mult_and_shift(hist_field, elt, buffer, rbe, event);
case HIST_FIELD_FN_EXECNAME:
return hist_field_execname(hist_field, elt, buffer, rbe, event);
+ case HIST_FIELD_FN_STACK:
+ return hist_field_stack(hist_field, elt, buffer, rbe, event);
default:
return 0;
}
@@ -4351,7 +4402,8 @@ static int create_var_field(struct hist_trigger_data *hist_data,
if (!ret && hist_data->fields[val_idx]->flags & HIST_FIELD_FL_EXECNAME)
update_var_execname(hist_data->fields[val_idx]);
- if (!ret && hist_data->fields[val_idx]->flags & HIST_FIELD_FL_STRING)
+ if (!ret && hist_data->fields[val_idx]->flags &
+ (HIST_FIELD_FL_STRING | HIST_FIELD_FL_STACKTRACE))
hist_data->fields[val_idx]->var_str_idx = hist_data->n_var_str++;
return ret;
@@ -5092,7 +5144,8 @@ static void hist_trigger_elt_update(struct hist_trigger_data *hist_data,
if (hist_field->flags & HIST_FIELD_FL_VAR) {
var_idx = hist_field->var.idx;
- if (hist_field->flags & HIST_FIELD_FL_STRING) {
+ if (hist_field->flags &
+ (HIST_FIELD_FL_STRING | HIST_FIELD_FL_STACKTRACE)) {
unsigned int str_start, var_str_idx, idx;
char *str, *val_str;
unsigned int size;
@@ -5105,9 +5158,20 @@ static void hist_trigger_elt_update(struct hist_trigger_data *hist_data,
str = elt_data->field_var_str[idx];
val_str = (char *)(uintptr_t)hist_val;
- size = min(hist_field->size, STR_VAR_LEN_MAX);
- strscpy(str, val_str, size);
-
+ if (hist_field->flags & HIST_FIELD_FL_STRING) {
+ size = min(hist_field->size, STR_VAR_LEN_MAX);
+ strscpy(str, val_str, size);
+ } else {
+ char *stack_start = str + sizeof(unsigned long);
+ int e;
+
+ e = stack_trace_save((void *)stack_start,
+ HIST_STACKTRACE_DEPTH,
+ HIST_STACKTRACE_SKIP);
+ if (e < HIST_STACKTRACE_DEPTH - 1)
+ ((unsigned long *)stack_start)[e] = 0;
+ *((unsigned long *)str) = e;
+ }
hist_val = (u64)(uintptr_t)str;
}
tracing_map_set_var(elt, var_idx, hist_val);
@@ -5193,8 +5257,17 @@ static void event_hist_trigger(struct event_trigger_data *data,
if (key_field->flags & HIST_FIELD_FL_STACKTRACE) {
memset(entries, 0, HIST_STACKTRACE_SIZE);
- stack_trace_save(entries, HIST_STACKTRACE_DEPTH,
- HIST_STACKTRACE_SKIP);
+ if (key_field->field) {
+ unsigned long *stack, n_entries;
+
+ field_contents = hist_fn_call(key_field, elt, buffer, rbe, rec);
+ stack = (unsigned long *)(long)field_contents;
+ n_entries = *stack;
+ memcpy(entries, ++stack, n_entries * sizeof(unsigned long));
+ } else {
+ stack_trace_save(entries, HIST_STACKTRACE_DEPTH,
+ HIST_STACKTRACE_SKIP);
+ }
key = entries;
} else {
field_contents = hist_fn_call(key_field, elt, buffer, rbe, rec);
@@ -5297,7 +5370,10 @@ static void hist_trigger_print_key(struct seq_file *m,
seq_printf(m, "%s: %-30s[%3llu]", field_name,
syscall_name, uval);
} else if (key_field->flags & HIST_FIELD_FL_STACKTRACE) {
- seq_puts(m, "stacktrace:\n");
+ if (key_field->field)
+ seq_printf(m, "%s.stacktrace", key_field->field->name);
+ else
+ seq_puts(m, "stacktrace:\n");
hist_trigger_stacktrace_print(m,
key + key_field->offset,
HIST_STACKTRACE_DEPTH);
@@ -5842,7 +5918,8 @@ static void hist_field_print(struct seq_file *m, struct hist_field *hist_field)
if (hist_field->flags) {
if (!(hist_field->flags & HIST_FIELD_FL_VAR_REF) &&
- !(hist_field->flags & HIST_FIELD_FL_EXPR)) {
+ !(hist_field->flags & HIST_FIELD_FL_EXPR) &&
+ !(hist_field->flags & HIST_FIELD_FL_STACKTRACE)) {
const char *flags = get_hist_field_flags(hist_field);
if (flags)
@@ -5875,9 +5952,12 @@ static int event_hist_trigger_print(struct seq_file *m,
if (i > hist_data->n_vals)
seq_puts(m, ",");
- if (field->flags & HIST_FIELD_FL_STACKTRACE)
- seq_puts(m, "stacktrace");
- else
+ if (field->flags & HIST_FIELD_FL_STACKTRACE) {
+ if (field->field)
+ seq_printf(m, "%s.stacktrace", field->field->name);
+ else
+ seq_puts(m, "stacktrace");
+ } else
hist_field_print(m, field);
}
diff --git a/kernel/trace/trace_events_synth.c b/kernel/trace/trace_events_synth.c
index 67592eed0be8..46d0abb32d0f 100644
--- a/kernel/trace/trace_events_synth.c
+++ b/kernel/trace/trace_events_synth.c
@@ -173,6 +173,14 @@ static int synth_field_is_string(char *type)
return false;
}
+static int synth_field_is_stack(char *type)
+{
+ if (strstr(type, "long[") != NULL)
+ return true;
+
+ return false;
+}
+
static int synth_field_string_size(char *type)
{
char buf[4], *end, *start;
@@ -248,6 +256,8 @@ static int synth_field_size(char *type)
size = sizeof(gfp_t);
else if (synth_field_is_string(type))
size = synth_field_string_size(type);
+ else if (synth_field_is_stack(type))
+ size = 0;
return size;
}
@@ -292,6 +302,8 @@ static const char *synth_field_fmt(char *type)
fmt = "%x";
else if (synth_field_is_string(type))
fmt = "%.*s";
+ else if (synth_field_is_stack(type))
+ fmt = "%s";
return fmt;
}
@@ -371,6 +383,23 @@ static enum print_line_t print_synth_event(struct trace_iterator *iter,
i == se->n_fields - 1 ? "" : " ");
n_u64 += STR_VAR_LEN_MAX / sizeof(u64);
}
+ } else if (se->fields[i]->is_stack) {
+ u32 offset, data_offset, len;
+ unsigned long *p, *end;
+
+ offset = (u32)entry->fields[n_u64];
+ data_offset = offset & 0xffff;
+ len = offset >> 16;
+
+ p = (void *)entry + data_offset;
+ end = (void *)p + len - (sizeof(long) - 1);
+
+ trace_seq_printf(s, "%s=STACK:\n", se->fields[i]->name);
+
+ for (; *p && p < end; p++)
+ trace_seq_printf(s, "=> %pS\n", (void *)*p);
+ n_u64++;
+
} else {
struct trace_print_flags __flags[] = {
__def_gfpflag_names, {-1, NULL} };
@@ -416,16 +445,15 @@ static unsigned int trace_string(struct synth_trace_event *entry,
if (is_dynamic) {
u32 data_offset;
- data_offset = offsetof(typeof(*entry), fields);
- data_offset += event->n_u64 * sizeof(u64);
+ data_offset = struct_size(entry, fields, event->n_u64);
data_offset += data_size;
- len = kern_fetch_store_strlen((unsigned long)str_val);
+ len = fetch_store_strlen((unsigned long)str_val);
data_offset |= len << 16;
*(u32 *)&entry->fields[*n_u64] = data_offset;
- ret = kern_fetch_store_string((unsigned long)str_val, &entry->fields[*n_u64], entry);
+ ret = fetch_store_string((unsigned long)str_val, &entry->fields[*n_u64], entry);
(*n_u64)++;
} else {
@@ -447,6 +475,43 @@ static unsigned int trace_string(struct synth_trace_event *entry,
return len;
}
+static unsigned int trace_stack(struct synth_trace_event *entry,
+ struct synth_event *event,
+ long *stack,
+ unsigned int data_size,
+ unsigned int *n_u64)
+{
+ unsigned int len;
+ u32 data_offset;
+ void *data_loc;
+
+ data_offset = struct_size(entry, fields, event->n_u64);
+ data_offset += data_size;
+
+ for (len = 0; len < HIST_STACKTRACE_DEPTH; len++) {
+ if (!stack[len])
+ break;
+ }
+
+ /* Include the zero'd element if it fits */
+ if (len < HIST_STACKTRACE_DEPTH)
+ len++;
+
+ len *= sizeof(long);
+
+ /* Find the dynamic section to copy the stack into. */
+ data_loc = (void *)entry + data_offset;
+ memcpy(data_loc, stack, len);
+
+ /* Fill in the field that holds the offset/len combo */
+ data_offset |= len << 16;
+ *(u32 *)&entry->fields[*n_u64] = data_offset;
+
+ (*n_u64)++;
+
+ return len;
+}
+
static notrace void trace_event_raw_event_synth(void *__data,
u64 *var_ref_vals,
unsigned int *var_ref_idx)
@@ -473,7 +538,12 @@ static notrace void trace_event_raw_event_synth(void *__data,
val_idx = var_ref_idx[field_pos];
str_val = (char *)(long)var_ref_vals[val_idx];
- len = kern_fetch_store_strlen((unsigned long)str_val);
+ if (event->dynamic_fields[i]->is_stack) {
+ len = *((unsigned long *)str_val);
+ len *= sizeof(unsigned long);
+ } else {
+ len = fetch_store_strlen((unsigned long)str_val);
+ }
fields_size += len;
}
@@ -499,6 +569,12 @@ static notrace void trace_event_raw_event_synth(void *__data,
event->fields[i]->is_dynamic,
data_size, &n_u64);
data_size += len; /* only dynamic string increments */
+ } else if (event->fields[i]->is_stack) {
+ long *stack = (long *)(long)var_ref_vals[val_idx];
+
+ len = trace_stack(entry, event, stack,
+ data_size, &n_u64);
+ data_size += len;
} else {
struct synth_field *field = event->fields[i];
u64 val = var_ref_vals[val_idx];
@@ -561,6 +637,9 @@ static int __set_synth_event_print_fmt(struct synth_event *event,
event->fields[i]->is_dynamic)
pos += snprintf(buf + pos, LEN_OR_ZERO,
", __get_str(%s)", event->fields[i]->name);
+ else if (event->fields[i]->is_stack)
+ pos += snprintf(buf + pos, LEN_OR_ZERO,
+ ", __get_stacktrace(%s)", event->fields[i]->name);
else
pos += snprintf(buf + pos, LEN_OR_ZERO,
", REC->%s", event->fields[i]->name);
@@ -697,7 +776,8 @@ static struct synth_field *parse_synth_field(int argc, char **argv,
ret = -EINVAL;
goto free;
} else if (size == 0) {
- if (synth_field_is_string(field->type)) {
+ if (synth_field_is_string(field->type) ||
+ synth_field_is_stack(field->type)) {
char *type;
len = sizeof("__data_loc ") + strlen(field->type) + 1;
@@ -728,6 +808,8 @@ static struct synth_field *parse_synth_field(int argc, char **argv,
if (synth_field_is_string(field->type))
field->is_string = true;
+ else if (synth_field_is_stack(field->type))
+ field->is_stack = true;
field->is_signed = synth_field_signed(field->type);
out:
diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c
index ee77c8203bd5..59cda19a9033 100644
--- a/kernel/trace/trace_kprobe.c
+++ b/kernel/trace/trace_kprobe.c
@@ -1218,60 +1218,6 @@ static const struct file_operations kprobe_profile_ops = {
.release = seq_release,
};
-/* Kprobe specific fetch functions */
-
-/* Return the length of string -- including null terminal byte */
-static nokprobe_inline int
-fetch_store_strlen_user(unsigned long addr)
-{
- return kern_fetch_store_strlen_user(addr);
-}
-
-/* Return the length of string -- including null terminal byte */
-static nokprobe_inline int
-fetch_store_strlen(unsigned long addr)
-{
- return kern_fetch_store_strlen(addr);
-}
-
-/*
- * Fetch a null-terminated string from user. Caller MUST set *(u32 *)buf
- * with max length and relative data location.
- */
-static nokprobe_inline int
-fetch_store_string_user(unsigned long addr, void *dest, void *base)
-{
- return kern_fetch_store_string_user(addr, dest, base);
-}
-
-/*
- * Fetch a null-terminated string. Caller MUST set *(u32 *)buf with max
- * length and relative data location.
- */
-static nokprobe_inline int
-fetch_store_string(unsigned long addr, void *dest, void *base)
-{
- return kern_fetch_store_string(addr, dest, base);
-}
-
-static nokprobe_inline int
-probe_mem_read_user(void *dest, void *src, size_t size)
-{
- const void __user *uaddr = (__force const void __user *)src;
-
- return copy_from_user_nofault(dest, uaddr, size);
-}
-
-static nokprobe_inline int
-probe_mem_read(void *dest, void *src, size_t size)
-{
-#ifdef CONFIG_ARCH_HAS_NON_OVERLAPPING_ADDRESS_SPACE
- if ((unsigned long)src < TASK_SIZE)
- return probe_mem_read_user(dest, src, size);
-#endif
- return copy_from_kernel_nofault(dest, src, size);
-}
-
/* Note that we don't verify it, since the code does not come from user space */
static int
process_fetch_insn(struct fetch_insn *code, void *rec, void *dest,
@@ -1279,6 +1225,7 @@ process_fetch_insn(struct fetch_insn *code, void *rec, void *dest,
{
struct pt_regs *regs = rec;
unsigned long val;
+ int ret;
retry:
/* 1st stage: get value from context */
@@ -1295,15 +1242,6 @@ retry:
case FETCH_OP_RETVAL:
val = regs_return_value(regs);
break;
- case FETCH_OP_IMM:
- val = code->immediate;
- break;
- case FETCH_OP_COMM:
- val = (unsigned long)current->comm;
- break;
- case FETCH_OP_DATA:
- val = (unsigned long)code->data;
- break;
#ifdef CONFIG_HAVE_FUNCTION_ARG_ACCESS_API
case FETCH_OP_ARG:
val = regs_get_kernel_argument(regs, code->param);
@@ -1313,7 +1251,9 @@ retry:
code++;
goto retry;
default:
- return -EILSEQ;
+ ret = process_common_fetch_insn(code, &val);
+ if (ret < 0)
+ return ret;
}
code++;
@@ -1424,7 +1364,7 @@ print_kprobe_event(struct trace_iterator *iter, int flags,
trace_seq_putc(s, ')');
- if (print_probe_args(s, tp->args, tp->nr_args,
+ if (trace_probe_print_args(s, tp->args, tp->nr_args,
(u8 *)&field[1], field) < 0)
goto out;
@@ -1459,7 +1399,7 @@ print_kretprobe_event(struct trace_iterator *iter, int flags,
trace_seq_putc(s, ')');
- if (print_probe_args(s, tp->args, tp->nr_args,
+ if (trace_probe_print_args(s, tp->args, tp->nr_args,
(u8 *)&field[1], field) < 0)
goto out;
diff --git a/kernel/trace/trace_osnoise.c b/kernel/trace/trace_osnoise.c
index 210e1f168392..04f0fdae19a1 100644
--- a/kernel/trace/trace_osnoise.c
+++ b/kernel/trace/trace_osnoise.c
@@ -1539,7 +1539,7 @@ static void osnoise_sleep(void)
wake_time = ktime_add_us(ktime_get(), interval);
__set_current_state(TASK_INTERRUPTIBLE);
- while (schedule_hrtimeout_range(&wake_time, 0, HRTIMER_MODE_ABS)) {
+ while (schedule_hrtimeout(&wake_time, HRTIMER_MODE_ABS)) {
if (kthread_should_stop())
break;
}
diff --git a/kernel/trace/trace_probe.c b/kernel/trace/trace_probe.c
index 01ebabbbe8c9..20d0c4a97633 100644
--- a/kernel/trace/trace_probe.c
+++ b/kernel/trace/trace_probe.c
@@ -50,6 +50,7 @@ DEFINE_BASIC_PRINT_TYPE_FUNC(x8, u8, "0x%x")
DEFINE_BASIC_PRINT_TYPE_FUNC(x16, u16, "0x%x")
DEFINE_BASIC_PRINT_TYPE_FUNC(x32, u32, "0x%x")
DEFINE_BASIC_PRINT_TYPE_FUNC(x64, u64, "0x%Lx")
+DEFINE_BASIC_PRINT_TYPE_FUNC(char, u8, "'%c'")
int PRINT_TYPE_FUNC_NAME(symbol)(struct trace_seq *s, void *data, void *ent)
{
@@ -95,6 +96,7 @@ static const struct fetch_type probe_fetch_types[] = {
ASSIGN_FETCH_TYPE_ALIAS(x16, u16, u16, 0),
ASSIGN_FETCH_TYPE_ALIAS(x32, u32, u32, 0),
ASSIGN_FETCH_TYPE_ALIAS(x64, u64, u64, 0),
+ ASSIGN_FETCH_TYPE_ALIAS(char, u8, u8, 0),
ASSIGN_FETCH_TYPE_ALIAS(symbol, ADDR_FETCH_TYPE, ADDR_FETCH_TYPE, 0),
ASSIGN_FETCH_TYPE_END
@@ -1237,3 +1239,30 @@ int trace_probe_create(const char *raw_command, int (*createfn)(int, const char
return ret;
}
+
+int trace_probe_print_args(struct trace_seq *s, struct probe_arg *args, int nr_args,
+ u8 *data, void *field)
+{
+ void *p;
+ int i, j;
+
+ for (i = 0; i < nr_args; i++) {
+ struct probe_arg *a = args + i;
+
+ trace_seq_printf(s, " %s=", a->name);
+ if (likely(!a->count)) {
+ if (!a->type->print(s, data + a->offset, field))
+ return -ENOMEM;
+ continue;
+ }
+ trace_seq_putc(s, '{');
+ p = data + a->offset;
+ for (j = 0; j < a->count; j++) {
+ if (!a->type->print(s, p, field))
+ return -ENOMEM;
+ trace_seq_putc(s, j == a->count - 1 ? '}' : ',');
+ p += a->type->size;
+ }
+ }
+ return 0;
+}
diff --git a/kernel/trace/trace_probe.h b/kernel/trace/trace_probe.h
index 23acfd1c3812..ef8ed3b65d05 100644
--- a/kernel/trace/trace_probe.h
+++ b/kernel/trace/trace_probe.h
@@ -166,6 +166,7 @@ DECLARE_BASIC_PRINT_TYPE_FUNC(x16);
DECLARE_BASIC_PRINT_TYPE_FUNC(x32);
DECLARE_BASIC_PRINT_TYPE_FUNC(x64);
+DECLARE_BASIC_PRINT_TYPE_FUNC(char);
DECLARE_BASIC_PRINT_TYPE_FUNC(string);
DECLARE_BASIC_PRINT_TYPE_FUNC(symbol);
@@ -348,6 +349,8 @@ int trace_probe_compare_arg_type(struct trace_probe *a, struct trace_probe *b);
bool trace_probe_match_command_args(struct trace_probe *tp,
int argc, const char **argv);
int trace_probe_create(const char *raw_command, int (*createfn)(int, const char **));
+int trace_probe_print_args(struct trace_seq *s, struct probe_arg *args, int nr_args,
+ u8 *data, void *field);
#define trace_probe_for_each_link(pos, tp) \
list_for_each_entry(pos, &(tp)->event->files, list)
diff --git a/kernel/trace/trace_probe_kernel.h b/kernel/trace/trace_probe_kernel.h
index 77dbd9ff9782..c4e1d4c03a85 100644
--- a/kernel/trace/trace_probe_kernel.h
+++ b/kernel/trace/trace_probe_kernel.h
@@ -12,7 +12,7 @@
*/
/* Return the length of string -- including null terminal byte */
static nokprobe_inline int
-kern_fetch_store_strlen_user(unsigned long addr)
+fetch_store_strlen_user(unsigned long addr)
{
const void __user *uaddr = (__force const void __user *)addr;
int ret;
@@ -29,14 +29,14 @@ kern_fetch_store_strlen_user(unsigned long addr)
/* Return the length of string -- including null terminal byte */
static nokprobe_inline int
-kern_fetch_store_strlen(unsigned long addr)
+fetch_store_strlen(unsigned long addr)
{
int ret, len = 0;
u8 c;
#ifdef CONFIG_ARCH_HAS_NON_OVERLAPPING_ADDRESS_SPACE
if (addr < TASK_SIZE)
- return kern_fetch_store_strlen_user(addr);
+ return fetch_store_strlen_user(addr);
#endif
do {
@@ -63,7 +63,7 @@ static nokprobe_inline void set_data_loc(int ret, void *dest, void *__dest, void
* with max length and relative data location.
*/
static nokprobe_inline int
-kern_fetch_store_string_user(unsigned long addr, void *dest, void *base)
+fetch_store_string_user(unsigned long addr, void *dest, void *base)
{
const void __user *uaddr = (__force const void __user *)addr;
int maxlen = get_loc_len(*(u32 *)dest);
@@ -86,7 +86,7 @@ kern_fetch_store_string_user(unsigned long addr, void *dest, void *base)
* length and relative data location.
*/
static nokprobe_inline int
-kern_fetch_store_string(unsigned long addr, void *dest, void *base)
+fetch_store_string(unsigned long addr, void *dest, void *base)
{
int maxlen = get_loc_len(*(u32 *)dest);
void *__dest;
@@ -94,7 +94,7 @@ kern_fetch_store_string(unsigned long addr, void *dest, void *base)
#ifdef CONFIG_ARCH_HAS_NON_OVERLAPPING_ADDRESS_SPACE
if ((unsigned long)addr < TASK_SIZE)
- return kern_fetch_store_string_user(addr, dest, base);
+ return fetch_store_string_user(addr, dest, base);
#endif
if (unlikely(!maxlen))
@@ -112,4 +112,22 @@ kern_fetch_store_string(unsigned long addr, void *dest, void *base)
return ret;
}
+static nokprobe_inline int
+probe_mem_read_user(void *dest, void *src, size_t size)
+{
+ const void __user *uaddr = (__force const void __user *)src;
+
+ return copy_from_user_nofault(dest, uaddr, size);
+}
+
+static nokprobe_inline int
+probe_mem_read(void *dest, void *src, size_t size)
+{
+#ifdef CONFIG_ARCH_HAS_NON_OVERLAPPING_ADDRESS_SPACE
+ if ((unsigned long)src < TASK_SIZE)
+ return probe_mem_read_user(dest, src, size);
+#endif
+ return copy_from_kernel_nofault(dest, src, size);
+}
+
#endif /* __TRACE_PROBE_KERNEL_H_ */
diff --git a/kernel/trace/trace_probe_tmpl.h b/kernel/trace/trace_probe_tmpl.h
index 5cea672243f6..00707630788d 100644
--- a/kernel/trace/trace_probe_tmpl.h
+++ b/kernel/trace/trace_probe_tmpl.h
@@ -98,6 +98,26 @@ fetch_store_symstring(unsigned long addr, void *dest, void *base)
return sprint_symbol(__dest, addr);
}
+/* common part of process_fetch_insn*/
+static nokprobe_inline int
+process_common_fetch_insn(struct fetch_insn *code, unsigned long *val)
+{
+ switch (code->op) {
+ case FETCH_OP_IMM:
+ *val = code->immediate;
+ break;
+ case FETCH_OP_COMM:
+ *val = (unsigned long)current->comm;
+ break;
+ case FETCH_OP_DATA:
+ *val = (unsigned long)code->data;
+ break;
+ default:
+ return -EILSEQ;
+ }
+ return 0;
+}
+
/* From the 2nd stage, routine is same */
static nokprobe_inline int
process_fetch_insn_bottom(struct fetch_insn *code, unsigned long val,
@@ -253,31 +273,3 @@ store_trace_args(void *data, struct trace_probe *tp, void *rec,
}
}
}
-
-static inline int
-print_probe_args(struct trace_seq *s, struct probe_arg *args, int nr_args,
- u8 *data, void *field)
-{
- void *p;
- int i, j;
-
- for (i = 0; i < nr_args; i++) {
- struct probe_arg *a = args + i;
-
- trace_seq_printf(s, " %s=", a->name);
- if (likely(!a->count)) {
- if (!a->type->print(s, data + a->offset, field))
- return -ENOMEM;
- continue;
- }
- trace_seq_putc(s, '{');
- p = data + a->offset;
- for (j = 0; j < a->count; j++) {
- if (!a->type->print(s, p, field))
- return -ENOMEM;
- trace_seq_putc(s, j == a->count - 1 ? '}' : ',');
- p += a->type->size;
- }
- }
- return 0;
-}
diff --git a/kernel/trace/trace_seq.c b/kernel/trace/trace_seq.c
index 9c90b3a7dce2..e5e299260d0c 100644
--- a/kernel/trace/trace_seq.c
+++ b/kernel/trace/trace_seq.c
@@ -403,3 +403,26 @@ int trace_seq_hex_dump(struct trace_seq *s, const char *prefix_str,
return 1;
}
EXPORT_SYMBOL(trace_seq_hex_dump);
+
+/*
+ * trace_seq_acquire - acquire seq buffer with size len
+ * @s: trace sequence descriptor
+ * @len: size of buffer to be acquired
+ *
+ * acquire buffer with size of @len from trace_seq for output usage,
+ * user can fill string into that buffer.
+ *
+ * Returns start address of acquired buffer.
+ *
+ * it allow multiple usage in one trace output function call.
+ */
+char *trace_seq_acquire(struct trace_seq *s, unsigned int len)
+{
+ char *ret = trace_seq_buffer_ptr(s);
+
+ if (!WARN_ON_ONCE(seq_buf_buffer_left(&s->seq) < len))
+ seq_buf_commit(&s->seq, len);
+
+ return ret;
+}
+EXPORT_SYMBOL(trace_seq_acquire);
diff --git a/kernel/trace/trace_synth.h b/kernel/trace/trace_synth.h
index b29595fe3ac5..43f6fb6078db 100644
--- a/kernel/trace/trace_synth.h
+++ b/kernel/trace/trace_synth.h
@@ -18,6 +18,7 @@ struct synth_field {
bool is_signed;
bool is_string;
bool is_dynamic;
+ bool is_stack;
};
struct synth_event {
diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c
index 8d64b6553aed..8b92e34ff0c8 100644
--- a/kernel/trace/trace_uprobe.c
+++ b/kernel/trace/trace_uprobe.c
@@ -220,6 +220,7 @@ process_fetch_insn(struct fetch_insn *code, void *rec, void *dest,
{
struct pt_regs *regs = rec;
unsigned long val;
+ int ret;
/* 1st stage: get value from context */
switch (code->op) {
@@ -235,20 +236,16 @@ process_fetch_insn(struct fetch_insn *code, void *rec, void *dest,
case FETCH_OP_RETVAL:
val = regs_return_value(regs);
break;
- case FETCH_OP_IMM:
- val = code->immediate;
- break;
case FETCH_OP_COMM:
val = FETCH_TOKEN_COMM;
break;
- case FETCH_OP_DATA:
- val = (unsigned long)code->data;
- break;
case FETCH_OP_FOFFS:
val = translate_user_vaddr(code->immediate);
break;
default:
- return -EILSEQ;
+ ret = process_common_fetch_insn(code, &val);
+ if (ret < 0)
+ return ret;
}
code++;
@@ -1042,7 +1039,7 @@ print_uprobe_event(struct trace_iterator *iter, int flags, struct trace_event *e
data = DATAOF_TRACE_ENTRY(entry, false);
}
- if (print_probe_args(s, tu->tp.args, tu->tp.nr_args, data, entry) < 0)
+ if (trace_probe_print_args(s, tu->tp.args, tu->tp.nr_args, data, entry) < 0)
goto out;
trace_seq_putc(s, '\n');
diff --git a/kernel/tracepoint.c b/kernel/tracepoint.c
index f23144af5743..8d1507dd0724 100644
--- a/kernel/tracepoint.c
+++ b/kernel/tracepoint.c
@@ -571,8 +571,8 @@ static void for_each_tracepoint_range(
bool trace_module_has_bad_taint(struct module *mod)
{
return mod->taints & ~((1 << TAINT_OOT_MODULE) | (1 << TAINT_CRAP) |
- (1 << TAINT_UNSIGNED_MODULE) |
- (1 << TAINT_TEST));
+ (1 << TAINT_UNSIGNED_MODULE) | (1 << TAINT_TEST) |
+ (1 << TAINT_LIVEPATCH));
}
static BLOCKING_NOTIFIER_HEAD(tracepoint_notify_list);
diff --git a/kernel/umh.c b/kernel/umh.c
index fbf872c624cb..2a4708277335 100644
--- a/kernel/umh.c
+++ b/kernel/umh.c
@@ -501,9 +501,9 @@ static int proc_cap_handler(struct ctl_table *table, int write,
void *buffer, size_t *lenp, loff_t *ppos)
{
struct ctl_table t;
- unsigned long cap_array[_KERNEL_CAPABILITY_U32S];
- kernel_cap_t new_cap;
- int err, i;
+ unsigned long cap_array[2];
+ kernel_cap_t new_cap, *cap;
+ int err;
if (write && (!capable(CAP_SETPCAP) ||
!capable(CAP_SYS_MODULE)))
@@ -514,14 +514,16 @@ static int proc_cap_handler(struct ctl_table *table, int write,
* userspace if this is a read.
*/
spin_lock(&umh_sysctl_lock);
- for (i = 0; i < _KERNEL_CAPABILITY_U32S; i++) {
- if (table->data == CAP_BSET)
- cap_array[i] = usermodehelper_bset.cap[i];
- else if (table->data == CAP_PI)
- cap_array[i] = usermodehelper_inheritable.cap[i];
- else
- BUG();
- }
+ if (table->data == CAP_BSET)
+ cap = &usermodehelper_bset;
+ else if (table->data == CAP_PI)
+ cap = &usermodehelper_inheritable;
+ else
+ BUG();
+
+ /* Legacy format: capabilities are exposed as two 32-bit values */
+ cap_array[0] = (u32) cap->val;
+ cap_array[1] = cap->val >> 32;
spin_unlock(&umh_sysctl_lock);
t = *table;
@@ -535,22 +537,15 @@ static int proc_cap_handler(struct ctl_table *table, int write,
if (err < 0)
return err;
- /*
- * convert from the sysctl array of ulongs to the kernel_cap_t
- * internal representation
- */
- for (i = 0; i < _KERNEL_CAPABILITY_U32S; i++)
- new_cap.cap[i] = cap_array[i];
+ new_cap.val = (u32)cap_array[0];
+ new_cap.val += (u64)cap_array[1] << 32;
/*
* Drop everything not in the new_cap (but don't add things)
*/
if (write) {
spin_lock(&umh_sysctl_lock);
- if (table->data == CAP_BSET)
- usermodehelper_bset = cap_intersect(usermodehelper_bset, new_cap);
- if (table->data == CAP_PI)
- usermodehelper_inheritable = cap_intersect(usermodehelper_inheritable, new_cap);
+ *cap = cap_intersect(*cap, new_cap);
spin_unlock(&umh_sysctl_lock);
}
@@ -561,14 +556,14 @@ struct ctl_table usermodehelper_table[] = {
{
.procname = "bset",
.data = CAP_BSET,
- .maxlen = _KERNEL_CAPABILITY_U32S * sizeof(unsigned long),
+ .maxlen = 2 * sizeof(unsigned long),
.mode = 0600,
.proc_handler = proc_cap_handler,
},
{
.procname = "inheritable",
.data = CAP_PI,
- .maxlen = _KERNEL_CAPABILITY_U32S * sizeof(unsigned long),
+ .maxlen = 2 * sizeof(unsigned long),
.mode = 0600,
.proc_handler = proc_cap_handler,
},
diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c
index 54211dbd516c..1d8e47bed3f1 100644
--- a/kernel/user_namespace.c
+++ b/kernel/user_namespace.c
@@ -229,7 +229,7 @@ void __put_user_ns(struct user_namespace *ns)
EXPORT_SYMBOL(__put_user_ns);
/**
- * idmap_key struct holds the information necessary to find an idmapping in a
+ * struct idmap_key - holds the information necessary to find an idmapping in a
* sorted idmap array. It is passed to cmp_map_id() as first argument.
*/
struct idmap_key {