Diffstat (limited to 'kernel'): 59 files changed, 1256 insertions, 676 deletions
diff --git a/kernel/auditsc.c b/kernel/auditsc.c index 93d0b87f3283..addeed3df15d 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -1295,15 +1295,11 @@ out: static void audit_log_cap(struct audit_buffer *ab, char *prefix, kernel_cap_t *cap) { - int i; - if (cap_isclear(*cap)) { audit_log_format(ab, " %s=0", prefix); return; } - audit_log_format(ab, " %s=", prefix); - CAP_FOR_EACH_U32(i) - audit_log_format(ab, "%08x", cap->cap[CAP_LAST_U32 - i]); + audit_log_format(ab, " %s=%016llx", prefix, cap->val); } static void audit_log_fcaps(struct audit_buffer *ab, struct audit_names *name) diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 933869983e2a..b297e9f60ca1 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -34,6 +34,7 @@ #include <linux/log2.h> #include <linux/bpf_verifier.h> #include <linux/nodemask.h> +#include <linux/nospec.h> #include <linux/bpf_mem_alloc.h> #include <linux/memcontrol.h> diff --git a/kernel/bpf/ringbuf.c b/kernel/bpf/ringbuf.c index 80f4b4d88aaf..8732e0aadf36 100644 --- a/kernel/bpf/ringbuf.c +++ b/kernel/bpf/ringbuf.c @@ -269,7 +269,7 @@ static int ringbuf_map_mmap_kern(struct bpf_map *map, struct vm_area_struct *vma if (vma->vm_pgoff != 0 || vma->vm_end - vma->vm_start != PAGE_SIZE) return -EPERM; } else { - vma->vm_flags &= ~VM_MAYWRITE; + vm_flags_clear(vma, VM_MAYWRITE); } /* remap_vmalloc_range() checks size and offset constraints */ return remap_vmalloc_range(vma, rb_map->rb, @@ -290,7 +290,7 @@ static int ringbuf_map_mmap_user(struct bpf_map *map, struct vm_area_struct *vma */ return -EPERM; } else { - vma->vm_flags &= ~VM_MAYWRITE; + vm_flags_clear(vma, VM_MAYWRITE); } /* remap_vmalloc_range() checks size and offset constraints */ return remap_vmalloc_range(vma, rb_map->rb, vma->vm_pgoff + RINGBUF_PGOFF); diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index e3fcdc9836a6..adc83cb82f37 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -895,10 +895,10 @@ static int bpf_map_mmap(struct file *filp, struct vm_area_struct *vma) /* set default open/close callbacks */ vma->vm_ops = &bpf_map_default_vmops; vma->vm_private_data = map; - vma->vm_flags &= ~VM_MAYEXEC; + vm_flags_clear(vma, VM_MAYEXEC); if (!(vma->vm_flags & VM_WRITE)) /* disallow re-mapping with PROT_WRITE */ - vma->vm_flags &= ~VM_MAYWRITE; + vm_flags_clear(vma, VM_MAYWRITE); err = map->ops->map_mmap(map, vma); if (err) diff --git a/kernel/capability.c b/kernel/capability.c index 339a44dfe2f4..3e058f41df32 100644 --- a/kernel/capability.c +++ b/kernel/capability.c @@ -20,13 +20,6 @@ #include <linux/user_namespace.h> #include <linux/uaccess.h> -/* - * Leveraged for setting/resetting capabilities - */ - -const kernel_cap_t __cap_empty_set = CAP_EMPTY_SET; -EXPORT_SYMBOL(__cap_empty_set); - int file_caps_enabled = 1; static int __init file_caps_disable(char *str) @@ -151,6 +144,7 @@ SYSCALL_DEFINE2(capget, cap_user_header_t, header, cap_user_data_t, dataptr) pid_t pid; unsigned tocopy; kernel_cap_t pE, pI, pP; + struct __user_cap_data_struct kdata[2]; ret = cap_validate_magic(header, &tocopy); if ((dataptr == NULL) || (ret != 0)) @@ -163,42 +157,46 @@ SYSCALL_DEFINE2(capget, cap_user_header_t, header, cap_user_data_t, dataptr) return -EINVAL; ret = cap_get_target_pid(pid, &pE, &pI, &pP); - if (!ret) { - struct __user_cap_data_struct kdata[_KERNEL_CAPABILITY_U32S]; - unsigned i; - - for (i = 0; i < tocopy; i++) { - kdata[i].effective = pE.cap[i]; - kdata[i].permitted = pP.cap[i]; - kdata[i].inheritable = pI.cap[i]; - } - - /* - * Note, in the case, tocopy < 
_KERNEL_CAPABILITY_U32S, - * we silently drop the upper capabilities here. This - * has the effect of making older libcap - * implementations implicitly drop upper capability - * bits when they perform a: capget/modify/capset - * sequence. - * - * This behavior is considered fail-safe - * behavior. Upgrading the application to a newer - * version of libcap will enable access to the newer - * capabilities. - * - * An alternative would be to return an error here - * (-ERANGE), but that causes legacy applications to - * unexpectedly fail; the capget/modify/capset aborts - * before modification is attempted and the application - * fails. - */ - if (copy_to_user(dataptr, kdata, tocopy - * sizeof(struct __user_cap_data_struct))) { - return -EFAULT; - } - } + if (ret) + return ret; - return ret; + /* + * Annoying legacy format with 64-bit capabilities exposed + * as two sets of 32-bit fields, so we need to split the + * capability values up. + */ + kdata[0].effective = pE.val; kdata[1].effective = pE.val >> 32; + kdata[0].permitted = pP.val; kdata[1].permitted = pP.val >> 32; + kdata[0].inheritable = pI.val; kdata[1].inheritable = pI.val >> 32; + + /* + * Note, in the case, tocopy < _KERNEL_CAPABILITY_U32S, + * we silently drop the upper capabilities here. This + * has the effect of making older libcap + * implementations implicitly drop upper capability + * bits when they perform a: capget/modify/capset + * sequence. + * + * This behavior is considered fail-safe + * behavior. Upgrading the application to a newer + * version of libcap will enable access to the newer + * capabilities. + * + * An alternative would be to return an error here + * (-ERANGE), but that causes legacy applications to + * unexpectedly fail; the capget/modify/capset aborts + * before modification is attempted and the application + * fails. 
+ */ + if (copy_to_user(dataptr, kdata, tocopy * sizeof(kdata[0]))) + return -EFAULT; + + return 0; +} + +static kernel_cap_t mk_kernel_cap(u32 low, u32 high) +{ + return (kernel_cap_t) { (low | ((u64)high << 32)) & CAP_VALID_MASK }; } /** @@ -221,8 +219,8 @@ SYSCALL_DEFINE2(capget, cap_user_header_t, header, cap_user_data_t, dataptr) */ SYSCALL_DEFINE2(capset, cap_user_header_t, header, const cap_user_data_t, data) { - struct __user_cap_data_struct kdata[_KERNEL_CAPABILITY_U32S]; - unsigned i, tocopy, copybytes; + struct __user_cap_data_struct kdata[2] = { { 0, }, }; + unsigned tocopy, copybytes; kernel_cap_t inheritable, permitted, effective; struct cred *new; int ret; @@ -246,21 +244,9 @@ SYSCALL_DEFINE2(capset, cap_user_header_t, header, const cap_user_data_t, data) if (copy_from_user(&kdata, data, copybytes)) return -EFAULT; - for (i = 0; i < tocopy; i++) { - effective.cap[i] = kdata[i].effective; - permitted.cap[i] = kdata[i].permitted; - inheritable.cap[i] = kdata[i].inheritable; - } - while (i < _KERNEL_CAPABILITY_U32S) { - effective.cap[i] = 0; - permitted.cap[i] = 0; - inheritable.cap[i] = 0; - i++; - } - - effective.cap[CAP_LAST_U32] &= CAP_LAST_U32_VALID_MASK; - permitted.cap[CAP_LAST_U32] &= CAP_LAST_U32_VALID_MASK; - inheritable.cap[CAP_LAST_U32] &= CAP_LAST_U32_VALID_MASK; + effective = mk_kernel_cap(kdata[0].effective, kdata[1].effective); + permitted = mk_kernel_cap(kdata[0].permitted, kdata[1].permitted); + inheritable = mk_kernel_cap(kdata[0].inheritable, kdata[1].inheritable); new = prepare_creds(); if (!new) diff --git a/kernel/crash_core.c b/kernel/crash_core.c index 87ef6096823f..755f5f08ab38 100644 --- a/kernel/crash_core.c +++ b/kernel/crash_core.c @@ -455,8 +455,8 @@ static int __init crash_save_vmcoreinfo_init(void) VMCOREINFO_OFFSET(page, lru); VMCOREINFO_OFFSET(page, _mapcount); VMCOREINFO_OFFSET(page, private); - VMCOREINFO_OFFSET(page, compound_dtor); - VMCOREINFO_OFFSET(page, compound_order); + VMCOREINFO_OFFSET(folio, _folio_dtor); + VMCOREINFO_OFFSET(folio, _folio_order); VMCOREINFO_OFFSET(page, compound_head); VMCOREINFO_OFFSET(pglist_data, node_zones); VMCOREINFO_OFFSET(pglist_data, nr_zones); diff --git a/kernel/dma/swiotlb.c b/kernel/dma/swiotlb.c index a34c38bbe28f..03e3251cd9d2 100644 --- a/kernel/dma/swiotlb.c +++ b/kernel/dma/swiotlb.c @@ -156,14 +156,6 @@ setup_io_tlb_npages(char *str) } early_param("swiotlb", setup_io_tlb_npages); -unsigned int swiotlb_max_segment(void) -{ - if (!io_tlb_default_mem.nslabs) - return 0; - return rounddown(io_tlb_default_mem.nslabs << IO_TLB_SHIFT, PAGE_SIZE); -} -EXPORT_SYMBOL_GPL(swiotlb_max_segment); - unsigned long swiotlb_size_or_default(void) { return default_nslabs << IO_TLB_SHIFT; @@ -300,7 +292,8 @@ static void swiotlb_init_io_tlb_mem(struct io_tlb_mem *mem, phys_addr_t start, return; } -static void *swiotlb_memblock_alloc(unsigned long nslabs, unsigned int flags, +static void __init *swiotlb_memblock_alloc(unsigned long nslabs, + unsigned int flags, int (*remap)(void *tlb, unsigned long nslabs)) { size_t bytes = PAGE_ALIGN(nslabs << IO_TLB_SHIFT); diff --git a/kernel/events/core.c b/kernel/events/core.c index 7099c77bc53b..f79fd8b87f75 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -6568,7 +6568,7 @@ aux_unlock: * Since pinned accounting is per vm we cannot allow fork() to copy our * vma. 
*/ - vma->vm_flags |= VM_DONTCOPY | VM_DONTEXPAND | VM_DONTDUMP; + vm_flags_set(vma, VM_DONTCOPY | VM_DONTEXPAND | VM_DONTDUMP); vma->vm_ops = &perf_mmap_vmops; if (event->pmu->event_mapped) @@ -9418,6 +9418,7 @@ void perf_report_aux_output_id(struct perf_event *event, u64 hw_id) perf_output_end(&handle); } +EXPORT_SYMBOL_GPL(perf_report_aux_output_id); static int __perf_event_account_interrupt(struct perf_event *event, int throttle) diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index d9e357b7e17c..59887c69d54c 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -22,7 +22,6 @@ #include <linux/swap.h> /* folio_free_swap */ #include <linux/ptrace.h> /* user_enable_single_step */ #include <linux/kdebug.h> /* notifier mechanism */ -#include "../../mm/internal.h" /* munlock_vma_page */ #include <linux/percpu-rwsem.h> #include <linux/task_work.h> #include <linux/shmem_fs.h> @@ -161,7 +160,7 @@ static int __replace_page(struct vm_area_struct *vma, unsigned long addr, int err; struct mmu_notifier_range range; - mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, mm, addr, + mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, mm, addr, addr + PAGE_SIZE); if (new_page) { @@ -1352,7 +1351,7 @@ static int delayed_ref_ctr_inc(struct vm_area_struct *vma) } /* - * Called from mmap_region/vma_adjust with mm->mmap_lock acquired. + * Called from mmap_region/vma_merge with mm->mmap_lock acquired. * * Currently we ignore all errors and always return 0, the callers * can't handle the failure anyway. diff --git a/kernel/fail_function.c b/kernel/fail_function.c index a7ccd2930c5f..d971a0189319 100644 --- a/kernel/fail_function.c +++ b/kernel/fail_function.c @@ -163,10 +163,7 @@ static void fei_debugfs_add_attr(struct fei_attr *attr) static void fei_debugfs_remove_attr(struct fei_attr *attr) { - struct dentry *dir; - - dir = debugfs_lookup(attr->kp.symbol_name, fei_debugfs_dir); - debugfs_remove_recursive(dir); + debugfs_lookup_and_remove(attr->kp.symbol_name, fei_debugfs_dir); } static int fei_kprobe_handler(struct kprobe *kp, struct pt_regs *regs) diff --git a/kernel/fork.c b/kernel/fork.c index 038b898dad52..f68954d05e89 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -472,7 +472,7 @@ struct vm_area_struct *vm_area_dup(struct vm_area_struct *orig) * orig->shared.rb may be modified concurrently, but the clone * will be reinitialized. 
*/ - *new = data_race(*orig); + data_race(memcpy(new, orig, sizeof(*new))); INIT_LIST_HEAD(&new->anon_vma_chain); dup_anon_vma_name(orig, new); } @@ -585,8 +585,8 @@ static __latent_entropy int dup_mmap(struct mm_struct *mm, int retval; unsigned long charge = 0; LIST_HEAD(uf); - MA_STATE(old_mas, &oldmm->mm_mt, 0, 0); - MA_STATE(mas, &mm->mm_mt, 0, 0); + VMA_ITERATOR(old_vmi, oldmm, 0); + VMA_ITERATOR(vmi, mm, 0); uprobe_start_dup_mmap(); if (mmap_write_lock_killable(oldmm)) { @@ -613,11 +613,11 @@ static __latent_entropy int dup_mmap(struct mm_struct *mm, goto out; khugepaged_fork(mm, oldmm); - retval = mas_expected_entries(&mas, oldmm->map_count); + retval = vma_iter_bulk_alloc(&vmi, oldmm->map_count); if (retval) goto out; - mas_for_each(&old_mas, mpnt, ULONG_MAX) { + for_each_vma(old_vmi, mpnt) { struct file *file; if (mpnt->vm_flags & VM_DONTCOPY) { @@ -659,7 +659,7 @@ static __latent_entropy int dup_mmap(struct mm_struct *mm, tmp->anon_vma = NULL; } else if (anon_vma_fork(tmp, mpnt)) goto fail_nomem_anon_vma_fork; - tmp->vm_flags &= ~(VM_LOCKED | VM_LOCKONFAULT); + vm_flags_clear(tmp, VM_LOCKED_MASK); file = tmp->vm_file; if (file) { struct address_space *mapping = file->f_mapping; @@ -683,11 +683,8 @@ static __latent_entropy int dup_mmap(struct mm_struct *mm, hugetlb_dup_vma_private(tmp); /* Link the vma into the MT */ - mas.index = tmp->vm_start; - mas.last = tmp->vm_end - 1; - mas_store(&mas, tmp); - if (mas_is_err(&mas)) - goto fail_nomem_mas_store; + if (vma_iter_bulk_store(&vmi, tmp)) + goto fail_nomem_vmi_store; mm->map_count++; if (!(tmp->vm_flags & VM_WIPEONFORK)) @@ -702,7 +699,7 @@ static __latent_entropy int dup_mmap(struct mm_struct *mm, /* a new mm has just been created */ retval = arch_dup_mmap(oldmm, mm); loop_out: - mas_destroy(&mas); + vma_iter_free(&vmi); out: mmap_write_unlock(mm); flush_tlb_mm(oldmm); @@ -712,7 +709,7 @@ fail_uprobe_end: uprobe_end_dup_mmap(); return retval; -fail_nomem_mas_store: +fail_nomem_vmi_store: unlink_anon_vmas(tmp); fail_nomem_anon_vma_fork: mpol_put(vma_policy(tmp)); diff --git a/kernel/gen_kheaders.sh b/kernel/gen_kheaders.sh index 81b97f0f6556..1ef9a87511f5 100755 --- a/kernel/gen_kheaders.sh +++ b/kernel/gen_kheaders.sh @@ -7,7 +7,7 @@ set -e sfile="$(readlink -f "$0")" outdir="$(pwd)" tarfile=$1 -cpio_dir=$outdir/$tarfile.tmp +cpio_dir=$outdir/${tarfile%/*}/.tmp_cpio_dir dir_list=" include/ diff --git a/kernel/hung_task.c b/kernel/hung_task.c index c71889f3f3fc..322813366c6c 100644 --- a/kernel/hung_task.c +++ b/kernel/hung_task.c @@ -142,6 +142,8 @@ static void check_hung_task(struct task_struct *t, unsigned long timeout) if (sysctl_hung_task_all_cpu_backtrace) hung_task_show_all_bt = true; + if (!sysctl_hung_task_warnings) + pr_info("Future hung task reports are suppressed, see sysctl kernel.hung_task_warnings\n"); } touch_nmi_watchdog(); diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c index aa5b7eeeceb8..7d4fc6479062 100644 --- a/kernel/irq/irqdomain.c +++ b/kernel/irq/irqdomain.c @@ -470,31 +470,6 @@ struct irq_domain *irq_find_matching_fwspec(struct irq_fwspec *fwspec, EXPORT_SYMBOL_GPL(irq_find_matching_fwspec); /** - * irq_domain_check_msi_remap - Check whether all MSI irq domains implement - * IRQ remapping - * - * Return: false if any MSI irq domain does not support IRQ remapping, - * true otherwise (including if there is no MSI irq domain) - */ -bool irq_domain_check_msi_remap(void) -{ - struct irq_domain *h; - bool ret = true; - - mutex_lock(&irq_domain_mutex); - list_for_each_entry(h, &irq_domain_list, 
link) { - if (irq_domain_is_msi(h) && - !irq_domain_hierarchical_is_msi_remap(h)) { - ret = false; - break; - } - } - mutex_unlock(&irq_domain_mutex); - return ret; -} -EXPORT_SYMBOL_GPL(irq_domain_check_msi_remap); - -/** * irq_set_default_host() - Set a "default" irq domain * @domain: default domain pointer * @@ -1890,20 +1865,6 @@ static void irq_domain_check_hierarchy(struct irq_domain *domain) if (domain->ops->alloc) domain->flags |= IRQ_DOMAIN_FLAG_HIERARCHY; } - -/** - * irq_domain_hierarchical_is_msi_remap - Check if the domain or any - * parent has MSI remapping support - * @domain: domain pointer - */ -bool irq_domain_hierarchical_is_msi_remap(struct irq_domain *domain) -{ - for (; domain; domain = domain->parent) { - if (irq_domain_is_msi_remap(domain)) - return true; - } - return false; -} #else /* CONFIG_IRQ_DOMAIN_HIERARCHY */ /** * irq_domain_get_irq_data - Get irq_data associated with @virq and @domain diff --git a/kernel/irq/msi.c b/kernel/irq/msi.c index 783a3e6a0b10..d0f0389920c1 100644 --- a/kernel/irq/msi.c +++ b/kernel/irq/msi.c @@ -1627,3 +1627,30 @@ struct msi_domain_info *msi_get_domain_info(struct irq_domain *domain) { return (struct msi_domain_info *)domain->host_data; } + +/** + * msi_device_has_isolated_msi - True if the device has isolated MSI + * @dev: The device to check + * + * Isolated MSI means that HW modeled by an irq_domain on the path from the + * initiating device to the CPU will validate that the MSI message specifies an + * interrupt number that the device is authorized to trigger. This must block + * devices from triggering interrupts they are not authorized to trigger. + * Currently authorization means the MSI vector is one assigned to the device. + * + * This is interesting for securing VFIO use cases where a rouge MSI (eg created + * by abusing a normal PCI MemWr DMA) must not allow the VFIO userspace to + * impact outside its security domain, eg userspace triggering interrupts on + * kernel drivers, a VM triggering interrupts on the hypervisor, or a VM + * triggering interrupts on another VM. + */ +bool msi_device_has_isolated_msi(struct device *dev) +{ + struct irq_domain *domain = dev_get_msi_domain(dev); + + for (; domain; domain = domain->parent) + if (domain->flags & IRQ_DOMAIN_FLAG_ISOLATED_MSI) + return true; + return arch_is_isolated_msi(); +} +EXPORT_SYMBOL_GPL(msi_device_has_isolated_msi); diff --git a/kernel/kcov.c b/kernel/kcov.c index e5cd09fd8a05..84c717337df0 100644 --- a/kernel/kcov.c +++ b/kernel/kcov.c @@ -489,7 +489,7 @@ static int kcov_mmap(struct file *filep, struct vm_area_struct *vma) goto exit; } spin_unlock_irqrestore(&kcov->lock, flags); - vma->vm_flags |= VM_DONTEXPAND; + vm_flags_set(vma, VM_DONTEXPAND); for (off = 0; off < size; off += PAGE_SIZE) { page = vmalloc_to_page(kcov->area + off); res = vm_insert_page(vma, vma->vm_start + off, page); diff --git a/kernel/kexec.c b/kernel/kexec.c index cb8e6e6f983c..92d301f98776 100644 --- a/kernel/kexec.c +++ b/kernel/kexec.c @@ -190,10 +190,12 @@ out_unlock: static inline int kexec_load_check(unsigned long nr_segments, unsigned long flags) { + int image_type = (flags & KEXEC_ON_CRASH) ? + KEXEC_TYPE_CRASH : KEXEC_TYPE_DEFAULT; int result; /* We only trust the superuser with rebooting the system. 
*/ - if (!capable(CAP_SYS_BOOT) || kexec_load_disabled) + if (!kexec_load_permitted(image_type)) return -EPERM; /* Permit LSMs and IMA to fail the kexec */ diff --git a/kernel/kexec_core.c b/kernel/kexec_core.c index b1cf259854ca..3d578c6fefee 100644 --- a/kernel/kexec_core.c +++ b/kernel/kexec_core.c @@ -921,10 +921,64 @@ int kimage_load_segment(struct kimage *image, return result; } +struct kexec_load_limit { + /* Mutex protects the limit count. */ + struct mutex mutex; + int limit; +}; + +static struct kexec_load_limit load_limit_reboot = { + .mutex = __MUTEX_INITIALIZER(load_limit_reboot.mutex), + .limit = -1, +}; + +static struct kexec_load_limit load_limit_panic = { + .mutex = __MUTEX_INITIALIZER(load_limit_panic.mutex), + .limit = -1, +}; + struct kimage *kexec_image; struct kimage *kexec_crash_image; -int kexec_load_disabled; +static int kexec_load_disabled; + #ifdef CONFIG_SYSCTL +static int kexec_limit_handler(struct ctl_table *table, int write, + void *buffer, size_t *lenp, loff_t *ppos) +{ + struct kexec_load_limit *limit = table->data; + int val; + struct ctl_table tmp = { + .data = &val, + .maxlen = sizeof(val), + .mode = table->mode, + }; + int ret; + + if (write) { + ret = proc_dointvec(&tmp, write, buffer, lenp, ppos); + if (ret) + return ret; + + if (val < 0) + return -EINVAL; + + mutex_lock(&limit->mutex); + if (limit->limit != -1 && val >= limit->limit) + ret = -EINVAL; + else + limit->limit = val; + mutex_unlock(&limit->mutex); + + return ret; + } + + mutex_lock(&limit->mutex); + val = limit->limit; + mutex_unlock(&limit->mutex); + + return proc_dointvec(&tmp, write, buffer, lenp, ppos); +} + static struct ctl_table kexec_core_sysctls[] = { { .procname = "kexec_load_disabled", @@ -936,6 +990,18 @@ static struct ctl_table kexec_core_sysctls[] = { .extra1 = SYSCTL_ONE, .extra2 = SYSCTL_ONE, }, + { + .procname = "kexec_load_limit_panic", + .data = &load_limit_panic, + .mode = 0644, + .proc_handler = kexec_limit_handler, + }, + { + .procname = "kexec_load_limit_reboot", + .data = &load_limit_reboot, + .mode = 0644, + .proc_handler = kexec_limit_handler, + }, { } }; @@ -947,6 +1013,32 @@ static int __init kexec_core_sysctl_init(void) late_initcall(kexec_core_sysctl_init); #endif +bool kexec_load_permitted(int kexec_image_type) +{ + struct kexec_load_limit *limit; + + /* + * Only the superuser can use the kexec syscall and if it has not + * been disabled. + */ + if (!capable(CAP_SYS_BOOT) || kexec_load_disabled) + return false; + + /* Check limit counter and decrease it.*/ + limit = (kexec_image_type == KEXEC_TYPE_CRASH) ? + &load_limit_panic : &load_limit_reboot; + mutex_lock(&limit->mutex); + if (!limit->limit) { + mutex_unlock(&limit->mutex); + return false; + } + if (limit->limit != -1) + limit->limit--; + mutex_unlock(&limit->mutex); + + return true; +} + /* * No panic_cpu check version of crash_kexec(). This function is called * only when panic_cpu holds the current CPU number; this is the only CPU diff --git a/kernel/kexec_file.c b/kernel/kexec_file.c index dd5983010b7b..f1a0e4e3fb5c 100644 --- a/kernel/kexec_file.c +++ b/kernel/kexec_file.c @@ -326,11 +326,13 @@ SYSCALL_DEFINE5(kexec_file_load, int, kernel_fd, int, initrd_fd, unsigned long, cmdline_len, const char __user *, cmdline_ptr, unsigned long, flags) { - int ret = 0, i; + int image_type = (flags & KEXEC_FILE_ON_CRASH) ? + KEXEC_TYPE_CRASH : KEXEC_TYPE_DEFAULT; struct kimage **dest_image, *image; + int ret = 0, i; /* We only trust the superuser with rebooting the system. 
*/ - if (!capable(CAP_SYS_BOOT) || kexec_load_disabled) + if (!kexec_load_permitted(image_type)) return -EPERM; /* Make sure we have a legal set of flags */ @@ -342,11 +344,12 @@ SYSCALL_DEFINE5(kexec_file_load, int, kernel_fd, int, initrd_fd, if (!kexec_trylock()) return -EBUSY; - dest_image = &kexec_image; - if (flags & KEXEC_FILE_ON_CRASH) { + if (image_type == KEXEC_TYPE_CRASH) { dest_image = &kexec_crash_image; if (kexec_crash_image) arch_kexec_unprotect_crashkres(); + } else { + dest_image = &kexec_image; } if (flags & KEXEC_FILE_UNLOAD) diff --git a/kernel/kprobes.c b/kernel/kprobes.c index 1c18ecf9f98b..00e177de91cc 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c @@ -458,7 +458,7 @@ static inline int kprobe_optready(struct kprobe *p) } /* Return true if the kprobe is disarmed. Note: p must be on hash list */ -static inline bool kprobe_disarmed(struct kprobe *p) +bool kprobe_disarmed(struct kprobe *p) { struct optimized_kprobe *op; @@ -555,17 +555,15 @@ static void do_unoptimize_kprobes(void) /* See comment in do_optimize_kprobes() */ lockdep_assert_cpus_held(); - /* Unoptimization must be done anytime */ - if (list_empty(&unoptimizing_list)) - return; + if (!list_empty(&unoptimizing_list)) + arch_unoptimize_kprobes(&unoptimizing_list, &freeing_list); - arch_unoptimize_kprobes(&unoptimizing_list, &freeing_list); - /* Loop on 'freeing_list' for disarming */ + /* Loop on 'freeing_list' for disarming and removing from kprobe hash list */ list_for_each_entry_safe(op, tmp, &freeing_list, list) { /* Switching from detour code to origin */ op->kp.flags &= ~KPROBE_FLAG_OPTIMIZED; - /* Disarm probes if marked disabled */ - if (kprobe_disabled(&op->kp)) + /* Disarm probes if marked disabled and not gone */ + if (kprobe_disabled(&op->kp) && !kprobe_gone(&op->kp)) arch_disarm_kprobe(&op->kp); if (kprobe_unused(&op->kp)) { /* @@ -662,7 +660,7 @@ void wait_for_kprobe_optimizer(void) mutex_unlock(&kprobe_mutex); } -static bool optprobe_queued_unopt(struct optimized_kprobe *op) +bool optprobe_queued_unopt(struct optimized_kprobe *op) { struct optimized_kprobe *_op; @@ -797,14 +795,13 @@ static void kill_optimized_kprobe(struct kprobe *p) op->kp.flags &= ~KPROBE_FLAG_OPTIMIZED; if (kprobe_unused(p)) { - /* Enqueue if it is unused */ - list_add(&op->list, &freeing_list); /* - * Remove unused probes from the hash list. After waiting - * for synchronization, this probe is reclaimed. - * (reclaiming is done by do_free_cleaned_kprobes().) + * Unused kprobe is on unoptimizing or freeing list. We move it + * to freeing_list and let the kprobe_optimizer() remove it from + * the kprobe hash list and free it. */ - hlist_del_rcu(&op->kp.hlist); + if (optprobe_queued_unopt(op)) + list_move(&op->list, &freeing_list); } /* Don't touch the code, because it is already freed. 
*/ diff --git a/kernel/ksysfs.c b/kernel/ksysfs.c index 2df00b789b90..0408aab80941 100644 --- a/kernel/ksysfs.c +++ b/kernel/ksysfs.c @@ -51,6 +51,14 @@ static ssize_t cpu_byteorder_show(struct kobject *kobj, } KERNEL_ATTR_RO(cpu_byteorder); +/* address bits */ +static ssize_t address_bits_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + return sysfs_emit(buf, "%zu\n", sizeof(void *) * 8 /* CHAR_BIT */); +} +KERNEL_ATTR_RO(address_bits); + #ifdef CONFIG_UEVENT_HELPER /* uevent helper program, used during early boot */ static ssize_t uevent_helper_show(struct kobject *kobj, @@ -233,6 +241,7 @@ static struct attribute * kernel_attrs[] = { &fscaps_attr.attr, &uevent_seqnum_attr.attr, &cpu_byteorder_attr.attr, + &address_bits_attr.attr, #ifdef CONFIG_UEVENT_HELPER &uevent_helper_attr.attr, #endif diff --git a/kernel/kthread.c b/kernel/kthread.c index f97fd01a2932..7e6751b29101 100644 --- a/kernel/kthread.c +++ b/kernel/kthread.c @@ -1382,6 +1382,10 @@ EXPORT_SYMBOL_GPL(kthread_flush_worker); * Flush and destroy @worker. The simple flush is enough because the kthread * worker API is used only in trivial scenarios. There are no multi-step state * machines needed. + * + * Note that this function is not responsible for handling delayed work, so + * caller should be responsible for queuing or canceling all delayed work items + * before invoke this function. */ void kthread_destroy_worker(struct kthread_worker *worker) { @@ -1393,6 +1397,7 @@ void kthread_destroy_worker(struct kthread_worker *worker) kthread_flush_worker(worker); kthread_stop(task); + WARN_ON(!list_empty(&worker->delayed_work_list)); WARN_ON(!list_empty(&worker->work_list)); kfree(worker); } diff --git a/kernel/livepatch/core.c b/kernel/livepatch/core.c index c973ed9e42f8..4bd2d5e10f20 100644 --- a/kernel/livepatch/core.c +++ b/kernel/livepatch/core.c @@ -260,6 +260,14 @@ static int klp_resolve_symbols(Elf_Shdr *sechdrs, const char *strtab, return 0; } +void __weak clear_relocate_add(Elf_Shdr *sechdrs, + const char *strtab, + unsigned int symindex, + unsigned int relsec, + struct module *me) +{ +} + /* * At a high-level, there are two types of klp relocation sections: those which * reference symbols which live in vmlinux; and those which reference symbols @@ -283,10 +291,10 @@ static int klp_resolve_symbols(Elf_Shdr *sechdrs, const char *strtab, * the to-be-patched module to be loaded and patched sometime *after* the * klp module is loaded. */ -int klp_apply_section_relocs(struct module *pmod, Elf_Shdr *sechdrs, - const char *shstrtab, const char *strtab, - unsigned int symndx, unsigned int secndx, - const char *objname) +static int klp_write_section_relocs(struct module *pmod, Elf_Shdr *sechdrs, + const char *shstrtab, const char *strtab, + unsigned int symndx, unsigned int secndx, + const char *objname, bool apply) { int cnt, ret; char sec_objname[MODULE_NAME_LEN]; @@ -308,11 +316,26 @@ int klp_apply_section_relocs(struct module *pmod, Elf_Shdr *sechdrs, if (strcmp(objname ? 
objname : "vmlinux", sec_objname)) return 0; - ret = klp_resolve_symbols(sechdrs, strtab, symndx, sec, sec_objname); - if (ret) - return ret; + if (apply) { + ret = klp_resolve_symbols(sechdrs, strtab, symndx, + sec, sec_objname); + if (ret) + return ret; + + return apply_relocate_add(sechdrs, strtab, symndx, secndx, pmod); + } + + clear_relocate_add(sechdrs, strtab, symndx, secndx, pmod); + return 0; +} - return apply_relocate_add(sechdrs, strtab, symndx, secndx, pmod); +int klp_apply_section_relocs(struct module *pmod, Elf_Shdr *sechdrs, + const char *shstrtab, const char *strtab, + unsigned int symndx, unsigned int secndx, + const char *objname) +{ + return klp_write_section_relocs(pmod, sechdrs, shstrtab, strtab, symndx, + secndx, objname, true); } /* @@ -761,8 +784,9 @@ static int klp_init_func(struct klp_object *obj, struct klp_func *func) func->old_sympos ? func->old_sympos : 1); } -static int klp_apply_object_relocs(struct klp_patch *patch, - struct klp_object *obj) +static int klp_write_object_relocs(struct klp_patch *patch, + struct klp_object *obj, + bool apply) { int i, ret; struct klp_modinfo *info = patch->mod->klp_info; @@ -773,10 +797,10 @@ static int klp_apply_object_relocs(struct klp_patch *patch, if (!(sec->sh_flags & SHF_RELA_LIVEPATCH)) continue; - ret = klp_apply_section_relocs(patch->mod, info->sechdrs, + ret = klp_write_section_relocs(patch->mod, info->sechdrs, info->secstrings, patch->mod->core_kallsyms.strtab, - info->symndx, i, obj->name); + info->symndx, i, obj->name, apply); if (ret) return ret; } @@ -784,6 +808,18 @@ static int klp_apply_object_relocs(struct klp_patch *patch, return 0; } +static int klp_apply_object_relocs(struct klp_patch *patch, + struct klp_object *obj) +{ + return klp_write_object_relocs(patch, obj, true); +} + +static void klp_clear_object_relocs(struct klp_patch *patch, + struct klp_object *obj) +{ + klp_write_object_relocs(patch, obj, false); +} + /* parts of the initialization that is done only when the object is loaded */ static int klp_init_object_loaded(struct klp_patch *patch, struct klp_object *obj) @@ -1171,7 +1207,7 @@ static void klp_cleanup_module_patches_limited(struct module *mod, klp_unpatch_object(obj); klp_post_unpatch_callback(obj); - + klp_clear_object_relocs(patch, obj); klp_free_object_loaded(obj); break; } diff --git a/kernel/module/main.c b/kernel/module/main.c index 4ac3fe43e6c8..d3be89de706d 100644 --- a/kernel/module/main.c +++ b/kernel/module/main.c @@ -17,6 +17,7 @@ #include <linux/fs.h> #include <linux/kernel.h> #include <linux/kernel_read_file.h> +#include <linux/kstrtox.h> #include <linux/slab.h> #include <linux/vmalloc.h> #include <linux/elf.h> @@ -2675,7 +2676,7 @@ static int unknown_module_param_cb(char *param, char *val, const char *modname, int ret; if (strcmp(param, "async_probe") == 0) { - if (strtobool(val, &mod->async_probe_requested)) + if (kstrtobool(val, &mod->async_probe_requested)) mod->async_probe_requested = true; return 0; } diff --git a/kernel/params.c b/kernel/params.c index 14d66070757b..6e34ca89ebae 100644 --- a/kernel/params.c +++ b/kernel/params.c @@ -4,6 +4,7 @@ */ #include <linux/kernel.h> +#include <linux/kstrtox.h> #include <linux/string.h> #include <linux/errno.h> #include <linux/module.h> @@ -310,7 +311,7 @@ int param_set_bool(const char *val, const struct kernel_param *kp) if (!val) val = "1"; /* One of =[yYnN01] */ - return strtobool(val, kp->arg); + return kstrtobool(val, kp->arg); } EXPORT_SYMBOL(param_set_bool); diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c 
index fc21c5d5fd5d..46e0d5a3f91f 100644 --- a/kernel/pid_namespace.c +++ b/kernel/pid_namespace.c @@ -23,6 +23,7 @@ #include <linux/sched/task.h> #include <linux/sched/signal.h> #include <linux/idr.h> +#include "pid_sysctl.h" static DEFINE_MUTEX(pid_caches_mutex); static struct kmem_cache *pid_ns_cachep; @@ -110,6 +111,8 @@ static struct pid_namespace *create_pid_namespace(struct user_namespace *user_ns ns->ucounts = ucounts; ns->pid_allocated = PIDNS_ADDING; + initialize_memfd_noexec_scope(ns); + return ns; out_free_idr: @@ -472,6 +475,8 @@ static __init int pid_namespaces_init(void) #ifdef CONFIG_CHECKPOINT_RESTORE register_sysctl_paths(kern_path, pid_ns_ctl_table); #endif + + register_pid_ns_sysctl_table_vm(); return 0; } diff --git a/kernel/pid_sysctl.h b/kernel/pid_sysctl.h new file mode 100644 index 000000000000..e22d072e1e24 --- /dev/null +++ b/kernel/pid_sysctl.h @@ -0,0 +1,60 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef LINUX_PID_SYSCTL_H +#define LINUX_PID_SYSCTL_H + +#include <linux/pid_namespace.h> + +#if defined(CONFIG_SYSCTL) && defined(CONFIG_MEMFD_CREATE) +static inline void initialize_memfd_noexec_scope(struct pid_namespace *ns) +{ + ns->memfd_noexec_scope = + task_active_pid_ns(current)->memfd_noexec_scope; +} + +static int pid_mfd_noexec_dointvec_minmax(struct ctl_table *table, + int write, void *buf, size_t *lenp, loff_t *ppos) +{ + struct pid_namespace *ns = task_active_pid_ns(current); + struct ctl_table table_copy; + + if (write && !ns_capable(ns->user_ns, CAP_SYS_ADMIN)) + return -EPERM; + + table_copy = *table; + if (ns != &init_pid_ns) + table_copy.data = &ns->memfd_noexec_scope; + + /* + * set minimum to current value, the effect is only bigger + * value is accepted. + */ + if (*(int *)table_copy.data > *(int *)table_copy.extra1) + table_copy.extra1 = table_copy.data; + + return proc_dointvec_minmax(&table_copy, write, buf, lenp, ppos); +} + +static struct ctl_table pid_ns_ctl_table_vm[] = { + { + .procname = "memfd_noexec", + .data = &init_pid_ns.memfd_noexec_scope, + .maxlen = sizeof(init_pid_ns.memfd_noexec_scope), + .mode = 0644, + .proc_handler = pid_mfd_noexec_dointvec_minmax, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_TWO, + }, + { } +}; +static struct ctl_path vm_path[] = { { .procname = "vm", }, { } }; +static inline void register_pid_ns_sysctl_table_vm(void) +{ + register_sysctl_paths(vm_path, pid_ns_ctl_table_vm); +} +#else +static inline void initialize_memfd_noexec_scope(struct pid_namespace *ns) {} +static inline void set_memfd_noexec_scope(struct pid_namespace *ns) {} +static inline void register_pid_ns_sysctl_table_vm(void) {} +#endif + +#endif /* LINUX_PID_SYSCTL_H */ diff --git a/kernel/printk/index.c b/kernel/printk/index.c index c85be186a783..a6b27526baaf 100644 --- a/kernel/printk/index.c +++ b/kernel/printk/index.c @@ -145,7 +145,7 @@ static void pi_create_file(struct module *mod) #ifdef CONFIG_MODULES static void pi_remove_file(struct module *mod) { - debugfs_remove(debugfs_lookup(pi_get_module_name(mod), dfs_index)); + debugfs_lookup_and_remove(pi_get_module_name(mod), dfs_index); } static int pi_module_notify(struct notifier_block *nb, unsigned long op, diff --git a/kernel/printk/internal.h b/kernel/printk/internal.h index d947ca6c84f9..2a17704136f1 100644 --- a/kernel/printk/internal.h +++ b/kernel/printk/internal.h @@ -14,6 +14,21 @@ int devkmsg_sysctl_set_loglvl(struct ctl_table *table, int write, #ifdef CONFIG_PRINTK +#ifdef CONFIG_PRINTK_CALLER +#define PRINTK_PREFIX_MAX 48 +#else +#define PRINTK_PREFIX_MAX 32 +#endif + +/* + 
* the maximum size of a formatted record (i.e. with prefix added + * per line and dropped messages or in extended message format) + */ +#define PRINTK_MESSAGE_MAX 2048 + +/* the maximum size allowed to be reserved for a record */ +#define PRINTKRB_RECORD_MAX 1024 + /* Flags for a single printk record. */ enum printk_info_flags { LOG_NEWLINE = 2, /* text ended with a newline */ @@ -48,6 +63,10 @@ u16 printk_parse_prefix(const char *text, int *level, enum printk_info_flags *flags); #else +#define PRINTK_PREFIX_MAX 0 +#define PRINTK_MESSAGE_MAX 0 +#define PRINTKRB_RECORD_MAX 0 + /* * In !PRINTK builds we still export console_sem * semaphore and some of console functions (console_unlock()/etc.), so @@ -58,3 +77,29 @@ u16 printk_parse_prefix(const char *text, int *level, static inline bool printk_percpu_data_ready(void) { return false; } #endif /* CONFIG_PRINTK */ + +/** + * struct printk_buffers - Buffers to read/format/output printk messages. + * @outbuf: After formatting, contains text to output. + * @scratchbuf: Used as temporary ringbuffer reading and string-print space. + */ +struct printk_buffers { + char outbuf[PRINTK_MESSAGE_MAX]; + char scratchbuf[PRINTKRB_RECORD_MAX]; +}; + +/** + * struct printk_message - Container for a prepared printk message. + * @pbufs: printk buffers used to prepare the message. + * @outbuf_len: The length of prepared text in @pbufs->outbuf to output. This + * does not count the terminator. A value of 0 means there is + * nothing to output and this record should be skipped. + * @seq: The sequence number of the record used for @pbufs->outbuf. + * @dropped: The number of dropped records from reading @seq. + */ +struct printk_message { + struct printk_buffers *pbufs; + unsigned int outbuf_len; + u64 seq; + unsigned long dropped; +}; diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index 94f136b25f6a..fd0c9f913940 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -466,21 +466,6 @@ static struct latched_seq clear_seq = { .val[1] = 0, }; -#ifdef CONFIG_PRINTK_CALLER -#define PREFIX_MAX 48 -#else -#define PREFIX_MAX 32 -#endif - -/* the maximum size of a formatted record (i.e. with prefix added per line) */ -#define CONSOLE_LOG_MAX 1024 - -/* the maximum size for a dropped text message */ -#define DROPPED_TEXT_MAX 64 - -/* the maximum size allowed to be reserved for a record */ -#define LOG_LINE_MAX (CONSOLE_LOG_MAX - PREFIX_MAX) - #define LOG_LEVEL(v) ((v) & 0x07) #define LOG_FACILITY(v) ((v) >> 3 & 0xff) @@ -711,16 +696,15 @@ out: return len; } +static bool printk_get_next_message(struct printk_message *pmsg, u64 seq, + bool is_extended, bool may_supress); + /* /dev/kmsg - userspace message inject/listen interface */ struct devkmsg_user { atomic64_t seq; struct ratelimit_state rs; struct mutex lock; - char buf[CONSOLE_EXT_LOG_MAX]; - - struct printk_info info; - char text_buf[CONSOLE_EXT_LOG_MAX]; - struct printk_record record; + struct printk_buffers pbufs; }; static __printf(3, 4) __cold @@ -746,7 +730,7 @@ static ssize_t devkmsg_write(struct kiocb *iocb, struct iov_iter *from) size_t len = iov_iter_count(from); ssize_t ret = len; - if (!user || len > LOG_LINE_MAX) + if (!user || len > PRINTKRB_RECORD_MAX) return -EINVAL; /* Ignore when user logging is disabled. 
*/ @@ -802,8 +786,10 @@ static ssize_t devkmsg_read(struct file *file, char __user *buf, size_t count, loff_t *ppos) { struct devkmsg_user *user = file->private_data; - struct printk_record *r = &user->record; - size_t len; + char *outbuf = &user->pbufs.outbuf[0]; + struct printk_message pmsg = { + .pbufs = &user->pbufs, + }; ssize_t ret; if (!user) @@ -813,7 +799,7 @@ static ssize_t devkmsg_read(struct file *file, char __user *buf, if (ret) return ret; - if (!prb_read_valid(prb, atomic64_read(&user->seq), r)) { + if (!printk_get_next_message(&pmsg, atomic64_read(&user->seq), true, false)) { if (file->f_flags & O_NONBLOCK) { ret = -EAGAIN; goto out; @@ -830,36 +816,31 @@ static ssize_t devkmsg_read(struct file *file, char __user *buf, * This pairs with __wake_up_klogd:A. */ ret = wait_event_interruptible(log_wait, - prb_read_valid(prb, - atomic64_read(&user->seq), r)); /* LMM(devkmsg_read:A) */ + printk_get_next_message(&pmsg, atomic64_read(&user->seq), true, + false)); /* LMM(devkmsg_read:A) */ if (ret) goto out; } - if (r->info->seq != atomic64_read(&user->seq)) { + if (pmsg.dropped) { /* our last seen message is gone, return error and reset */ - atomic64_set(&user->seq, r->info->seq); + atomic64_set(&user->seq, pmsg.seq); ret = -EPIPE; goto out; } - len = info_print_ext_header(user->buf, sizeof(user->buf), r->info); - len += msg_print_ext_body(user->buf + len, sizeof(user->buf) - len, - &r->text_buf[0], r->info->text_len, - &r->info->dev_info); - - atomic64_set(&user->seq, r->info->seq + 1); + atomic64_set(&user->seq, pmsg.seq + 1); - if (len > count) { + if (pmsg.outbuf_len > count) { ret = -EINVAL; goto out; } - if (copy_to_user(buf, user->buf, len)) { + if (copy_to_user(buf, outbuf, pmsg.outbuf_len)) { ret = -EFAULT; goto out; } - ret = len; + ret = pmsg.outbuf_len; out: mutex_unlock(&user->lock); return ret; @@ -953,9 +934,6 @@ static int devkmsg_open(struct inode *inode, struct file *file) mutex_init(&user->lock); - prb_rec_init_rd(&user->record, &user->info, - &user->text_buf[0], sizeof(user->text_buf)); - atomic64_set(&user->seq, prb_first_valid_seq(prb)); file->private_data = user; @@ -1150,7 +1128,7 @@ static unsigned int __init add_to_rb(struct printk_ringbuffer *rb, return prb_record_text_space(&e); } -static char setup_text_buf[LOG_LINE_MAX] __initdata; +static char setup_text_buf[PRINTKRB_RECORD_MAX] __initdata; void __init setup_log_buf(int early) { @@ -1416,7 +1394,7 @@ static size_t record_print_text(struct printk_record *r, bool syslog, size_t text_len = r->info->text_len; size_t buf_size = r->text_buf_size; char *text = r->text_buf; - char prefix[PREFIX_MAX]; + char prefix[PRINTK_PREFIX_MAX]; bool truncated = false; size_t prefix_len; size_t line_len; @@ -1515,7 +1493,7 @@ static size_t get_record_print_text_size(struct printk_info *info, unsigned int line_count, bool syslog, bool time) { - char prefix[PREFIX_MAX]; + char prefix[PRINTK_PREFIX_MAX]; size_t prefix_len; prefix_len = info_print_prefix(info, syslog, time, prefix); @@ -1581,11 +1559,11 @@ static int syslog_print(char __user *buf, int size) int len = 0; u64 seq; - text = kmalloc(CONSOLE_LOG_MAX, GFP_KERNEL); + text = kmalloc(PRINTK_MESSAGE_MAX, GFP_KERNEL); if (!text) return -ENOMEM; - prb_rec_init_rd(&r, &info, text, CONSOLE_LOG_MAX); + prb_rec_init_rd(&r, &info, text, PRINTK_MESSAGE_MAX); mutex_lock(&syslog_lock); @@ -1686,7 +1664,7 @@ static int syslog_print_all(char __user *buf, int size, bool clear) u64 seq; bool time; - text = kmalloc(CONSOLE_LOG_MAX, GFP_KERNEL); + text = kmalloc(PRINTK_MESSAGE_MAX, 
GFP_KERNEL); if (!text) return -ENOMEM; @@ -1698,7 +1676,7 @@ static int syslog_print_all(char __user *buf, int size, bool clear) seq = find_first_fitting_seq(latched_seq_read_nolock(&clear_seq), -1, size, true, time); - prb_rec_init_rd(&r, &info, text, CONSOLE_LOG_MAX); + prb_rec_init_rd(&r, &info, text, PRINTK_MESSAGE_MAX); len = 0; prb_for_each_record(seq, prb, seq, &r) { @@ -2013,27 +1991,6 @@ static int console_trylock_spinning(void) } /* - * Call the specified console driver, asking it to write out the specified - * text and length. If @dropped_text is non-NULL and any records have been - * dropped, a dropped message will be written out first. - */ -static void call_console_driver(struct console *con, const char *text, size_t len, - char *dropped_text) -{ - size_t dropped_len; - - if (con->dropped && dropped_text) { - dropped_len = snprintf(dropped_text, DROPPED_TEXT_MAX, - "** %lu printk messages dropped **\n", - con->dropped); - con->dropped = 0; - con->write(con, dropped_text, dropped_len); - } - - con->write(con, text, len); -} - -/* * Recursion is tracked separately on each CPU. If NMIs are supported, an * additional NMI context per CPU is also separately tracked. Until per-CPU * is available, a separate "early tracking" is performed. @@ -2243,8 +2200,8 @@ int vprintk_store(int facility, int level, reserve_size = vsnprintf(&prefix_buf[0], sizeof(prefix_buf), fmt, args2) + 1; va_end(args2); - if (reserve_size > LOG_LINE_MAX) - reserve_size = LOG_LINE_MAX; + if (reserve_size > PRINTKRB_RECORD_MAX) + reserve_size = PRINTKRB_RECORD_MAX; /* Extract log level or control flags. */ if (facility == 0) @@ -2258,7 +2215,7 @@ int vprintk_store(int facility, int level, if (flags & LOG_CONT) { prb_rec_init_wr(&r, reserve_size); - if (prb_reserve_in_last(&e, prb, &r, caller_id, LOG_LINE_MAX)) { + if (prb_reserve_in_last(&e, prb, &r, caller_id, PRINTKRB_RECORD_MAX)) { text_len = printk_sprint(&r.text_buf[r.info->text_len], reserve_size, facility, &flags, fmt, args); r.info->text_len += text_len; @@ -2389,8 +2346,6 @@ static bool __pr_flush(struct console *con, int timeout_ms, bool reset_on_progre #else /* CONFIG_PRINTK */ -#define CONSOLE_LOG_MAX 0 -#define DROPPED_TEXT_MAX 0 #define printk_time false #define prb_read_valid(rb, seq, r) false @@ -2414,10 +2369,6 @@ static ssize_t msg_print_ext_body(char *buf, size_t size, struct dev_printk_info *dev_info) { return 0; } static void console_lock_spinning_enable(void) { } static int console_lock_spinning_disable_and_check(int cookie) { return 0; } -static void call_console_driver(struct console *con, const char *text, size_t len, - char *dropped_text) -{ -} static bool suppress_message_printing(int level) { return false; } static bool pr_flush(int timeout_ms, bool reset_on_progress) { return true; } static bool __pr_flush(struct console *con, int timeout_ms, bool reset_on_progress) { return true; } @@ -2744,16 +2695,136 @@ static void __console_unlock(void) } /* - * Print one record for the given console. The record printed is whatever - * record is the next available record for the given console. + * Prepend the message in @pmsg->pbufs->outbuf with a "dropped message". This + * is achieved by shifting the existing message over and inserting the dropped + * message. + * + * @pmsg is the printk message to prepend. * - * @text is a buffer of size CONSOLE_LOG_MAX. + * @dropped is the dropped count to report in the dropped message. * - * If extended messages should be printed, @ext_text is a buffer of size - * CONSOLE_EXT_LOG_MAX. 
Otherwise @ext_text must be NULL. + * If the message text in @pmsg->pbufs->outbuf does not have enough space for + * the dropped message, the message text will be sufficiently truncated. * - * If dropped messages should be printed, @dropped_text is a buffer of size - * DROPPED_TEXT_MAX. Otherwise @dropped_text must be NULL. + * If @pmsg->pbufs->outbuf is modified, @pmsg->outbuf_len is updated. + */ +#ifdef CONFIG_PRINTK +static void console_prepend_dropped(struct printk_message *pmsg, unsigned long dropped) +{ + struct printk_buffers *pbufs = pmsg->pbufs; + const size_t scratchbuf_sz = sizeof(pbufs->scratchbuf); + const size_t outbuf_sz = sizeof(pbufs->outbuf); + char *scratchbuf = &pbufs->scratchbuf[0]; + char *outbuf = &pbufs->outbuf[0]; + size_t len; + + len = scnprintf(scratchbuf, scratchbuf_sz, + "** %lu printk messages dropped **\n", dropped); + + /* + * Make sure outbuf is sufficiently large before prepending. + * Keep at least the prefix when the message must be truncated. + * It is a rather theoretical problem when someone tries to + * use a minimalist buffer. + */ + if (WARN_ON_ONCE(len + PRINTK_PREFIX_MAX >= outbuf_sz)) + return; + + if (pmsg->outbuf_len + len >= outbuf_sz) { + /* Truncate the message, but keep it terminated. */ + pmsg->outbuf_len = outbuf_sz - (len + 1); + outbuf[pmsg->outbuf_len] = 0; + } + + memmove(outbuf + len, outbuf, pmsg->outbuf_len + 1); + memcpy(outbuf, scratchbuf, len); + pmsg->outbuf_len += len; +} +#else +#define console_prepend_dropped(pmsg, dropped) +#endif /* CONFIG_PRINTK */ + +/* + * Read and format the specified record (or a later record if the specified + * record is not available). + * + * @pmsg will contain the formatted result. @pmsg->pbufs must point to a + * struct printk_buffers. + * + * @seq is the record to read and format. If it is not available, the next + * valid record is read. + * + * @is_extended specifies if the message should be formatted for extended + * console output. + * + * @may_supress specifies if records may be skipped based on loglevel. + * + * Returns false if no record is available. Otherwise true and all fields + * of @pmsg are valid. (See the documentation of struct printk_message + * for information about the @pmsg fields.) + */ +static bool printk_get_next_message(struct printk_message *pmsg, u64 seq, + bool is_extended, bool may_suppress) +{ + static int panic_console_dropped; + + struct printk_buffers *pbufs = pmsg->pbufs; + const size_t scratchbuf_sz = sizeof(pbufs->scratchbuf); + const size_t outbuf_sz = sizeof(pbufs->outbuf); + char *scratchbuf = &pbufs->scratchbuf[0]; + char *outbuf = &pbufs->outbuf[0]; + struct printk_info info; + struct printk_record r; + size_t len = 0; + + /* + * Formatting extended messages requires a separate buffer, so use the + * scratch buffer to read in the ringbuffer text. + * + * Formatting normal messages is done in-place, so read the ringbuffer + * text directly into the output buffer. + */ + if (is_extended) + prb_rec_init_rd(&r, &info, scratchbuf, scratchbuf_sz); + else + prb_rec_init_rd(&r, &info, outbuf, outbuf_sz); + + if (!prb_read_valid(prb, seq, &r)) + return false; + + pmsg->seq = r.info->seq; + pmsg->dropped = r.info->seq - seq; + + /* + * Check for dropped messages in panic here so that printk + * suppression can occur as early as possible if necessary. + */ + if (pmsg->dropped && + panic_in_progress() && + panic_console_dropped++ > 10) { + suppress_panic_printk = 1; + pr_warn_once("Too many dropped messages. 
Suppress messages on non-panic CPUs to prevent livelock.\n"); + } + + /* Skip record that has level above the console loglevel. */ + if (may_suppress && suppress_message_printing(r.info->level)) + goto out; + + if (is_extended) { + len = info_print_ext_header(outbuf, outbuf_sz, r.info); + len += msg_print_ext_body(outbuf + len, outbuf_sz - len, + &r.text_buf[0], r.info->text_len, &r.info->dev_info); + } else { + len = record_print_text(&r, console_msg_format & MSG_FORMAT_SYSLOG, printk_time); + } +out: + pmsg->outbuf_len = len; + return true; +} + +/* + * Print one record for the given console. The record printed is whatever + * record is the next available record for the given console. * * @handover will be set to true if a printk waiter has taken over the * console_lock, in which case the caller is no longer holding both the @@ -2766,46 +2837,33 @@ static void __console_unlock(void) * * Requires the console_lock and the SRCU read lock. */ -static bool console_emit_next_record(struct console *con, char *text, char *ext_text, - char *dropped_text, bool *handover, int cookie) +static bool console_emit_next_record(struct console *con, bool *handover, int cookie) { - static int panic_console_dropped; - struct printk_info info; - struct printk_record r; - unsigned long flags; - char *write_text; - size_t len; + static struct printk_buffers pbufs; - prb_rec_init_rd(&r, &info, text, CONSOLE_LOG_MAX); + bool is_extended = console_srcu_read_flags(con) & CON_EXTENDED; + char *outbuf = &pbufs.outbuf[0]; + struct printk_message pmsg = { + .pbufs = &pbufs, + }; + unsigned long flags; *handover = false; - if (!prb_read_valid(prb, con->seq, &r)) + if (!printk_get_next_message(&pmsg, con->seq, is_extended, true)) return false; - if (con->seq != r.info->seq) { - con->dropped += r.info->seq - con->seq; - con->seq = r.info->seq; - if (panic_in_progress() && panic_console_dropped++ > 10) { - suppress_panic_printk = 1; - pr_warn_once("Too many dropped messages. Suppress messages on non-panic CPUs to prevent livelock.\n"); - } - } + con->dropped += pmsg.dropped; - /* Skip record that has level above the console loglevel. */ - if (suppress_message_printing(r.info->level)) { - con->seq++; + /* Skip messages of formatted length 0. */ + if (pmsg.outbuf_len == 0) { + con->seq = pmsg.seq + 1; goto skip; } - if (ext_text) { - write_text = ext_text; - len = info_print_ext_header(ext_text, CONSOLE_EXT_LOG_MAX, r.info); - len += msg_print_ext_body(ext_text + len, CONSOLE_EXT_LOG_MAX - len, - &r.text_buf[0], r.info->text_len, &r.info->dev_info); - } else { - write_text = text; - len = record_print_text(&r, console_msg_format & MSG_FORMAT_SYSLOG, printk_time); + if (con->dropped && !is_extended) { + console_prepend_dropped(&pmsg, con->dropped); + con->dropped = 0; } /* @@ -2821,11 +2879,15 @@ static bool console_emit_next_record(struct console *con, char *text, char *ext_ printk_safe_enter_irqsave(flags); console_lock_spinning_enable(); - stop_critical_timings(); /* don't trace print latency */ - call_console_driver(con, write_text, len, dropped_text); + /* Do not trace print latency. */ + stop_critical_timings(); + + /* Write everything out to the hardware. 
*/ + con->write(con, outbuf, pmsg.outbuf_len); + start_critical_timings(); - con->seq++; + con->seq = pmsg.seq + 1; *handover = console_lock_spinning_disable_and_check(cookie); printk_safe_exit_irqrestore(flags); @@ -2858,9 +2920,6 @@ skip: */ static bool console_flush_all(bool do_cond_resched, u64 *next_seq, bool *handover) { - static char dropped_text[DROPPED_TEXT_MAX]; - static char ext_text[CONSOLE_EXT_LOG_MAX]; - static char text[CONSOLE_LOG_MAX]; bool any_usable = false; struct console *con; bool any_progress; @@ -2880,16 +2939,7 @@ static bool console_flush_all(bool do_cond_resched, u64 *next_seq, bool *handove continue; any_usable = true; - if (console_srcu_read_flags(con) & CON_EXTENDED) { - /* Extended consoles do not print "dropped messages". */ - progress = console_emit_next_record(con, &text[0], - &ext_text[0], NULL, - handover, cookie); - } else { - progress = console_emit_next_record(con, &text[0], - NULL, &dropped_text[0], - handover, cookie); - } + progress = console_emit_next_record(con, handover, cookie); /* * If a handover has occurred, the SRCU read lock diff --git a/kernel/relay.c b/kernel/relay.c index ef12532168d9..9aa70ae53d24 100644 --- a/kernel/relay.c +++ b/kernel/relay.c @@ -91,7 +91,7 @@ static int relay_mmap_buf(struct rchan_buf *buf, struct vm_area_struct *vma) return -EINVAL; vma->vm_ops = &relay_file_mmap_ops; - vma->vm_flags |= VM_DONTEXPAND; + vm_flags_set(vma, VM_DONTEXPAND); vma->vm_private_data = buf; return 0; diff --git a/kernel/resource.c b/kernel/resource.c index ddbbacb9fb50..b1763b2fd7ef 100644 --- a/kernel/resource.c +++ b/kernel/resource.c @@ -1343,20 +1343,6 @@ retry: continue; } - /* - * All memory regions added from memory-hotplug path have the - * flag IORESOURCE_SYSTEM_RAM. If the resource does not have - * this flag, we know that we are dealing with a resource coming - * from HMM/devm. HMM/devm use another mechanism to add/release - * a resource. This goes via devm_request_mem_region and - * devm_release_mem_region. - * HMM/devm take care to release their resources when they want, - * so if we are dealing with them, let us just back off here. 
- */ - if (!(res->flags & IORESOURCE_SYSRAM)) { - break; - } - if (!(res->flags & IORESOURCE_MEM)) break; diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index ff4dbbae3b10..7a1b1f855b96 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -2938,11 +2938,11 @@ static void task_numa_work(struct callback_head *work) struct task_struct *p = current; struct mm_struct *mm = p->mm; u64 runtime = p->se.sum_exec_runtime; - MA_STATE(mas, &mm->mm_mt, 0, 0); struct vm_area_struct *vma; unsigned long start, end; unsigned long nr_pte_updates = 0; long pages, virtpages; + struct vma_iterator vmi; SCHED_WARN_ON(p != container_of(work, struct task_struct, numa_work)); @@ -2995,16 +2995,16 @@ static void task_numa_work(struct callback_head *work) if (!mmap_read_trylock(mm)) return; - mas_set(&mas, start); - vma = mas_find(&mas, ULONG_MAX); + vma_iter_init(&vmi, mm, start); + vma = vma_next(&vmi); if (!vma) { reset_ptenuma_scan(p); start = 0; - mas_set(&mas, start); - vma = mas_find(&mas, ULONG_MAX); + vma_iter_set(&vmi, start); + vma = vma_next(&vmi); } - for (; vma; vma = mas_find(&mas, ULONG_MAX)) { + do { if (!vma_migratable(vma) || !vma_policy_mof(vma) || is_vm_hugetlb_page(vma) || (vma->vm_flags & VM_MIXEDMAP)) { continue; @@ -3051,7 +3051,7 @@ static void task_numa_work(struct callback_head *work) cond_resched(); } while (end != vma->vm_end); - } + } for_each_vma(vmi, vma); out: /* diff --git a/kernel/sys.c b/kernel/sys.c index 88b31f096fb2..495cd87d9bf4 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -2350,6 +2350,33 @@ static int prctl_set_vma(unsigned long opt, unsigned long start, } #endif /* CONFIG_ANON_VMA_NAME */ +static inline int prctl_set_mdwe(unsigned long bits, unsigned long arg3, + unsigned long arg4, unsigned long arg5) +{ + if (arg3 || arg4 || arg5) + return -EINVAL; + + if (bits & ~(PR_MDWE_REFUSE_EXEC_GAIN)) + return -EINVAL; + + if (bits & PR_MDWE_REFUSE_EXEC_GAIN) + set_bit(MMF_HAS_MDWE, ¤t->mm->flags); + else if (test_bit(MMF_HAS_MDWE, ¤t->mm->flags)) + return -EPERM; /* Cannot unset the flag */ + + return 0; +} + +static inline int prctl_get_mdwe(unsigned long arg2, unsigned long arg3, + unsigned long arg4, unsigned long arg5) +{ + if (arg2 || arg3 || arg4 || arg5) + return -EINVAL; + + return test_bit(MMF_HAS_MDWE, ¤t->mm->flags) ? 
+ PR_MDWE_REFUSE_EXEC_GAIN : 0; +} + SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3, unsigned long, arg4, unsigned long, arg5) { @@ -2625,6 +2652,12 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3, error = sched_core_share_pid(arg2, arg3, arg4, arg5); break; #endif + case PR_SET_MDWE: + error = prctl_set_mdwe(arg2, arg3, arg4, arg5); + break; + case PR_GET_MDWE: + error = prctl_get_mdwe(arg2, arg3, arg4, arg5); + break; case PR_SET_VMA: error = prctl_set_vma(arg2, arg3, arg4, arg5); break; diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 137d4abe3eda..1c240d2c99bc 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -425,21 +425,6 @@ static void proc_put_char(void **buf, size_t *size, char c) } } -static int do_proc_dobool_conv(bool *negp, unsigned long *lvalp, - int *valp, - int write, void *data) -{ - if (write) { - *(bool *)valp = *lvalp; - } else { - int val = *(bool *)valp; - - *lvalp = (unsigned long)val; - *negp = false; - } - return 0; -} - static int do_proc_dointvec_conv(bool *negp, unsigned long *lvalp, int *valp, int write, void *data) @@ -710,16 +695,36 @@ int do_proc_douintvec(struct ctl_table *table, int write, * @lenp: the size of the user buffer * @ppos: file position * - * Reads/writes up to table->maxlen/sizeof(unsigned int) integer - * values from/to the user buffer, treated as an ASCII string. + * Reads/writes one integer value from/to the user buffer, + * treated as an ASCII string. + * + * table->data must point to a bool variable and table->maxlen must + * be sizeof(bool). * * Returns 0 on success. */ int proc_dobool(struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { - return do_proc_dointvec(table, write, buffer, lenp, ppos, - do_proc_dobool_conv, NULL); + struct ctl_table tmp; + bool *data = table->data; + int res, val; + + /* Do not support arrays yet. */ + if (table->maxlen != sizeof(bool)) + return -EINVAL; + + tmp = *table; + tmp.maxlen = sizeof(val); + tmp.data = &val; + + val = READ_ONCE(*data); + res = proc_dointvec(&tmp, write, buffer, lenp, ppos); + if (res) + return res; + if (write) + WRITE_ONCE(*data, val); + return 0; } /** diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index caf32389faf3..a856d4a34c67 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -242,7 +242,7 @@ config DYNAMIC_FTRACE enabled, and the functions not enabled will not affect performance of the system. - See the files in /sys/kernel/debug/tracing: + See the files in /sys/kernel/tracing: available_filter_functions set_ftrace_filter set_ftrace_notrace @@ -306,7 +306,7 @@ config STACK_TRACER select KALLSYMS help This special tracer records the maximum stack footprint of the - kernel and displays it in /sys/kernel/debug/tracing/stack_trace. + kernel and displays it in /sys/kernel/tracing/stack_trace. This tracer works by hooking into every function call that the kernel executes, and keeping a maximum stack depth value and @@ -346,7 +346,7 @@ config IRQSOFF_TRACER disabled by default and can be runtime (re-)started via: - echo 0 > /sys/kernel/debug/tracing/tracing_max_latency + echo 0 > /sys/kernel/tracing/tracing_max_latency (Note that kernel size and overhead increase with this option enabled. 
This option and the preempt-off timing option can be @@ -370,7 +370,7 @@ config PREEMPT_TRACER disabled by default and can be runtime (re-)started via: - echo 0 > /sys/kernel/debug/tracing/tracing_max_latency + echo 0 > /sys/kernel/tracing/tracing_max_latency (Note that kernel size and overhead increase with this option enabled. This option and the irqs-off timing option can be @@ -522,7 +522,7 @@ config TRACER_SNAPSHOT Allow tracing users to take snapshot of the current buffer using the ftrace interface, e.g.: - echo 1 > /sys/kernel/debug/tracing/snapshot + echo 1 > /sys/kernel/tracing/snapshot cat snapshot config TRACER_SNAPSHOT_PER_CPU_SWAP @@ -534,7 +534,7 @@ config TRACER_SNAPSHOT_PER_CPU_SWAP full swap (all buffers). If this is set, then the following is allowed: - echo 1 > /sys/kernel/debug/tracing/per_cpu/cpu2/snapshot + echo 1 > /sys/kernel/tracing/per_cpu/cpu2/snapshot After which, only the tracing buffer for CPU 2 was swapped with the main tracing buffer, and the other CPU buffers remain the same. @@ -581,7 +581,7 @@ config PROFILE_ANNOTATED_BRANCHES This tracer profiles all likely and unlikely macros in the kernel. It will display the results in: - /sys/kernel/debug/tracing/trace_stat/branch_annotated + /sys/kernel/tracing/trace_stat/branch_annotated Note: this will add a significant overhead; only turn this on if you need to profile the system's use of these macros. @@ -594,7 +594,7 @@ config PROFILE_ALL_BRANCHES taken in the kernel is recorded whether it hit or miss. The results will be displayed in: - /sys/kernel/debug/tracing/trace_stat/branch_all + /sys/kernel/tracing/trace_stat/branch_all This option also enables the likely/unlikely profiler. @@ -645,8 +645,8 @@ config BLK_DEV_IO_TRACE Tracing also is possible using the ftrace interface, e.g.: echo 1 > /sys/block/sda/sda1/trace/enable - echo blk > /sys/kernel/debug/tracing/current_tracer - cat /sys/kernel/debug/tracing/trace_pipe + echo blk > /sys/kernel/tracing/current_tracer + cat /sys/kernel/tracing/trace_pipe If unsure, say N. diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c index 5743be559415..d5d94510afd3 100644 --- a/kernel/trace/blktrace.c +++ b/kernel/trace/blktrace.c @@ -729,14 +729,10 @@ EXPORT_SYMBOL_GPL(blk_trace_startstop); **/ int blk_trace_ioctl(struct block_device *bdev, unsigned cmd, char __user *arg) { - struct request_queue *q; + struct request_queue *q = bdev_get_queue(bdev); int ret, start = 0; char b[BDEVNAME_SIZE]; - q = bdev_get_queue(bdev); - if (!q) - return -ENXIO; - mutex_lock(&q->debugfs_mutex); switch (cmd) { diff --git a/kernel/trace/kprobe_event_gen_test.c b/kernel/trace/kprobe_event_gen_test.c index c736487fc0e4..4850fdfe27f1 100644 --- a/kernel/trace/kprobe_event_gen_test.c +++ b/kernel/trace/kprobe_event_gen_test.c @@ -21,7 +21,7 @@ * Then: * * # insmod kernel/trace/kprobe_event_gen_test.ko - * # cat /sys/kernel/debug/tracing/trace + * # cat /sys/kernel/tracing/trace * * You should see many instances of the "gen_kprobe_test" and * "gen_kretprobe_test" events in the trace buffer. 
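Editor's note: the prctl_set_mdwe()/prctl_get_mdwe() handlers added to kernel/sys.c above are reached through the new PR_SET_MDWE and PR_GET_MDWE prctl options. Below is a minimal user-space sketch of the intended use; the fallback constant values mirror the matching uapi header update and are only needed when building against older headers.

	#include <stdio.h>
	#include <sys/prctl.h>

	#ifndef PR_SET_MDWE
	# define PR_SET_MDWE			65	/* from the matching uapi patch */
	# define PR_MDWE_REFUSE_EXEC_GAIN	(1UL << 0)
	#endif
	#ifndef PR_GET_MDWE
	# define PR_GET_MDWE			66
	#endif

	int main(void)
	{
		/* Once set, the flag cannot be cleared again (see prctl_set_mdwe()). */
		if (prctl(PR_SET_MDWE, PR_MDWE_REFUSE_EXEC_GAIN, 0, 0, 0))
			perror("PR_SET_MDWE");

		/* Returns PR_MDWE_REFUSE_EXEC_GAIN (or 0) for the current mm. */
		printf("MDWE bits: %d\n", prctl(PR_GET_MDWE, 0, 0, 0, 0));
		return 0;
	}

With the flag set, later attempts to make writable memory executable (for example an mprotect() call that adds PROT_EXEC) are expected to be refused; that enforcement lives in the mm side of the series, not in this kernel/ diff.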
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index c366a0a9ddba..af50d931b020 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -1581,19 +1581,6 @@ static int rb_check_bpage(struct ring_buffer_per_cpu *cpu_buffer, } /** - * rb_check_list - make sure a pointer to a list has the last bits zero - */ -static int rb_check_list(struct ring_buffer_per_cpu *cpu_buffer, - struct list_head *list) -{ - if (RB_WARN_ON(cpu_buffer, rb_list_head(list->prev) != list->prev)) - return 1; - if (RB_WARN_ON(cpu_buffer, rb_list_head(list->next) != list->next)) - return 1; - return 0; -} - -/** * rb_check_pages - integrity check of buffer pages * @cpu_buffer: CPU buffer with pages to test * @@ -1602,36 +1589,27 @@ static int rb_check_list(struct ring_buffer_per_cpu *cpu_buffer, */ static int rb_check_pages(struct ring_buffer_per_cpu *cpu_buffer) { - struct list_head *head = cpu_buffer->pages; - struct buffer_page *bpage, *tmp; - - /* Reset the head page if it exists */ - if (cpu_buffer->head_page) - rb_set_head_page(cpu_buffer); - - rb_head_page_deactivate(cpu_buffer); + struct list_head *head = rb_list_head(cpu_buffer->pages); + struct list_head *tmp; - if (RB_WARN_ON(cpu_buffer, head->next->prev != head)) - return -1; - if (RB_WARN_ON(cpu_buffer, head->prev->next != head)) + if (RB_WARN_ON(cpu_buffer, + rb_list_head(rb_list_head(head->next)->prev) != head)) return -1; - if (rb_check_list(cpu_buffer, head)) + if (RB_WARN_ON(cpu_buffer, + rb_list_head(rb_list_head(head->prev)->next) != head)) return -1; - list_for_each_entry_safe(bpage, tmp, head, list) { + for (tmp = rb_list_head(head->next); tmp != head; tmp = rb_list_head(tmp->next)) { if (RB_WARN_ON(cpu_buffer, - bpage->list.next->prev != &bpage->list)) + rb_list_head(rb_list_head(tmp->next)->prev) != tmp)) return -1; + if (RB_WARN_ON(cpu_buffer, - bpage->list.prev->next != &bpage->list)) - return -1; - if (rb_check_list(cpu_buffer, &bpage->list)) + rb_list_head(rb_list_head(tmp->prev)->next) != tmp)) return -1; } - rb_head_page_activate(cpu_buffer); - return 0; } @@ -2886,7 +2864,7 @@ rb_check_timestamp(struct ring_buffer_per_cpu *cpu_buffer, sched_clock_stable() ? 
"" : "If you just came from a suspend/resume,\n" "please switch to the trace global clock:\n" - " echo global > /sys/kernel/debug/tracing/trace_clock\n" + " echo global > /sys/kernel/tracing/trace_clock\n" "or add trace_clock=global to the kernel command line\n"); } @@ -5626,11 +5604,16 @@ EXPORT_SYMBOL_GPL(ring_buffer_alloc_read_page); */ void ring_buffer_free_read_page(struct trace_buffer *buffer, int cpu, void *data) { - struct ring_buffer_per_cpu *cpu_buffer = buffer->buffers[cpu]; + struct ring_buffer_per_cpu *cpu_buffer; struct buffer_data_page *bpage = data; struct page *page = virt_to_page(bpage); unsigned long flags; + if (!buffer || !buffer->buffers || !buffer->buffers[cpu]) + return; + + cpu_buffer = buffer->buffers[cpu]; + /* If the page is still in use someplace else, we can't reuse it */ if (page_ref_count(page) > 1) goto out; diff --git a/kernel/trace/synth_event_gen_test.c b/kernel/trace/synth_event_gen_test.c index 8d77526892f4..8dfe85499d4a 100644 --- a/kernel/trace/synth_event_gen_test.c +++ b/kernel/trace/synth_event_gen_test.c @@ -22,7 +22,7 @@ * Then: * * # insmod kernel/trace/synth_event_gen_test.ko - * # cat /sys/kernel/debug/tracing/trace + * # cat /sys/kernel/tracing/trace * * You should see several events in the trace buffer - * "create_synth_test", "empty_synth_test", and several instances of diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 54a163ae4815..45551c7b4c36 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -49,6 +49,8 @@ #include <linux/irq_work.h> #include <linux/workqueue.h> +#include <asm/setup.h> /* COMMAND_LINE_SIZE */ + #include "trace.h" #include "trace_output.h" @@ -186,6 +188,12 @@ static char *default_bootup_tracer; static bool allocate_snapshot; static bool snapshot_at_boot; +static char boot_instance_info[COMMAND_LINE_SIZE] __initdata; +static int boot_instance_index; + +static char boot_snapshot_info[COMMAND_LINE_SIZE] __initdata; +static int boot_snapshot_index; + static int __init set_cmdline_ftrace(char *str) { strlcpy(bootup_tracer_buf, str, MAX_TRACER_SIZE); @@ -222,9 +230,22 @@ __setup("traceoff_on_warning", stop_trace_on_warning); static int __init boot_alloc_snapshot(char *str) { - allocate_snapshot = true; - /* We also need the main ring buffer expanded */ - ring_buffer_expanded = true; + char *slot = boot_snapshot_info + boot_snapshot_index; + int left = sizeof(boot_snapshot_info) - boot_snapshot_index; + int ret; + + if (str[0] == '=') { + str++; + if (strlen(str) >= left) + return -1; + + ret = snprintf(slot, left, "%s\t", str); + boot_snapshot_index += ret; + } else { + allocate_snapshot = true; + /* We also need the main ring buffer expanded */ + ring_buffer_expanded = true; + } return 1; } __setup("alloc_snapshot", boot_alloc_snapshot); @@ -239,6 +260,23 @@ static int __init boot_snapshot(char *str) __setup("ftrace_boot_snapshot", boot_snapshot); +static int __init boot_instance(char *str) +{ + char *slot = boot_instance_info + boot_instance_index; + int left = sizeof(boot_instance_info) - boot_instance_index; + int ret; + + if (strlen(str) >= left) + return -1; + + ret = snprintf(slot, left, "%s\t", str); + boot_instance_index += ret; + + return 1; +} +__setup("trace_instance=", boot_instance); + + static char trace_boot_options_buf[MAX_TRACER_SIZE] __initdata; static int __init set_trace_boot_options(char *str) @@ -1001,13 +1039,8 @@ __buffer_unlock_commit(struct trace_buffer *buffer, struct ring_buffer_event *ev ring_buffer_unlock_commit(buffer); } -/** - * __trace_puts - write a constant 
string into the trace buffer. - * @ip: The address of the caller - * @str: The constant string to write - * @size: The size of the string. - */ -int __trace_puts(unsigned long ip, const char *str, int size) +int __trace_array_puts(struct trace_array *tr, unsigned long ip, + const char *str, int size) { struct ring_buffer_event *event; struct trace_buffer *buffer; @@ -1015,7 +1048,7 @@ int __trace_puts(unsigned long ip, const char *str, int size) unsigned int trace_ctx; int alloc; - if (!(global_trace.trace_flags & TRACE_ITER_PRINTK)) + if (!(tr->trace_flags & TRACE_ITER_PRINTK)) return 0; if (unlikely(tracing_selftest_running || tracing_disabled)) @@ -1024,7 +1057,7 @@ int __trace_puts(unsigned long ip, const char *str, int size) alloc = sizeof(*entry) + size + 2; /* possible \n added */ trace_ctx = tracing_gen_ctx(); - buffer = global_trace.array_buffer.buffer; + buffer = tr->array_buffer.buffer; ring_buffer_nest_start(buffer); event = __trace_buffer_lock_reserve(buffer, TRACE_PRINT, alloc, trace_ctx); @@ -1046,11 +1079,23 @@ int __trace_puts(unsigned long ip, const char *str, int size) entry->buf[size] = '\0'; __buffer_unlock_commit(buffer, event); - ftrace_trace_stack(&global_trace, buffer, trace_ctx, 4, NULL); + ftrace_trace_stack(tr, buffer, trace_ctx, 4, NULL); out: ring_buffer_nest_end(buffer); return size; } +EXPORT_SYMBOL_GPL(__trace_array_puts); + +/** + * __trace_puts - write a constant string into the trace buffer. + * @ip: The address of the caller + * @str: The constant string to write + * @size: The size of the string. + */ +int __trace_puts(unsigned long ip, const char *str, int size) +{ + return __trace_array_puts(&global_trace, ip, str, size); +} EXPORT_SYMBOL_GPL(__trace_puts); /** @@ -1142,7 +1187,7 @@ void tracing_snapshot_instance(struct trace_array *tr) * * Note, make sure to allocate the snapshot with either * a tracing_snapshot_alloc(), or by doing it manually - * with: echo 1 > /sys/kernel/debug/tracing/snapshot + * with: echo 1 > /sys/kernel/tracing/snapshot * * If the snapshot buffer is not allocated, it will stop tracing. * Basically making a permanent snapshot. 
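Editor's note: the kernel/trace/trace.c hunk above turns __trace_puts() into a thin wrapper around the new, exported __trace_array_puts(), which takes an explicit trace_array. A minimal in-kernel sketch of writing into a named instance rather than the top-level buffer follows; the instance name is illustrative and the header locations are assumed.

	#include <linux/kernel.h>
	#include <linux/trace.h>

	static void write_to_instance(void)
	{
		/* Creates the instance if it does not exist yet. */
		struct trace_array *tr = trace_array_get_by_name("my_instance");
		static const char msg[] = "hello from a trace instance\n";

		if (!tr)
			return;

		/* Same semantics as __trace_puts(), but into tr's buffer. */
		__trace_array_puts(tr, _THIS_IP_, msg, sizeof(msg) - 1);

		/* Drop the reference so user space may remove the instance. */
		trace_array_put(tr);
	}

This is the same pairing the new boot-time code uses: enable_instances() creates instances with trace_array_get_by_name() and immediately drops its reference with trace_array_put().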
@@ -5601,7 +5646,7 @@ static const char readme_msg[] = #ifdef CONFIG_HIST_TRIGGERS "\t s:[synthetic/]<event> <field> [<field>]\n" #endif - "\t e[:[<group>/][<event>]] <attached-group>.<attached-event> [<args>]\n" + "\t e[:[<group>/][<event>]] <attached-group>.<attached-event> [<args>] [if <filter>]\n" "\t -:[<group>/][<event>]\n" #ifdef CONFIG_KPROBE_EVENTS "\t place: [<module>:]<symbol>[+<offset>]|<memaddr>\n" @@ -5618,7 +5663,7 @@ static const char readme_msg[] = "\t $stack<index>, $stack, $retval, $comm,\n" #endif "\t +|-[u]<offset>(<fetcharg>), \\imm-value, \\\"imm-string\"\n" - "\t type: s8/16/32/64, u8/16/32/64, x8/16/32/64, string, symbol,\n" + "\t type: s8/16/32/64, u8/16/32/64, x8/16/32/64, char, string, symbol,\n" "\t b<bit-width>@<bit-offset>/<container-size>, ustring,\n" "\t symstr, <type>\\[<array-size>\\]\n" #ifdef CONFIG_HIST_TRIGGERS @@ -5760,7 +5805,7 @@ static const char readme_msg[] = #ifdef CONFIG_SYNTH_EVENTS " events/synthetic_events\t- Create/append/remove/show synthetic events\n" "\t Write into this file to define/undefine new synthetic events.\n" - "\t example: echo 'myevent u64 lat; char name[]' >> synthetic_events\n" + "\t example: echo 'myevent u64 lat; char name[]; long[] stack' >> synthetic_events\n" #endif #endif ; @@ -9225,10 +9270,6 @@ static int allocate_trace_buffers(struct trace_array *tr, int size) } tr->allocated_snapshot = allocate_snapshot; - /* - * Only the top level trace array gets its snapshot allocated - * from the kernel command line. - */ allocate_snapshot = false; #endif @@ -10144,6 +10185,79 @@ out: return ret; } +#ifdef CONFIG_TRACER_MAX_TRACE +__init static bool tr_needs_alloc_snapshot(const char *name) +{ + char *test; + int len = strlen(name); + bool ret; + + if (!boot_snapshot_index) + return false; + + if (strncmp(name, boot_snapshot_info, len) == 0 && + boot_snapshot_info[len] == '\t') + return true; + + test = kmalloc(strlen(name) + 3, GFP_KERNEL); + if (!test) + return false; + + sprintf(test, "\t%s\t", name); + ret = strstr(boot_snapshot_info, test) == NULL; + kfree(test); + return ret; +} + +__init static void do_allocate_snapshot(const char *name) +{ + if (!tr_needs_alloc_snapshot(name)) + return; + + /* + * When allocate_snapshot is set, the next call to + * allocate_trace_buffers() (called by trace_array_get_by_name()) + * will allocate the snapshot buffer. That will alse clear + * this flag. 
+ */ + allocate_snapshot = true; +} +#else +static inline void do_allocate_snapshot(const char *name) { } +#endif + +__init static void enable_instances(void) +{ + struct trace_array *tr; + char *curr_str; + char *str; + char *tok; + + /* A tab is always appended */ + boot_instance_info[boot_instance_index - 1] = '\0'; + str = boot_instance_info; + + while ((curr_str = strsep(&str, "\t"))) { + + tok = strsep(&curr_str, ","); + + if (IS_ENABLED(CONFIG_TRACER_MAX_TRACE)) + do_allocate_snapshot(tok); + + tr = trace_array_get_by_name(tok); + if (!tr) { + pr_warn("Failed to create instance buffer %s\n", curr_str); + continue; + } + /* Allow user space to delete it */ + trace_array_put(tr); + + while ((tok = strsep(&curr_str, ","))) { + early_enable_events(tr, tok, true); + } + } +} + __init static int tracer_alloc_buffers(void) { int ring_buf_size; @@ -10277,10 +10391,19 @@ out: void __init ftrace_boot_snapshot(void) { + struct trace_array *tr; + if (snapshot_at_boot) { tracing_snapshot(); internal_trace_puts("** Boot snapshot taken **\n"); } + + list_for_each_entry(tr, &ftrace_trace_arrays, list) { + if (tr == &global_trace) + continue; + trace_array_puts(tr, "** Boot snapshot taken **\n"); + tracing_snapshot_instance(tr); + } } void __init early_trace_init(void) @@ -10302,6 +10425,9 @@ void __init early_trace_init(void) void __init trace_init(void) { trace_event_init(); + + if (boot_instance_index) + enable_instances(); } __init static void clear_boot_tracer(void) diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 085a31b978a5..616e1aa1c4da 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -25,7 +25,7 @@ #include "pid_list.h" #ifdef CONFIG_FTRACE_SYSCALLS -#include <asm/unistd.h> /* For NR_SYSCALLS */ +#include <asm/unistd.h> /* For NR_syscalls */ #include <asm/syscall.h> /* some archs define it here */ #endif @@ -113,6 +113,10 @@ enum trace_type { #define MEM_FAIL(condition, fmt, ...) 
\ DO_ONCE_LITE_IF(condition, pr_err, "ERROR: " fmt, ##__VA_ARGS__) +#define HIST_STACKTRACE_DEPTH 16 +#define HIST_STACKTRACE_SIZE (HIST_STACKTRACE_DEPTH * sizeof(unsigned long)) +#define HIST_STACKTRACE_SKIP 5 + /* * syscalls are special, and need special handling, this is why * they are not included in trace_entries.h @@ -1331,6 +1335,8 @@ DECLARE_PER_CPU(int, trace_buffered_event_cnt); void trace_buffered_event_disable(void); void trace_buffered_event_enable(void); +void early_enable_events(struct trace_array *tr, char *buf, bool disable_first); + static inline void __trace_event_discard_commit(struct trace_buffer *buffer, struct ring_buffer_event *event) diff --git a/kernel/trace/trace_eprobe.c b/kernel/trace/trace_eprobe.c index 352b65e2b910..67e854979d53 100644 --- a/kernel/trace/trace_eprobe.c +++ b/kernel/trace/trace_eprobe.c @@ -311,7 +311,7 @@ print_eprobe_event(struct trace_iterator *iter, int flags, trace_seq_putc(s, ')'); - if (print_probe_args(s, tp->args, tp->nr_args, + if (trace_probe_print_args(s, tp->args, tp->nr_args, (u8 *)&field[1], field) < 0) goto out; @@ -320,7 +320,8 @@ print_eprobe_event(struct trace_iterator *iter, int flags, return trace_handle_return(s); } -static unsigned long get_event_field(struct fetch_insn *code, void *rec) +static nokprobe_inline unsigned long +get_event_field(struct fetch_insn *code, void *rec) { struct ftrace_event_field *field = code->data; unsigned long val; @@ -395,20 +396,12 @@ static int get_eprobe_size(struct trace_probe *tp, void *rec) case FETCH_OP_TP_ARG: val = get_event_field(code, rec); break; - case FETCH_OP_IMM: - val = code->immediate; - break; - case FETCH_OP_COMM: - val = (unsigned long)current->comm; - break; - case FETCH_OP_DATA: - val = (unsigned long)code->data; - break; case FETCH_NOP_SYMBOL: /* Ignore a place holder */ code++; goto retry; default: - continue; + if (process_common_fetch_insn(code, &val) < 0) + continue; } code++; len = process_fetch_insn_bottom(code, val, NULL, NULL); @@ -428,84 +421,26 @@ process_fetch_insn(struct fetch_insn *code, void *rec, void *dest, void *base) { unsigned long val; + int ret; retry: switch (code->op) { case FETCH_OP_TP_ARG: val = get_event_field(code, rec); break; - case FETCH_OP_IMM: - val = code->immediate; - break; - case FETCH_OP_COMM: - val = (unsigned long)current->comm; - break; - case FETCH_OP_DATA: - val = (unsigned long)code->data; - break; case FETCH_NOP_SYMBOL: /* Ignore a place holder */ code++; goto retry; default: - return -EILSEQ; + ret = process_common_fetch_insn(code, &val); + if (ret < 0) + return ret; } code++; return process_fetch_insn_bottom(code, val, dest, base); } NOKPROBE_SYMBOL(process_fetch_insn) -/* Return the length of string -- including null terminal byte */ -static nokprobe_inline int -fetch_store_strlen_user(unsigned long addr) -{ - return kern_fetch_store_strlen_user(addr); -} - -/* Return the length of string -- including null terminal byte */ -static nokprobe_inline int -fetch_store_strlen(unsigned long addr) -{ - return kern_fetch_store_strlen(addr); -} - -/* - * Fetch a null-terminated string from user. Caller MUST set *(u32 *)buf - * with max length and relative data location. - */ -static nokprobe_inline int -fetch_store_string_user(unsigned long addr, void *dest, void *base) -{ - return kern_fetch_store_string_user(addr, dest, base); -} - -/* - * Fetch a null-terminated string. Caller MUST set *(u32 *)buf with max - * length and relative data location. 
- */ -static nokprobe_inline int -fetch_store_string(unsigned long addr, void *dest, void *base) -{ - return kern_fetch_store_string(addr, dest, base); -} - -static nokprobe_inline int -probe_mem_read_user(void *dest, void *src, size_t size) -{ - const void __user *uaddr = (__force const void __user *)src; - - return copy_from_user_nofault(dest, uaddr, size); -} - -static nokprobe_inline int -probe_mem_read(void *dest, void *src, size_t size) -{ -#ifdef CONFIG_ARCH_HAS_NON_OVERLAPPING_ADDRESS_SPACE - if ((unsigned long)src < TASK_SIZE) - return probe_mem_read_user(dest, src, size); -#endif - return copy_from_kernel_nofault(dest, src, size); -} - /* eprobe handler */ static inline void __eprobe_trace_func(struct eprobe_data *edata, void *rec) @@ -923,17 +858,13 @@ static int trace_eprobe_parse_filter(struct trace_eprobe *ep, int argc, const ch p = ep->filter_str; for (i = 0; i < argc; i++) { - ret = snprintf(p, len, "%s ", argv[i]); - if (ret < 0) - goto error; - if (ret > len) { - ret = -E2BIG; - goto error; - } + if (i) + ret = snprintf(p, len, " %s", argv[i]); + else + ret = snprintf(p, len, "%s", argv[i]); p += ret; len -= ret; } - p[-1] = '\0'; /* * Ensure the filter string can be parsed correctly. Note, this diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index 6a942fa275c7..654ffa40457a 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -2281,8 +2281,6 @@ create_new_subsystem(const char *name) if (!system->name) goto out_free; - system->filter = NULL; - system->filter = kzalloc(sizeof(struct event_filter), GFP_KERNEL); if (!system->filter) goto out_free; @@ -2843,7 +2841,7 @@ static __init int setup_trace_triggers(char *str) if (!trigger) break; bootup_triggers[i].event = strsep(&trigger, "."); - bootup_triggers[i].trigger = strsep(&trigger, "."); + bootup_triggers[i].trigger = trigger; if (!bootup_triggers[i].trigger) break; } @@ -3771,10 +3769,9 @@ static __init int event_trace_memsetup(void) return 0; } -static __init void -early_enable_events(struct trace_array *tr, bool disable_first) +__init void +early_enable_events(struct trace_array *tr, char *buf, bool disable_first) { - char *buf = bootup_event_buf; char *token; int ret; @@ -3827,7 +3824,7 @@ static __init int event_trace_enable(void) */ __trace_early_add_events(tr); - early_enable_events(tr, false); + early_enable_events(tr, bootup_event_buf, false); trace_printk_start_comm(); @@ -3855,7 +3852,7 @@ static __init int event_trace_enable_again(void) if (!tr) return -ENODEV; - early_enable_events(tr, true); + early_enable_events(tr, bootup_event_buf, true); return 0; } diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c index e095c3b3a50d..1dad64267878 100644 --- a/kernel/trace/trace_events_filter.c +++ b/kernel/trace/trace_events_filter.c @@ -64,6 +64,7 @@ enum filter_pred_fn { FILTER_PRED_FN_PCHAR_USER, FILTER_PRED_FN_PCHAR, FILTER_PRED_FN_CPU, + FILTER_PRED_FN_FUNCTION, FILTER_PRED_FN_, FILTER_PRED_TEST_VISITED, }; @@ -71,6 +72,7 @@ enum filter_pred_fn { struct filter_pred { enum filter_pred_fn fn_num; u64 val; + u64 val2; struct regex regex; unsigned short *ops; struct ftrace_event_field *field; @@ -103,6 +105,7 @@ struct filter_pred { C(INVALID_FILTER, "Meaningless filter expression"), \ C(IP_FIELD_ONLY, "Only 'ip' field is supported for function trace"), \ C(INVALID_VALUE, "Invalid value (did you forget quotes)?"), \ + C(NO_FUNCTION, "Function not found"), \ C(ERRNO, "Error"), \ C(NO_FILTER, "No filter found") @@ -876,6 +879,17 @@ static 
int filter_pred_comm(struct filter_pred *pred, void *event) return cmp ^ pred->not; } +/* Filter predicate for functions. */ +static int filter_pred_function(struct filter_pred *pred, void *event) +{ + unsigned long *addr = (unsigned long *)(event + pred->offset); + unsigned long start = (unsigned long)pred->val; + unsigned long end = (unsigned long)pred->val2; + int ret = *addr >= start && *addr < end; + + return pred->op == OP_EQ ? ret : !ret; +} + /* * regex_match_foo - Basic regex callbacks * @@ -1335,6 +1349,8 @@ static int filter_pred_fn_call(struct filter_pred *pred, void *event) return filter_pred_pchar(pred, event); case FILTER_PRED_FN_CPU: return filter_pred_cpu(pred, event); + case FILTER_PRED_FN_FUNCTION: + return filter_pred_function(pred, event); case FILTER_PRED_TEST_VISITED: return test_pred_visited_fn(pred, event); default: @@ -1350,8 +1366,13 @@ static int parse_pred(const char *str, void *data, struct trace_event_call *call = data; struct ftrace_event_field *field; struct filter_pred *pred = NULL; + unsigned long offset; + unsigned long size; + unsigned long ip; char num_buf[24]; /* Big enough to hold an address */ char *field_name; + char *name; + bool function = false; bool ustring = false; char q; u64 val; @@ -1393,6 +1414,12 @@ static int parse_pred(const char *str, void *data, i += len; } + /* See if the field is a kernel function name */ + if ((len = str_has_prefix(str + i, ".function"))) { + function = true; + i += len; + } + while (isspace(str[i])) i++; @@ -1423,7 +1450,71 @@ static int parse_pred(const char *str, void *data, pred->offset = field->offset; pred->op = op; - if (ftrace_event_is_function(call)) { + if (function) { + /* The field must be the same size as long */ + if (field->size != sizeof(long)) { + parse_error(pe, FILT_ERR_ILLEGAL_FIELD_OP, pos + i); + goto err_free; + } + + /* Function only works with '==' or '!=' and an unquoted string */ + switch (op) { + case OP_NE: + case OP_EQ: + break; + default: + parse_error(pe, FILT_ERR_INVALID_OP, pos + i); + goto err_free; + } + + if (isdigit(str[i])) { + /* We allow 0xDEADBEEF */ + while (isalnum(str[i])) + i++; + + len = i - s; + /* 0xfeedfacedeadbeef is 18 chars max */ + if (len >= sizeof(num_buf)) { + parse_error(pe, FILT_ERR_OPERAND_TOO_LONG, pos + i); + goto err_free; + } + + strncpy(num_buf, str + s, len); + num_buf[len] = 0; + + ret = kstrtoul(num_buf, 0, &ip); + if (ret) { + parse_error(pe, FILT_ERR_INVALID_VALUE, pos + i); + goto err_free; + } + } else { + s = i; + for (; str[i] && !isspace(str[i]); i++) + ; + + len = i - s; + name = kmemdup_nul(str + s, len, GFP_KERNEL); + if (!name) + goto err_mem; + ip = kallsyms_lookup_name(name); + kfree(name); + if (!ip) { + parse_error(pe, FILT_ERR_NO_FUNCTION, pos + i); + goto err_free; + } + } + + /* Now find the function start and end address */ + if (!kallsyms_lookup_size_offset(ip, &size, &offset)) { + parse_error(pe, FILT_ERR_NO_FUNCTION, pos + i); + goto err_free; + } + + pred->fn_num = FILTER_PRED_FN_FUNCTION; + pred->val = ip - offset; + pred->val2 = pred->val + size; + + } else if (ftrace_event_is_function(call)) { /* * Perf does things different with function events. * It only allows an "ip" field, and expects a string. 
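Editor's note: the trace_events_filter.c changes above add a ".function" suffix for pointer-sized event fields. The parser accepts either a symbol name or a numeric address, resolves the function's start and size through kallsyms, and stores the [start, end) range in pred->val/val2; filter_pred_function() then checks whether the recorded value falls inside that range. A condensed sketch of the match logic, with an illustrative helper name:

	#include <linux/kallsyms.h>

	/* Illustrative helper: does @addr point into the body of @func? */
	static bool addr_in_function(unsigned long addr, const char *func)
	{
		unsigned long ip, size, offset;

		ip = kallsyms_lookup_name(func);	/* 0 when the symbol is unknown */
		if (!ip || !kallsyms_lookup_size_offset(ip, &size, &offset))
			return false;

		ip -= offset;				/* start of the function */
		return addr >= ip && addr < ip + size;	/* mirrors filter_pred_function() */
	}

In tracefs terms this allows filters such as 'call_site.function == kmem_cache_alloc' on events whose call_site field is an unsigned long (an assumed example); '!=' inverts the match, and any other operator is rejected by the parse_pred() hunk above.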
diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c index 5edbf6b1da3f..89877a18f933 100644 --- a/kernel/trace/trace_events_hist.c +++ b/kernel/trace/trace_events_hist.c @@ -135,6 +135,7 @@ enum hist_field_fn { HIST_FIELD_FN_DIV_NOT_POWER2, HIST_FIELD_FN_DIV_MULT_SHIFT, HIST_FIELD_FN_EXECNAME, + HIST_FIELD_FN_STACK, }; /* @@ -480,10 +481,6 @@ DEFINE_HIST_FIELD_FN(u8); #define for_each_hist_key_field(i, hist_data) \ for ((i) = (hist_data)->n_vals; (i) < (hist_data)->n_fields; (i)++) -#define HIST_STACKTRACE_DEPTH 16 -#define HIST_STACKTRACE_SIZE (HIST_STACKTRACE_DEPTH * sizeof(unsigned long)) -#define HIST_STACKTRACE_SKIP 5 - #define HITCOUNT_IDX 0 #define HIST_KEY_SIZE_MAX (MAX_FILTER_STR_VAL + HIST_STACKTRACE_SIZE) @@ -1360,7 +1357,12 @@ static const char *hist_field_name(struct hist_field *field, field_name = field->name; } else if (field->flags & HIST_FIELD_FL_TIMESTAMP) field_name = "common_timestamp"; - else if (field->flags & HIST_FIELD_FL_HITCOUNT) + else if (field->flags & HIST_FIELD_FL_STACKTRACE) { + if (field->field) + field_name = field->field->name; + else + field_name = "stacktrace"; + } else if (field->flags & HIST_FIELD_FL_HITCOUNT) field_name = "hitcount"; if (field_name == NULL) @@ -1718,6 +1720,8 @@ static const char *get_hist_field_flags(struct hist_field *hist_field) flags_str = "percent"; else if (hist_field->flags & HIST_FIELD_FL_GRAPH) flags_str = "graph"; + else if (hist_field->flags & HIST_FIELD_FL_STACKTRACE) + flags_str = "stacktrace"; return flags_str; } @@ -1979,7 +1983,14 @@ static struct hist_field *create_hist_field(struct hist_trigger_data *hist_data, } if (flags & HIST_FIELD_FL_STACKTRACE) { - hist_field->fn_num = HIST_FIELD_FN_NOP; + if (field) + hist_field->fn_num = HIST_FIELD_FN_STACK; + else + hist_field->fn_num = HIST_FIELD_FN_NOP; + hist_field->size = HIST_STACKTRACE_SIZE; + hist_field->type = kstrdup_const("unsigned long[]", GFP_KERNEL); + if (!hist_field->type) + goto free; goto out; } @@ -2312,6 +2323,8 @@ parse_field(struct hist_trigger_data *hist_data, struct trace_event_file *file, *flags |= HIST_FIELD_FL_EXECNAME; else if (strcmp(modifier, "syscall") == 0) *flags |= HIST_FIELD_FL_SYSCALL; + else if (strcmp(modifier, "stacktrace") == 0) + *flags |= HIST_FIELD_FL_STACKTRACE; else if (strcmp(modifier, "log2") == 0) *flags |= HIST_FIELD_FL_LOG2; else if (strcmp(modifier, "usecs") == 0) @@ -2351,6 +2364,8 @@ parse_field(struct hist_trigger_data *hist_data, struct trace_event_file *file, hist_data->enable_timestamps = true; if (*flags & HIST_FIELD_FL_TIMESTAMP_USECS) hist_data->attrs->ts_in_usecs = true; + } else if (strcmp(field_name, "stacktrace") == 0) { + *flags |= HIST_FIELD_FL_STACKTRACE; } else if (strcmp(field_name, "common_cpu") == 0) *flags |= HIST_FIELD_FL_CPU; else if (strcmp(field_name, "hitcount") == 0) @@ -3111,6 +3126,9 @@ static inline void __update_field_vars(struct tracing_map_elt *elt, unsigned int i, j, var_idx; u64 var_val; + /* Make sure stacktrace can fit in the string variable length */ + BUILD_BUG_ON((HIST_STACKTRACE_DEPTH + 1) * sizeof(long) >= STR_VAR_LEN_MAX); + for (i = 0, j = field_var_str_start; i < n_field_vars; i++) { struct field_var *field_var = field_vars[i]; struct hist_field *var = field_var->var; @@ -3119,13 +3137,26 @@ static inline void __update_field_vars(struct tracing_map_elt *elt, var_val = hist_fn_call(val, elt, buffer, rbe, rec); var_idx = var->var.idx; - if (val->flags & HIST_FIELD_FL_STRING) { + if (val->flags & (HIST_FIELD_FL_STRING | + HIST_FIELD_FL_STACKTRACE)) { char 
*str = elt_data->field_var_str[j++]; char *val_str = (char *)(uintptr_t)var_val; unsigned int size; - size = min(val->size, STR_VAR_LEN_MAX); - strscpy(str, val_str, size); + if (val->flags & HIST_FIELD_FL_STRING) { + size = min(val->size, STR_VAR_LEN_MAX); + strscpy(str, val_str, size); + } else { + char *stack_start = str + sizeof(unsigned long); + int e; + + e = stack_trace_save((void *)stack_start, + HIST_STACKTRACE_DEPTH, + HIST_STACKTRACE_SKIP); + if (e < HIST_STACKTRACE_DEPTH - 1) + ((unsigned long *)stack_start)[e] = 0; + *((unsigned long *)str) = e; + } var_val = (u64)(uintptr_t)str; } tracing_map_set_var(elt, var_idx, var_val); @@ -3824,7 +3855,8 @@ static void save_field_var(struct hist_trigger_data *hist_data, { hist_data->field_vars[hist_data->n_field_vars++] = field_var; - if (field_var->val->flags & HIST_FIELD_FL_STRING) + /* Stack traces are saved in the string storage too */ + if (field_var->val->flags & (HIST_FIELD_FL_STRING | HIST_FIELD_FL_STACKTRACE)) hist_data->n_field_var_str++; } @@ -3849,6 +3881,9 @@ static int check_synth_field(struct synth_event *event, && field->is_dynamic) return 0; + if (strstr(hist_field->type, "long[") && field->is_stack) + return 0; + if (strcmp(field->type, hist_field->type) != 0) { if (field->size != hist_field->size || (!field->is_string && field->is_signed != hist_field->is_signed)) @@ -4103,7 +4138,8 @@ static int action_create(struct hist_trigger_data *hist_data, } hist_data->save_vars[hist_data->n_save_vars++] = field_var; - if (field_var->val->flags & HIST_FIELD_FL_STRING) + if (field_var->val->flags & + (HIST_FIELD_FL_STRING | HIST_FIELD_FL_STACKTRACE)) hist_data->n_save_var_str++; kfree(param); } @@ -4242,6 +4278,19 @@ static u64 hist_field_execname(struct hist_field *hist_field, return (u64)(unsigned long)(elt_data->comm); } +static u64 hist_field_stack(struct hist_field *hist_field, + struct tracing_map_elt *elt, + struct trace_buffer *buffer, + struct ring_buffer_event *rbe, + void *event) +{ + u32 str_item = *(u32 *)(event + hist_field->field->offset); + int str_loc = str_item & 0xffff; + char *addr = (char *)(event + str_loc); + + return (u64)(unsigned long)addr; +} + static u64 hist_fn_call(struct hist_field *hist_field, struct tracing_map_elt *elt, struct trace_buffer *buffer, @@ -4305,6 +4354,8 @@ static u64 hist_fn_call(struct hist_field *hist_field, return div_by_mult_and_shift(hist_field, elt, buffer, rbe, event); case HIST_FIELD_FN_EXECNAME: return hist_field_execname(hist_field, elt, buffer, rbe, event); + case HIST_FIELD_FN_STACK: + return hist_field_stack(hist_field, elt, buffer, rbe, event); default: return 0; } @@ -4351,7 +4402,8 @@ static int create_var_field(struct hist_trigger_data *hist_data, if (!ret && hist_data->fields[val_idx]->flags & HIST_FIELD_FL_EXECNAME) update_var_execname(hist_data->fields[val_idx]); - if (!ret && hist_data->fields[val_idx]->flags & HIST_FIELD_FL_STRING) + if (!ret && hist_data->fields[val_idx]->flags & + (HIST_FIELD_FL_STRING | HIST_FIELD_FL_STACKTRACE)) hist_data->fields[val_idx]->var_str_idx = hist_data->n_var_str++; return ret; @@ -5092,7 +5144,8 @@ static void hist_trigger_elt_update(struct hist_trigger_data *hist_data, if (hist_field->flags & HIST_FIELD_FL_VAR) { var_idx = hist_field->var.idx; - if (hist_field->flags & HIST_FIELD_FL_STRING) { + if (hist_field->flags & + (HIST_FIELD_FL_STRING | HIST_FIELD_FL_STACKTRACE)) { unsigned int str_start, var_str_idx, idx; char *str, *val_str; unsigned int size; @@ -5105,9 +5158,20 @@ static void hist_trigger_elt_update(struct 
hist_trigger_data *hist_data, str = elt_data->field_var_str[idx]; val_str = (char *)(uintptr_t)hist_val; - size = min(hist_field->size, STR_VAR_LEN_MAX); - strscpy(str, val_str, size); - + if (hist_field->flags & HIST_FIELD_FL_STRING) { + size = min(hist_field->size, STR_VAR_LEN_MAX); + strscpy(str, val_str, size); + } else { + char *stack_start = str + sizeof(unsigned long); + int e; + + e = stack_trace_save((void *)stack_start, + HIST_STACKTRACE_DEPTH, + HIST_STACKTRACE_SKIP); + if (e < HIST_STACKTRACE_DEPTH - 1) + ((unsigned long *)stack_start)[e] = 0; + *((unsigned long *)str) = e; + } hist_val = (u64)(uintptr_t)str; } tracing_map_set_var(elt, var_idx, hist_val); @@ -5193,8 +5257,17 @@ static void event_hist_trigger(struct event_trigger_data *data, if (key_field->flags & HIST_FIELD_FL_STACKTRACE) { memset(entries, 0, HIST_STACKTRACE_SIZE); - stack_trace_save(entries, HIST_STACKTRACE_DEPTH, - HIST_STACKTRACE_SKIP); + if (key_field->field) { + unsigned long *stack, n_entries; + + field_contents = hist_fn_call(key_field, elt, buffer, rbe, rec); + stack = (unsigned long *)(long)field_contents; + n_entries = *stack; + memcpy(entries, ++stack, n_entries * sizeof(unsigned long)); + } else { + stack_trace_save(entries, HIST_STACKTRACE_DEPTH, + HIST_STACKTRACE_SKIP); + } key = entries; } else { field_contents = hist_fn_call(key_field, elt, buffer, rbe, rec); @@ -5297,7 +5370,10 @@ static void hist_trigger_print_key(struct seq_file *m, seq_printf(m, "%s: %-30s[%3llu]", field_name, syscall_name, uval); } else if (key_field->flags & HIST_FIELD_FL_STACKTRACE) { - seq_puts(m, "stacktrace:\n"); + if (key_field->field) + seq_printf(m, "%s.stacktrace", key_field->field->name); + else + seq_puts(m, "stacktrace:\n"); hist_trigger_stacktrace_print(m, key + key_field->offset, HIST_STACKTRACE_DEPTH); @@ -5842,7 +5918,8 @@ static void hist_field_print(struct seq_file *m, struct hist_field *hist_field) if (hist_field->flags) { if (!(hist_field->flags & HIST_FIELD_FL_VAR_REF) && - !(hist_field->flags & HIST_FIELD_FL_EXPR)) { + !(hist_field->flags & HIST_FIELD_FL_EXPR) && + !(hist_field->flags & HIST_FIELD_FL_STACKTRACE)) { const char *flags = get_hist_field_flags(hist_field); if (flags) @@ -5875,9 +5952,12 @@ static int event_hist_trigger_print(struct seq_file *m, if (i > hist_data->n_vals) seq_puts(m, ","); - if (field->flags & HIST_FIELD_FL_STACKTRACE) - seq_puts(m, "stacktrace"); - else + if (field->flags & HIST_FIELD_FL_STACKTRACE) { + if (field->field) + seq_printf(m, "%s.stacktrace", field->field->name); + else + seq_puts(m, "stacktrace"); + } else hist_field_print(m, field); } diff --git a/kernel/trace/trace_events_synth.c b/kernel/trace/trace_events_synth.c index 67592eed0be8..46d0abb32d0f 100644 --- a/kernel/trace/trace_events_synth.c +++ b/kernel/trace/trace_events_synth.c @@ -173,6 +173,14 @@ static int synth_field_is_string(char *type) return false; } +static int synth_field_is_stack(char *type) +{ + if (strstr(type, "long[") != NULL) + return true; + + return false; +} + static int synth_field_string_size(char *type) { char buf[4], *end, *start; @@ -248,6 +256,8 @@ static int synth_field_size(char *type) size = sizeof(gfp_t); else if (synth_field_is_string(type)) size = synth_field_string_size(type); + else if (synth_field_is_stack(type)) + size = 0; return size; } @@ -292,6 +302,8 @@ static const char *synth_field_fmt(char *type) fmt = "%x"; else if (synth_field_is_string(type)) fmt = "%.*s"; + else if (synth_field_is_stack(type)) + fmt = "%s"; return fmt; } @@ -371,6 +383,23 @@ static enum 
print_line_t print_synth_event(struct trace_iterator *iter, i == se->n_fields - 1 ? "" : " "); n_u64 += STR_VAR_LEN_MAX / sizeof(u64); } + } else if (se->fields[i]->is_stack) { + u32 offset, data_offset, len; + unsigned long *p, *end; + + offset = (u32)entry->fields[n_u64]; + data_offset = offset & 0xffff; + len = offset >> 16; + + p = (void *)entry + data_offset; + end = (void *)p + len - (sizeof(long) - 1); + + trace_seq_printf(s, "%s=STACK:\n", se->fields[i]->name); + + for (; *p && p < end; p++) + trace_seq_printf(s, "=> %pS\n", (void *)*p); + n_u64++; + } else { struct trace_print_flags __flags[] = { __def_gfpflag_names, {-1, NULL} }; @@ -416,16 +445,15 @@ static unsigned int trace_string(struct synth_trace_event *entry, if (is_dynamic) { u32 data_offset; - data_offset = offsetof(typeof(*entry), fields); - data_offset += event->n_u64 * sizeof(u64); + data_offset = struct_size(entry, fields, event->n_u64); data_offset += data_size; - len = kern_fetch_store_strlen((unsigned long)str_val); + len = fetch_store_strlen((unsigned long)str_val); data_offset |= len << 16; *(u32 *)&entry->fields[*n_u64] = data_offset; - ret = kern_fetch_store_string((unsigned long)str_val, &entry->fields[*n_u64], entry); + ret = fetch_store_string((unsigned long)str_val, &entry->fields[*n_u64], entry); (*n_u64)++; } else { @@ -447,6 +475,43 @@ static unsigned int trace_string(struct synth_trace_event *entry, return len; } +static unsigned int trace_stack(struct synth_trace_event *entry, + struct synth_event *event, + long *stack, + unsigned int data_size, + unsigned int *n_u64) +{ + unsigned int len; + u32 data_offset; + void *data_loc; + + data_offset = struct_size(entry, fields, event->n_u64); + data_offset += data_size; + + for (len = 0; len < HIST_STACKTRACE_DEPTH; len++) { + if (!stack[len]) + break; + } + + /* Include the zero'd element if it fits */ + if (len < HIST_STACKTRACE_DEPTH) + len++; + + len *= sizeof(long); + + /* Find the dynamic section to copy the stack into. 
*/ + data_loc = (void *)entry + data_offset; + memcpy(data_loc, stack, len); + + /* Fill in the field that holds the offset/len combo */ + data_offset |= len << 16; + *(u32 *)&entry->fields[*n_u64] = data_offset; + + (*n_u64)++; + + return len; +} + static notrace void trace_event_raw_event_synth(void *__data, u64 *var_ref_vals, unsigned int *var_ref_idx) @@ -473,7 +538,12 @@ static notrace void trace_event_raw_event_synth(void *__data, val_idx = var_ref_idx[field_pos]; str_val = (char *)(long)var_ref_vals[val_idx]; - len = kern_fetch_store_strlen((unsigned long)str_val); + if (event->dynamic_fields[i]->is_stack) { + len = *((unsigned long *)str_val); + len *= sizeof(unsigned long); + } else { + len = fetch_store_strlen((unsigned long)str_val); + } fields_size += len; } @@ -499,6 +569,12 @@ static notrace void trace_event_raw_event_synth(void *__data, event->fields[i]->is_dynamic, data_size, &n_u64); data_size += len; /* only dynamic string increments */ + } else if (event->fields[i]->is_stack) { + long *stack = (long *)(long)var_ref_vals[val_idx]; + + len = trace_stack(entry, event, stack, + data_size, &n_u64); + data_size += len; } else { struct synth_field *field = event->fields[i]; u64 val = var_ref_vals[val_idx]; @@ -561,6 +637,9 @@ static int __set_synth_event_print_fmt(struct synth_event *event, event->fields[i]->is_dynamic) pos += snprintf(buf + pos, LEN_OR_ZERO, ", __get_str(%s)", event->fields[i]->name); + else if (event->fields[i]->is_stack) + pos += snprintf(buf + pos, LEN_OR_ZERO, + ", __get_stacktrace(%s)", event->fields[i]->name); else pos += snprintf(buf + pos, LEN_OR_ZERO, ", REC->%s", event->fields[i]->name); @@ -697,7 +776,8 @@ static struct synth_field *parse_synth_field(int argc, char **argv, ret = -EINVAL; goto free; } else if (size == 0) { - if (synth_field_is_string(field->type)) { + if (synth_field_is_string(field->type) || + synth_field_is_stack(field->type)) { char *type; len = sizeof("__data_loc ") + strlen(field->type) + 1; @@ -728,6 +808,8 @@ static struct synth_field *parse_synth_field(int argc, char **argv, if (synth_field_is_string(field->type)) field->is_string = true; + else if (synth_field_is_stack(field->type)) + field->is_stack = true; field->is_signed = synth_field_signed(field->type); out: diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index ee77c8203bd5..59cda19a9033 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -1218,60 +1218,6 @@ static const struct file_operations kprobe_profile_ops = { .release = seq_release, }; -/* Kprobe specific fetch functions */ - -/* Return the length of string -- including null terminal byte */ -static nokprobe_inline int -fetch_store_strlen_user(unsigned long addr) -{ - return kern_fetch_store_strlen_user(addr); -} - -/* Return the length of string -- including null terminal byte */ -static nokprobe_inline int -fetch_store_strlen(unsigned long addr) -{ - return kern_fetch_store_strlen(addr); -} - -/* - * Fetch a null-terminated string from user. Caller MUST set *(u32 *)buf - * with max length and relative data location. - */ -static nokprobe_inline int -fetch_store_string_user(unsigned long addr, void *dest, void *base) -{ - return kern_fetch_store_string_user(addr, dest, base); -} - -/* - * Fetch a null-terminated string. Caller MUST set *(u32 *)buf with max - * length and relative data location. 
- */ -static nokprobe_inline int -fetch_store_string(unsigned long addr, void *dest, void *base) -{ - return kern_fetch_store_string(addr, dest, base); -} - -static nokprobe_inline int -probe_mem_read_user(void *dest, void *src, size_t size) -{ - const void __user *uaddr = (__force const void __user *)src; - - return copy_from_user_nofault(dest, uaddr, size); -} - -static nokprobe_inline int -probe_mem_read(void *dest, void *src, size_t size) -{ -#ifdef CONFIG_ARCH_HAS_NON_OVERLAPPING_ADDRESS_SPACE - if ((unsigned long)src < TASK_SIZE) - return probe_mem_read_user(dest, src, size); -#endif - return copy_from_kernel_nofault(dest, src, size); -} - /* Note that we don't verify it, since the code does not come from user space */ static int process_fetch_insn(struct fetch_insn *code, void *rec, void *dest, @@ -1279,6 +1225,7 @@ process_fetch_insn(struct fetch_insn *code, void *rec, void *dest, { struct pt_regs *regs = rec; unsigned long val; + int ret; retry: /* 1st stage: get value from context */ @@ -1295,15 +1242,6 @@ retry: case FETCH_OP_RETVAL: val = regs_return_value(regs); break; - case FETCH_OP_IMM: - val = code->immediate; - break; - case FETCH_OP_COMM: - val = (unsigned long)current->comm; - break; - case FETCH_OP_DATA: - val = (unsigned long)code->data; - break; #ifdef CONFIG_HAVE_FUNCTION_ARG_ACCESS_API case FETCH_OP_ARG: val = regs_get_kernel_argument(regs, code->param); @@ -1313,7 +1251,9 @@ retry: code++; goto retry; default: - return -EILSEQ; + ret = process_common_fetch_insn(code, &val); + if (ret < 0) + return ret; } code++; @@ -1424,7 +1364,7 @@ print_kprobe_event(struct trace_iterator *iter, int flags, trace_seq_putc(s, ')'); - if (print_probe_args(s, tp->args, tp->nr_args, + if (trace_probe_print_args(s, tp->args, tp->nr_args, (u8 *)&field[1], field) < 0) goto out; @@ -1459,7 +1399,7 @@ print_kretprobe_event(struct trace_iterator *iter, int flags, trace_seq_putc(s, ')'); - if (print_probe_args(s, tp->args, tp->nr_args, + if (trace_probe_print_args(s, tp->args, tp->nr_args, (u8 *)&field[1], field) < 0) goto out; diff --git a/kernel/trace/trace_osnoise.c b/kernel/trace/trace_osnoise.c index 210e1f168392..04f0fdae19a1 100644 --- a/kernel/trace/trace_osnoise.c +++ b/kernel/trace/trace_osnoise.c @@ -1539,7 +1539,7 @@ static void osnoise_sleep(void) wake_time = ktime_add_us(ktime_get(), interval); __set_current_state(TASK_INTERRUPTIBLE); - while (schedule_hrtimeout_range(&wake_time, 0, HRTIMER_MODE_ABS)) { + while (schedule_hrtimeout(&wake_time, HRTIMER_MODE_ABS)) { if (kthread_should_stop()) break; } diff --git a/kernel/trace/trace_probe.c b/kernel/trace/trace_probe.c index 01ebabbbe8c9..20d0c4a97633 100644 --- a/kernel/trace/trace_probe.c +++ b/kernel/trace/trace_probe.c @@ -50,6 +50,7 @@ DEFINE_BASIC_PRINT_TYPE_FUNC(x8, u8, "0x%x") DEFINE_BASIC_PRINT_TYPE_FUNC(x16, u16, "0x%x") DEFINE_BASIC_PRINT_TYPE_FUNC(x32, u32, "0x%x") DEFINE_BASIC_PRINT_TYPE_FUNC(x64, u64, "0x%Lx") +DEFINE_BASIC_PRINT_TYPE_FUNC(char, u8, "'%c'") int PRINT_TYPE_FUNC_NAME(symbol)(struct trace_seq *s, void *data, void *ent) { @@ -95,6 +96,7 @@ static const struct fetch_type probe_fetch_types[] = { ASSIGN_FETCH_TYPE_ALIAS(x16, u16, u16, 0), ASSIGN_FETCH_TYPE_ALIAS(x32, u32, u32, 0), ASSIGN_FETCH_TYPE_ALIAS(x64, u64, u64, 0), + ASSIGN_FETCH_TYPE_ALIAS(char, u8, u8, 0), ASSIGN_FETCH_TYPE_ALIAS(symbol, ADDR_FETCH_TYPE, ADDR_FETCH_TYPE, 0), ASSIGN_FETCH_TYPE_END @@ -1237,3 +1239,30 @@ int trace_probe_create(const char *raw_command, int (*createfn)(int, const char return ret; } + +int 
trace_probe_print_args(struct trace_seq *s, struct probe_arg *args, int nr_args, + u8 *data, void *field) +{ + void *p; + int i, j; + + for (i = 0; i < nr_args; i++) { + struct probe_arg *a = args + i; + + trace_seq_printf(s, " %s=", a->name); + if (likely(!a->count)) { + if (!a->type->print(s, data + a->offset, field)) + return -ENOMEM; + continue; + } + trace_seq_putc(s, '{'); + p = data + a->offset; + for (j = 0; j < a->count; j++) { + if (!a->type->print(s, p, field)) + return -ENOMEM; + trace_seq_putc(s, j == a->count - 1 ? '}' : ','); + p += a->type->size; + } + } + return 0; +} diff --git a/kernel/trace/trace_probe.h b/kernel/trace/trace_probe.h index 23acfd1c3812..ef8ed3b65d05 100644 --- a/kernel/trace/trace_probe.h +++ b/kernel/trace/trace_probe.h @@ -166,6 +166,7 @@ DECLARE_BASIC_PRINT_TYPE_FUNC(x16); DECLARE_BASIC_PRINT_TYPE_FUNC(x32); DECLARE_BASIC_PRINT_TYPE_FUNC(x64); +DECLARE_BASIC_PRINT_TYPE_FUNC(char); DECLARE_BASIC_PRINT_TYPE_FUNC(string); DECLARE_BASIC_PRINT_TYPE_FUNC(symbol); @@ -348,6 +349,8 @@ int trace_probe_compare_arg_type(struct trace_probe *a, struct trace_probe *b); bool trace_probe_match_command_args(struct trace_probe *tp, int argc, const char **argv); int trace_probe_create(const char *raw_command, int (*createfn)(int, const char **)); +int trace_probe_print_args(struct trace_seq *s, struct probe_arg *args, int nr_args, + u8 *data, void *field); #define trace_probe_for_each_link(pos, tp) \ list_for_each_entry(pos, &(tp)->event->files, list) diff --git a/kernel/trace/trace_probe_kernel.h b/kernel/trace/trace_probe_kernel.h index 77dbd9ff9782..c4e1d4c03a85 100644 --- a/kernel/trace/trace_probe_kernel.h +++ b/kernel/trace/trace_probe_kernel.h @@ -12,7 +12,7 @@ */ /* Return the length of string -- including null terminal byte */ static nokprobe_inline int -kern_fetch_store_strlen_user(unsigned long addr) +fetch_store_strlen_user(unsigned long addr) { const void __user *uaddr = (__force const void __user *)addr; int ret; @@ -29,14 +29,14 @@ kern_fetch_store_strlen_user(unsigned long addr) /* Return the length of string -- including null terminal byte */ static nokprobe_inline int -kern_fetch_store_strlen(unsigned long addr) +fetch_store_strlen(unsigned long addr) { int ret, len = 0; u8 c; #ifdef CONFIG_ARCH_HAS_NON_OVERLAPPING_ADDRESS_SPACE if (addr < TASK_SIZE) - return kern_fetch_store_strlen_user(addr); + return fetch_store_strlen_user(addr); #endif do { @@ -63,7 +63,7 @@ static nokprobe_inline void set_data_loc(int ret, void *dest, void *__dest, void * with max length and relative data location. */ static nokprobe_inline int -kern_fetch_store_string_user(unsigned long addr, void *dest, void *base) +fetch_store_string_user(unsigned long addr, void *dest, void *base) { const void __user *uaddr = (__force const void __user *)addr; int maxlen = get_loc_len(*(u32 *)dest); @@ -86,7 +86,7 @@ kern_fetch_store_string_user(unsigned long addr, void *dest, void *base) * length and relative data location. 
*/ static nokprobe_inline int -kern_fetch_store_string(unsigned long addr, void *dest, void *base) +fetch_store_string(unsigned long addr, void *dest, void *base) { int maxlen = get_loc_len(*(u32 *)dest); void *__dest; @@ -94,7 +94,7 @@ kern_fetch_store_string(unsigned long addr, void *dest, void *base) #ifdef CONFIG_ARCH_HAS_NON_OVERLAPPING_ADDRESS_SPACE if ((unsigned long)addr < TASK_SIZE) - return kern_fetch_store_string_user(addr, dest, base); + return fetch_store_string_user(addr, dest, base); #endif if (unlikely(!maxlen)) @@ -112,4 +112,22 @@ kern_fetch_store_string(unsigned long addr, void *dest, void *base) return ret; } +static nokprobe_inline int +probe_mem_read_user(void *dest, void *src, size_t size) +{ + const void __user *uaddr = (__force const void __user *)src; + + return copy_from_user_nofault(dest, uaddr, size); +} + +static nokprobe_inline int +probe_mem_read(void *dest, void *src, size_t size) +{ +#ifdef CONFIG_ARCH_HAS_NON_OVERLAPPING_ADDRESS_SPACE + if ((unsigned long)src < TASK_SIZE) + return probe_mem_read_user(dest, src, size); +#endif + return copy_from_kernel_nofault(dest, src, size); +} + #endif /* __TRACE_PROBE_KERNEL_H_ */ diff --git a/kernel/trace/trace_probe_tmpl.h b/kernel/trace/trace_probe_tmpl.h index 5cea672243f6..00707630788d 100644 --- a/kernel/trace/trace_probe_tmpl.h +++ b/kernel/trace/trace_probe_tmpl.h @@ -98,6 +98,26 @@ fetch_store_symstring(unsigned long addr, void *dest, void *base) return sprint_symbol(__dest, addr); } +/* common part of process_fetch_insn*/ +static nokprobe_inline int +process_common_fetch_insn(struct fetch_insn *code, unsigned long *val) +{ + switch (code->op) { + case FETCH_OP_IMM: + *val = code->immediate; + break; + case FETCH_OP_COMM: + *val = (unsigned long)current->comm; + break; + case FETCH_OP_DATA: + *val = (unsigned long)code->data; + break; + default: + return -EILSEQ; + } + return 0; +} + /* From the 2nd stage, routine is same */ static nokprobe_inline int process_fetch_insn_bottom(struct fetch_insn *code, unsigned long val, @@ -253,31 +273,3 @@ store_trace_args(void *data, struct trace_probe *tp, void *rec, } } } - -static inline int -print_probe_args(struct trace_seq *s, struct probe_arg *args, int nr_args, - u8 *data, void *field) -{ - void *p; - int i, j; - - for (i = 0; i < nr_args; i++) { - struct probe_arg *a = args + i; - - trace_seq_printf(s, " %s=", a->name); - if (likely(!a->count)) { - if (!a->type->print(s, data + a->offset, field)) - return -ENOMEM; - continue; - } - trace_seq_putc(s, '{'); - p = data + a->offset; - for (j = 0; j < a->count; j++) { - if (!a->type->print(s, p, field)) - return -ENOMEM; - trace_seq_putc(s, j == a->count - 1 ? '}' : ','); - p += a->type->size; - } - } - return 0; -} diff --git a/kernel/trace/trace_seq.c b/kernel/trace/trace_seq.c index 9c90b3a7dce2..e5e299260d0c 100644 --- a/kernel/trace/trace_seq.c +++ b/kernel/trace/trace_seq.c @@ -403,3 +403,26 @@ int trace_seq_hex_dump(struct trace_seq *s, const char *prefix_str, return 1; } EXPORT_SYMBOL(trace_seq_hex_dump); + +/* + * trace_seq_acquire - acquire seq buffer with size len + * @s: trace sequence descriptor + * @len: size of buffer to be acquired + * + * acquire buffer with size of @len from trace_seq for output usage, + * user can fill string into that buffer. + * + * Returns start address of acquired buffer. + * + * it allow multiple usage in one trace output function call. 
+ */ +char *trace_seq_acquire(struct trace_seq *s, unsigned int len) +{ + char *ret = trace_seq_buffer_ptr(s); + + if (!WARN_ON_ONCE(seq_buf_buffer_left(&s->seq) < len)) + seq_buf_commit(&s->seq, len); + + return ret; +} +EXPORT_SYMBOL(trace_seq_acquire); diff --git a/kernel/trace/trace_synth.h b/kernel/trace/trace_synth.h index b29595fe3ac5..43f6fb6078db 100644 --- a/kernel/trace/trace_synth.h +++ b/kernel/trace/trace_synth.h @@ -18,6 +18,7 @@ struct synth_field { bool is_signed; bool is_string; bool is_dynamic; + bool is_stack; }; struct synth_event { diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c index 8d64b6553aed..8b92e34ff0c8 100644 --- a/kernel/trace/trace_uprobe.c +++ b/kernel/trace/trace_uprobe.c @@ -220,6 +220,7 @@ process_fetch_insn(struct fetch_insn *code, void *rec, void *dest, { struct pt_regs *regs = rec; unsigned long val; + int ret; /* 1st stage: get value from context */ switch (code->op) { @@ -235,20 +236,16 @@ process_fetch_insn(struct fetch_insn *code, void *rec, void *dest, case FETCH_OP_RETVAL: val = regs_return_value(regs); break; - case FETCH_OP_IMM: - val = code->immediate; - break; case FETCH_OP_COMM: val = FETCH_TOKEN_COMM; break; - case FETCH_OP_DATA: - val = (unsigned long)code->data; - break; case FETCH_OP_FOFFS: val = translate_user_vaddr(code->immediate); break; default: - return -EILSEQ; + ret = process_common_fetch_insn(code, &val); + if (ret < 0) + return ret; } code++; @@ -1042,7 +1039,7 @@ print_uprobe_event(struct trace_iterator *iter, int flags, struct trace_event *e data = DATAOF_TRACE_ENTRY(entry, false); } - if (print_probe_args(s, tu->tp.args, tu->tp.nr_args, data, entry) < 0) + if (trace_probe_print_args(s, tu->tp.args, tu->tp.nr_args, data, entry) < 0) goto out; trace_seq_putc(s, '\n'); diff --git a/kernel/tracepoint.c b/kernel/tracepoint.c index f23144af5743..8d1507dd0724 100644 --- a/kernel/tracepoint.c +++ b/kernel/tracepoint.c @@ -571,8 +571,8 @@ static void for_each_tracepoint_range( bool trace_module_has_bad_taint(struct module *mod) { return mod->taints & ~((1 << TAINT_OOT_MODULE) | (1 << TAINT_CRAP) | - (1 << TAINT_UNSIGNED_MODULE) | - (1 << TAINT_TEST)); + (1 << TAINT_UNSIGNED_MODULE) | (1 << TAINT_TEST) | + (1 << TAINT_LIVEPATCH)); } static BLOCKING_NOTIFIER_HEAD(tracepoint_notify_list); diff --git a/kernel/umh.c b/kernel/umh.c index fbf872c624cb..2a4708277335 100644 --- a/kernel/umh.c +++ b/kernel/umh.c @@ -501,9 +501,9 @@ static int proc_cap_handler(struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { struct ctl_table t; - unsigned long cap_array[_KERNEL_CAPABILITY_U32S]; - kernel_cap_t new_cap; - int err, i; + unsigned long cap_array[2]; + kernel_cap_t new_cap, *cap; + int err; if (write && (!capable(CAP_SETPCAP) || !capable(CAP_SYS_MODULE))) @@ -514,14 +514,16 @@ static int proc_cap_handler(struct ctl_table *table, int write, * userspace if this is a read. 
*/ spin_lock(&umh_sysctl_lock); - for (i = 0; i < _KERNEL_CAPABILITY_U32S; i++) { - if (table->data == CAP_BSET) - cap_array[i] = usermodehelper_bset.cap[i]; - else if (table->data == CAP_PI) - cap_array[i] = usermodehelper_inheritable.cap[i]; - else - BUG(); - } + if (table->data == CAP_BSET) + cap = &usermodehelper_bset; + else if (table->data == CAP_PI) + cap = &usermodehelper_inheritable; + else + BUG(); + + /* Legacy format: capabilities are exposed as two 32-bit values */ + cap_array[0] = (u32) cap->val; + cap_array[1] = cap->val >> 32; spin_unlock(&umh_sysctl_lock); t = *table; @@ -535,22 +537,15 @@ static int proc_cap_handler(struct ctl_table *table, int write, if (err < 0) return err; - /* - * convert from the sysctl array of ulongs to the kernel_cap_t - * internal representation - */ - for (i = 0; i < _KERNEL_CAPABILITY_U32S; i++) - new_cap.cap[i] = cap_array[i]; + new_cap.val = (u32)cap_array[0]; + new_cap.val += (u64)cap_array[1] << 32; /* * Drop everything not in the new_cap (but don't add things) */ if (write) { spin_lock(&umh_sysctl_lock); - if (table->data == CAP_BSET) - usermodehelper_bset = cap_intersect(usermodehelper_bset, new_cap); - if (table->data == CAP_PI) - usermodehelper_inheritable = cap_intersect(usermodehelper_inheritable, new_cap); + *cap = cap_intersect(*cap, new_cap); spin_unlock(&umh_sysctl_lock); } @@ -561,14 +556,14 @@ struct ctl_table usermodehelper_table[] = { { .procname = "bset", .data = CAP_BSET, - .maxlen = _KERNEL_CAPABILITY_U32S * sizeof(unsigned long), + .maxlen = 2 * sizeof(unsigned long), .mode = 0600, .proc_handler = proc_cap_handler, }, { .procname = "inheritable", .data = CAP_PI, - .maxlen = _KERNEL_CAPABILITY_U32S * sizeof(unsigned long), + .maxlen = 2 * sizeof(unsigned long), .mode = 0600, .proc_handler = proc_cap_handler, }, diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c index 54211dbd516c..1d8e47bed3f1 100644 --- a/kernel/user_namespace.c +++ b/kernel/user_namespace.c @@ -229,7 +229,7 @@ void __put_user_ns(struct user_namespace *ns) EXPORT_SYMBOL(__put_user_ns); /** - * idmap_key struct holds the information necessary to find an idmapping in a + * struct idmap_key - holds the information necessary to find an idmapping in a * sorted idmap array. It is passed to cmp_map_id() as first argument. */ struct idmap_key { |