Diffstat (limited to 'kernel')
157 files changed, 5917 insertions, 2890 deletions
diff --git a/kernel/Makefile b/kernel/Makefile index af601b9bda0e..dddf51266719 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -97,7 +97,6 @@ obj-$(CONFIG_TASK_DELAY_ACCT) += delayacct.o obj-$(CONFIG_TASKSTATS) += taskstats.o tsacct.o obj-$(CONFIG_TRACEPOINTS) += tracepoint.o obj-$(CONFIG_LATENCYTOP) += latencytop.o -obj-$(CONFIG_ELFCORE) += elfcore.o obj-$(CONFIG_FUNCTION_TRACER) += trace/ obj-$(CONFIG_TRACING) += trace/ obj-$(CONFIG_TRACE_CLOCK) += trace/ @@ -123,6 +122,7 @@ obj-$(CONFIG_HAS_IOMEM) += iomem.o obj-$(CONFIG_RSEQ) += rseq.o obj-$(CONFIG_WATCH_QUEUE) += watch_queue.o +obj-$(CONFIG_RESOURCE_KUNIT_TEST) += resource_kunit.o obj-$(CONFIG_SYSCTL_KUNIT_TEST) += sysctl-test.o CFLAGS_stackleak.o += $(DISABLE_STACKLEAK_PLUGIN) diff --git a/kernel/auditsc.c b/kernel/auditsc.c index 8dba8f0983b5..c00aa5837965 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -952,7 +952,7 @@ int audit_alloc(struct task_struct *tsk) state = audit_filter_task(tsk, &key); if (state == AUDIT_DISABLED) { - clear_tsk_thread_flag(tsk, TIF_SYSCALL_AUDIT); + clear_task_syscall_work(tsk, SYSCALL_AUDIT); return 0; } @@ -964,7 +964,7 @@ int audit_alloc(struct task_struct *tsk) context->filterkey = key; audit_set_context(tsk, context); - set_tsk_thread_flag(tsk, TIF_SYSCALL_AUDIT); + set_task_syscall_work(tsk, SYSCALL_AUDIT); return 0; } diff --git a/kernel/bpf/Makefile b/kernel/bpf/Makefile index bdc8cd1b6767..d1249340fd6b 100644 --- a/kernel/bpf/Makefile +++ b/kernel/bpf/Makefile @@ -1,11 +1,16 @@ # SPDX-License-Identifier: GPL-2.0 obj-y := core.o -CFLAGS_core.o += $(call cc-disable-warning, override-init) +ifneq ($(CONFIG_BPF_JIT_ALWAYS_ON),y) +# ___bpf_prog_run() needs GCSE disabled on x86; see 3193c0836f203 for details +cflags-nogcse-$(CONFIG_X86)$(CONFIG_CC_IS_GCC) := -fno-gcse +endif +CFLAGS_core.o += $(call cc-disable-warning, override-init) $(cflags-nogcse-yy) obj-$(CONFIG_BPF_SYSCALL) += syscall.o verifier.o inode.o helpers.o tnum.o bpf_iter.o map_iter.o task_iter.o prog_iter.o obj-$(CONFIG_BPF_SYSCALL) += hashtab.o arraymap.o percpu_freelist.o bpf_lru_list.o lpm_trie.o map_in_map.o obj-$(CONFIG_BPF_SYSCALL) += local_storage.o queue_stack_maps.o ringbuf.o obj-${CONFIG_BPF_LSM} += bpf_inode_storage.o +obj-${CONFIG_BPF_LSM} += bpf_task_storage.o obj-$(CONFIG_BPF_SYSCALL) += disasm.o obj-$(CONFIG_BPF_JIT) += trampoline.o obj-$(CONFIG_BPF_SYSCALL) += btf.o diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c index c6c81eceb68f..1f8453343bf2 100644 --- a/kernel/bpf/arraymap.c +++ b/kernel/bpf/arraymap.c @@ -34,8 +34,8 @@ static int bpf_array_alloc_percpu(struct bpf_array *array) int i; for (i = 0; i < array->map.max_entries; i++) { - ptr = __alloc_percpu_gfp(array->elem_size, 8, - GFP_USER | __GFP_NOWARN); + ptr = bpf_map_alloc_percpu(&array->map, array->elem_size, 8, + GFP_USER | __GFP_NOWARN); if (!ptr) { bpf_array_free_percpu(array); return -ENOMEM; @@ -81,11 +81,10 @@ int array_map_alloc_check(union bpf_attr *attr) static struct bpf_map *array_map_alloc(union bpf_attr *attr) { bool percpu = attr->map_type == BPF_MAP_TYPE_PERCPU_ARRAY; - int ret, numa_node = bpf_map_attr_numa_node(attr); + int numa_node = bpf_map_attr_numa_node(attr); u32 elem_size, index_mask, max_entries; bool bypass_spec_v1 = bpf_bypass_spec_v1(); - u64 cost, array_size, mask64; - struct bpf_map_memory mem; + u64 array_size, mask64; struct bpf_array *array; elem_size = round_up(attr->value_size, 8); @@ -126,44 +125,29 @@ static struct bpf_map *array_map_alloc(union bpf_attr *attr) } } - /* make sure there is no u32 
overflow later in round_up() */ - cost = array_size; - if (percpu) - cost += (u64)attr->max_entries * elem_size * num_possible_cpus(); - - ret = bpf_map_charge_init(&mem, cost); - if (ret < 0) - return ERR_PTR(ret); - /* allocate all map elements and zero-initialize them */ if (attr->map_flags & BPF_F_MMAPABLE) { void *data; /* kmalloc'ed memory can't be mmap'ed, use explicit vmalloc */ data = bpf_map_area_mmapable_alloc(array_size, numa_node); - if (!data) { - bpf_map_charge_finish(&mem); + if (!data) return ERR_PTR(-ENOMEM); - } array = data + PAGE_ALIGN(sizeof(struct bpf_array)) - offsetof(struct bpf_array, value); } else { array = bpf_map_area_alloc(array_size, numa_node); } - if (!array) { - bpf_map_charge_finish(&mem); + if (!array) return ERR_PTR(-ENOMEM); - } array->index_mask = index_mask; array->map.bypass_spec_v1 = bypass_spec_v1; /* copy mandatory map attributes */ bpf_map_init_from_attr(&array->map, attr); - bpf_map_charge_move(&array->map.memory, &mem); array->elem_size = elem_size; if (percpu && bpf_array_alloc_percpu(array)) { - bpf_map_charge_finish(&array->map.memory); bpf_map_area_free(array); return ERR_PTR(-ENOMEM); } @@ -1018,7 +1002,7 @@ static struct bpf_map *prog_array_map_alloc(union bpf_attr *attr) struct bpf_array_aux *aux; struct bpf_map *map; - aux = kzalloc(sizeof(*aux), GFP_KERNEL); + aux = kzalloc(sizeof(*aux), GFP_KERNEL_ACCOUNT); if (!aux) return ERR_PTR(-ENOMEM); diff --git a/kernel/bpf/bpf_iter.c b/kernel/bpf/bpf_iter.c index 8f10e30ea0b0..5454161407f1 100644 --- a/kernel/bpf/bpf_iter.c +++ b/kernel/bpf/bpf_iter.c @@ -67,6 +67,15 @@ static void bpf_iter_done_stop(struct seq_file *seq) iter_priv->done_stop = true; } +static bool bpf_iter_support_resched(struct seq_file *seq) +{ + struct bpf_iter_priv_data *iter_priv; + + iter_priv = container_of(seq->private, struct bpf_iter_priv_data, + target_private); + return iter_priv->tinfo->reg_info->feature & BPF_ITER_RESCHED; +} + /* maximum visited objects before bailing out */ #define MAX_ITER_OBJECTS 1000000 @@ -83,6 +92,7 @@ static ssize_t bpf_seq_read(struct file *file, char __user *buf, size_t size, struct seq_file *seq = file->private_data; size_t n, offs, copied = 0; int err = 0, num_objs = 0; + bool can_resched; void *p; mutex_lock(&seq->lock); @@ -135,6 +145,7 @@ static ssize_t bpf_seq_read(struct file *file, char __user *buf, size_t size, goto done; } + can_resched = bpf_iter_support_resched(seq); while (1) { loff_t pos = seq->index; @@ -180,6 +191,9 @@ static ssize_t bpf_seq_read(struct file *file, char __user *buf, size_t size, } break; } + + if (can_resched) + cond_resched(); } stop: offs = seq->count; diff --git a/kernel/bpf/bpf_local_storage.c b/kernel/bpf/bpf_local_storage.c index 5d3a7af9ba9b..dd5aedee99e7 100644 --- a/kernel/bpf/bpf_local_storage.c +++ b/kernel/bpf/bpf_local_storage.c @@ -67,7 +67,8 @@ bpf_selem_alloc(struct bpf_local_storage_map *smap, void *owner, if (charge_mem && mem_charge(smap, owner, smap->elem_size)) return NULL; - selem = kzalloc(smap->elem_size, GFP_ATOMIC | __GFP_NOWARN); + selem = bpf_map_kzalloc(&smap->map, smap->elem_size, + GFP_ATOMIC | __GFP_NOWARN); if (selem) { if (value) memcpy(SDATA(selem)->data, value, smap->map.value_size); @@ -264,7 +265,8 @@ int bpf_local_storage_alloc(void *owner, if (err) return err; - storage = kzalloc(sizeof(*storage), GFP_ATOMIC | __GFP_NOWARN); + storage = bpf_map_kzalloc(&smap->map, sizeof(*storage), + GFP_ATOMIC | __GFP_NOWARN); if (!storage) { err = -ENOMEM; goto uncharge; @@ -543,10 +545,8 @@ struct bpf_local_storage_map 
*bpf_local_storage_map_alloc(union bpf_attr *attr) struct bpf_local_storage_map *smap; unsigned int i; u32 nbuckets; - u64 cost; - int ret; - smap = kzalloc(sizeof(*smap), GFP_USER | __GFP_NOWARN); + smap = kzalloc(sizeof(*smap), GFP_USER | __GFP_NOWARN | __GFP_ACCOUNT); if (!smap) return ERR_PTR(-ENOMEM); bpf_map_init_from_attr(&smap->map, attr); @@ -555,18 +555,10 @@ struct bpf_local_storage_map *bpf_local_storage_map_alloc(union bpf_attr *attr) /* Use at least 2 buckets, select_bucket() is undefined behavior with 1 bucket */ nbuckets = max_t(u32, 2, nbuckets); smap->bucket_log = ilog2(nbuckets); - cost = sizeof(*smap->buckets) * nbuckets + sizeof(*smap); - - ret = bpf_map_charge_init(&smap->map.memory, cost); - if (ret < 0) { - kfree(smap); - return ERR_PTR(ret); - } smap->buckets = kvcalloc(sizeof(*smap->buckets), nbuckets, - GFP_USER | __GFP_NOWARN); + GFP_USER | __GFP_NOWARN | __GFP_ACCOUNT); if (!smap->buckets) { - bpf_map_charge_finish(&smap->map.memory); kfree(smap); return ERR_PTR(-ENOMEM); } diff --git a/kernel/bpf/bpf_lsm.c b/kernel/bpf/bpf_lsm.c index 78ea8a7bd27f..70e5e0b6d69d 100644 --- a/kernel/bpf/bpf_lsm.c +++ b/kernel/bpf/bpf_lsm.c @@ -7,12 +7,15 @@ #include <linux/filter.h> #include <linux/bpf.h> #include <linux/btf.h> +#include <linux/binfmts.h> #include <linux/lsm_hooks.h> #include <linux/bpf_lsm.h> #include <linux/kallsyms.h> #include <linux/bpf_verifier.h> #include <net/bpf_sk_storage.h> #include <linux/bpf_local_storage.h> +#include <linux/btf_ids.h> +#include <linux/ima.h> /* For every LSM hook that allows attachment of BPF programs, declare a nop * function where a BPF program can be attached. @@ -26,7 +29,11 @@ noinline RET bpf_lsm_##NAME(__VA_ARGS__) \ #include <linux/lsm_hook_defs.h> #undef LSM_HOOK -#define BPF_LSM_SYM_PREFX "bpf_lsm_" +#define LSM_HOOK(RET, DEFAULT, NAME, ...) 
BTF_ID(func, bpf_lsm_##NAME) +BTF_SET_START(bpf_lsm_hooks) +#include <linux/lsm_hook_defs.h> +#undef LSM_HOOK +BTF_SET_END(bpf_lsm_hooks) int bpf_lsm_verify_prog(struct bpf_verifier_log *vlog, const struct bpf_prog *prog) @@ -37,8 +44,7 @@ int bpf_lsm_verify_prog(struct bpf_verifier_log *vlog, return -EINVAL; } - if (strncmp(BPF_LSM_SYM_PREFX, prog->aux->attach_func_name, - sizeof(BPF_LSM_SYM_PREFX) - 1)) { + if (!btf_id_set_contains(&bpf_lsm_hooks, prog->aux->attach_btf_id)) { bpf_log(vlog, "attach_btf_id %u points to wrong type name %s\n", prog->aux->attach_btf_id, prog->aux->attach_func_name); return -EINVAL; @@ -47,6 +53,52 @@ int bpf_lsm_verify_prog(struct bpf_verifier_log *vlog, return 0; } +/* Mask for all the currently supported BPRM option flags */ +#define BPF_F_BRPM_OPTS_MASK BPF_F_BPRM_SECUREEXEC + +BPF_CALL_2(bpf_bprm_opts_set, struct linux_binprm *, bprm, u64, flags) +{ + if (flags & ~BPF_F_BRPM_OPTS_MASK) + return -EINVAL; + + bprm->secureexec = (flags & BPF_F_BPRM_SECUREEXEC); + return 0; +} + +BTF_ID_LIST_SINGLE(bpf_bprm_opts_set_btf_ids, struct, linux_binprm) + +const static struct bpf_func_proto bpf_bprm_opts_set_proto = { + .func = bpf_bprm_opts_set, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_BTF_ID, + .arg1_btf_id = &bpf_bprm_opts_set_btf_ids[0], + .arg2_type = ARG_ANYTHING, +}; + +BPF_CALL_3(bpf_ima_inode_hash, struct inode *, inode, void *, dst, u32, size) +{ + return ima_inode_hash(inode, dst, size); +} + +static bool bpf_ima_inode_hash_allowed(const struct bpf_prog *prog) +{ + return bpf_lsm_is_sleepable_hook(prog->aux->attach_btf_id); +} + +BTF_ID_LIST_SINGLE(bpf_ima_inode_hash_btf_ids, struct, inode) + +const static struct bpf_func_proto bpf_ima_inode_hash_proto = { + .func = bpf_ima_inode_hash, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_BTF_ID, + .arg1_btf_id = &bpf_ima_inode_hash_btf_ids[0], + .arg2_type = ARG_PTR_TO_UNINIT_MEM, + .arg3_type = ARG_CONST_SIZE, + .allowed = bpf_ima_inode_hash_allowed, +}; + static const struct bpf_func_proto * bpf_lsm_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) { @@ -59,11 +111,103 @@ bpf_lsm_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_sk_storage_get_proto; case BPF_FUNC_sk_storage_delete: return &bpf_sk_storage_delete_proto; + case BPF_FUNC_spin_lock: + return &bpf_spin_lock_proto; + case BPF_FUNC_spin_unlock: + return &bpf_spin_unlock_proto; + case BPF_FUNC_task_storage_get: + return &bpf_task_storage_get_proto; + case BPF_FUNC_task_storage_delete: + return &bpf_task_storage_delete_proto; + case BPF_FUNC_bprm_opts_set: + return &bpf_bprm_opts_set_proto; + case BPF_FUNC_ima_inode_hash: + return prog->aux->sleepable ? &bpf_ima_inode_hash_proto : NULL; default: return tracing_prog_func_proto(func_id, prog); } } +/* The set of hooks which are called without pagefaults disabled and are allowed + * to "sleep" and thus can be used for sleepable BPF programs.
+ */ +BTF_SET_START(sleepable_lsm_hooks) +BTF_ID(func, bpf_lsm_bpf) +BTF_ID(func, bpf_lsm_bpf_map) +BTF_ID(func, bpf_lsm_bpf_map_alloc_security) +BTF_ID(func, bpf_lsm_bpf_map_free_security) +BTF_ID(func, bpf_lsm_bpf_prog) +BTF_ID(func, bpf_lsm_bprm_check_security) +BTF_ID(func, bpf_lsm_bprm_committed_creds) +BTF_ID(func, bpf_lsm_bprm_committing_creds) +BTF_ID(func, bpf_lsm_bprm_creds_for_exec) +BTF_ID(func, bpf_lsm_bprm_creds_from_file) +BTF_ID(func, bpf_lsm_capget) +BTF_ID(func, bpf_lsm_capset) +BTF_ID(func, bpf_lsm_cred_prepare) +BTF_ID(func, bpf_lsm_file_ioctl) +BTF_ID(func, bpf_lsm_file_lock) +BTF_ID(func, bpf_lsm_file_open) +BTF_ID(func, bpf_lsm_file_receive) +BTF_ID(func, bpf_lsm_inet_conn_established) +BTF_ID(func, bpf_lsm_inode_create) +BTF_ID(func, bpf_lsm_inode_free_security) +BTF_ID(func, bpf_lsm_inode_getattr) +BTF_ID(func, bpf_lsm_inode_getxattr) +BTF_ID(func, bpf_lsm_inode_mknod) +BTF_ID(func, bpf_lsm_inode_need_killpriv) +BTF_ID(func, bpf_lsm_inode_post_setxattr) +BTF_ID(func, bpf_lsm_inode_readlink) +BTF_ID(func, bpf_lsm_inode_rename) +BTF_ID(func, bpf_lsm_inode_rmdir) +BTF_ID(func, bpf_lsm_inode_setattr) +BTF_ID(func, bpf_lsm_inode_setxattr) +BTF_ID(func, bpf_lsm_inode_symlink) +BTF_ID(func, bpf_lsm_inode_unlink) +BTF_ID(func, bpf_lsm_kernel_module_request) +BTF_ID(func, bpf_lsm_kernfs_init_security) +BTF_ID(func, bpf_lsm_key_free) +BTF_ID(func, bpf_lsm_mmap_file) +BTF_ID(func, bpf_lsm_netlink_send) +BTF_ID(func, bpf_lsm_path_notify) +BTF_ID(func, bpf_lsm_release_secctx) +BTF_ID(func, bpf_lsm_sb_alloc_security) +BTF_ID(func, bpf_lsm_sb_eat_lsm_opts) +BTF_ID(func, bpf_lsm_sb_kern_mount) +BTF_ID(func, bpf_lsm_sb_mount) +BTF_ID(func, bpf_lsm_sb_remount) +BTF_ID(func, bpf_lsm_sb_set_mnt_opts) +BTF_ID(func, bpf_lsm_sb_show_options) +BTF_ID(func, bpf_lsm_sb_statfs) +BTF_ID(func, bpf_lsm_sb_umount) +BTF_ID(func, bpf_lsm_settime) +BTF_ID(func, bpf_lsm_socket_accept) +BTF_ID(func, bpf_lsm_socket_bind) +BTF_ID(func, bpf_lsm_socket_connect) +BTF_ID(func, bpf_lsm_socket_create) +BTF_ID(func, bpf_lsm_socket_getpeername) +BTF_ID(func, bpf_lsm_socket_getpeersec_dgram) +BTF_ID(func, bpf_lsm_socket_getsockname) +BTF_ID(func, bpf_lsm_socket_getsockopt) +BTF_ID(func, bpf_lsm_socket_listen) +BTF_ID(func, bpf_lsm_socket_post_create) +BTF_ID(func, bpf_lsm_socket_recvmsg) +BTF_ID(func, bpf_lsm_socket_sendmsg) +BTF_ID(func, bpf_lsm_socket_shutdown) +BTF_ID(func, bpf_lsm_socket_socketpair) +BTF_ID(func, bpf_lsm_syslog) +BTF_ID(func, bpf_lsm_task_alloc) +BTF_ID(func, bpf_lsm_task_getsecid) +BTF_ID(func, bpf_lsm_task_prctl) +BTF_ID(func, bpf_lsm_task_setscheduler) +BTF_ID(func, bpf_lsm_task_to_inode) +BTF_SET_END(sleepable_lsm_hooks) + +bool bpf_lsm_is_sleepable_hook(u32 btf_id) +{ + return btf_id_set_contains(&sleepable_lsm_hooks, btf_id); +} + const struct bpf_prog_ops lsm_prog_ops = { }; diff --git a/kernel/bpf/bpf_struct_ops.c b/kernel/bpf/bpf_struct_ops.c index 4c3b543bb33b..1a666a975416 100644 --- a/kernel/bpf/bpf_struct_ops.c +++ b/kernel/bpf/bpf_struct_ops.c @@ -548,12 +548,10 @@ static int bpf_struct_ops_map_alloc_check(union bpf_attr *attr) static struct bpf_map *bpf_struct_ops_map_alloc(union bpf_attr *attr) { const struct bpf_struct_ops *st_ops; - size_t map_total_size, st_map_size; + size_t st_map_size; struct bpf_struct_ops_map *st_map; const struct btf_type *t, *vt; - struct bpf_map_memory mem; struct bpf_map *map; - int err; if (!bpf_capable()) return ERR_PTR(-EPERM); @@ -573,20 +571,11 @@ static struct bpf_map *bpf_struct_ops_map_alloc(union bpf_attr *attr) * struct 
bpf_struct_ops_tcp_congestions_ops */ (vt->size - sizeof(struct bpf_struct_ops_value)); - map_total_size = st_map_size + - /* uvalue */ - sizeof(vt->size) + - /* struct bpf_progs **progs */ - btf_type_vlen(t) * sizeof(struct bpf_prog *); - err = bpf_map_charge_init(&mem, map_total_size); - if (err < 0) - return ERR_PTR(err); st_map = bpf_map_area_alloc(st_map_size, NUMA_NO_NODE); - if (!st_map) { - bpf_map_charge_finish(&mem); + if (!st_map) return ERR_PTR(-ENOMEM); - } + st_map->st_ops = st_ops; map = &st_map->map; @@ -597,14 +586,12 @@ static struct bpf_map *bpf_struct_ops_map_alloc(union bpf_attr *attr) st_map->image = bpf_jit_alloc_exec(PAGE_SIZE); if (!st_map->uvalue || !st_map->progs || !st_map->image) { bpf_struct_ops_map_free(map); - bpf_map_charge_finish(&mem); return ERR_PTR(-ENOMEM); } mutex_init(&st_map->lock); set_vm_flush_reset_perms(st_map->image); bpf_map_init_from_attr(map, attr); - bpf_map_charge_move(&map->memory, &mem); return map; } diff --git a/kernel/bpf/bpf_task_storage.c b/kernel/bpf/bpf_task_storage.c new file mode 100644 index 000000000000..4ef1959a78f2 --- /dev/null +++ b/kernel/bpf/bpf_task_storage.c @@ -0,0 +1,315 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (c) 2020 Facebook + * Copyright 2020 Google LLC. + */ + +#include <linux/pid.h> +#include <linux/sched.h> +#include <linux/rculist.h> +#include <linux/list.h> +#include <linux/hash.h> +#include <linux/types.h> +#include <linux/spinlock.h> +#include <linux/bpf.h> +#include <linux/bpf_local_storage.h> +#include <linux/filter.h> +#include <uapi/linux/btf.h> +#include <linux/bpf_lsm.h> +#include <linux/btf_ids.h> +#include <linux/fdtable.h> + +DEFINE_BPF_STORAGE_CACHE(task_cache); + +static struct bpf_local_storage __rcu **task_storage_ptr(void *owner) +{ + struct task_struct *task = owner; + struct bpf_storage_blob *bsb; + + bsb = bpf_task(task); + if (!bsb) + return NULL; + return &bsb->storage; +} + +static struct bpf_local_storage_data * +task_storage_lookup(struct task_struct *task, struct bpf_map *map, + bool cacheit_lockit) +{ + struct bpf_local_storage *task_storage; + struct bpf_local_storage_map *smap; + struct bpf_storage_blob *bsb; + + bsb = bpf_task(task); + if (!bsb) + return NULL; + + task_storage = rcu_dereference(bsb->storage); + if (!task_storage) + return NULL; + + smap = (struct bpf_local_storage_map *)map; + return bpf_local_storage_lookup(task_storage, smap, cacheit_lockit); +} + +void bpf_task_storage_free(struct task_struct *task) +{ + struct bpf_local_storage_elem *selem; + struct bpf_local_storage *local_storage; + bool free_task_storage = false; + struct bpf_storage_blob *bsb; + struct hlist_node *n; + + bsb = bpf_task(task); + if (!bsb) + return; + + rcu_read_lock(); + + local_storage = rcu_dereference(bsb->storage); + if (!local_storage) { + rcu_read_unlock(); + return; + } + + /* Neither the bpf_prog nor the bpf-map's syscall + * could be modifying the local_storage->list now. + * Thus, no elem can be added-to or deleted-from the + * local_storage->list by the bpf_prog or by the bpf-map's syscall. + * + * It is racing with bpf_local_storage_map_free() alone + * when unlinking elem from the local_storage->list and + * the map's bucket->list. + */ + raw_spin_lock_bh(&local_storage->lock); + hlist_for_each_entry_safe(selem, n, &local_storage->list, snode) { + /* Always unlink from map before unlinking from + * local_storage. 
+ */ + bpf_selem_unlink_map(selem); + free_task_storage = bpf_selem_unlink_storage_nolock( + local_storage, selem, false); + } + raw_spin_unlock_bh(&local_storage->lock); + rcu_read_unlock(); + + /* free_task_storage should always be true as long as + * local_storage->list was non-empty. + */ + if (free_task_storage) + kfree_rcu(local_storage, rcu); +} + +static void *bpf_pid_task_storage_lookup_elem(struct bpf_map *map, void *key) +{ + struct bpf_local_storage_data *sdata; + struct task_struct *task; + unsigned int f_flags; + struct pid *pid; + int fd, err; + + fd = *(int *)key; + pid = pidfd_get_pid(fd, &f_flags); + if (IS_ERR(pid)) + return ERR_CAST(pid); + + /* We should be in an RCU read side critical section, it should be safe + * to call pid_task. + */ + WARN_ON_ONCE(!rcu_read_lock_held()); + task = pid_task(pid, PIDTYPE_PID); + if (!task) { + err = -ENOENT; + goto out; + } + + sdata = task_storage_lookup(task, map, true); + put_pid(pid); + return sdata ? sdata->data : NULL; +out: + put_pid(pid); + return ERR_PTR(err); +} + +static int bpf_pid_task_storage_update_elem(struct bpf_map *map, void *key, + void *value, u64 map_flags) +{ + struct bpf_local_storage_data *sdata; + struct task_struct *task; + unsigned int f_flags; + struct pid *pid; + int fd, err; + + fd = *(int *)key; + pid = pidfd_get_pid(fd, &f_flags); + if (IS_ERR(pid)) + return PTR_ERR(pid); + + /* We should be in an RCU read side critical section, it should be safe + * to call pid_task. + */ + WARN_ON_ONCE(!rcu_read_lock_held()); + task = pid_task(pid, PIDTYPE_PID); + if (!task || !task_storage_ptr(task)) { + err = -ENOENT; + goto out; + } + + sdata = bpf_local_storage_update( + task, (struct bpf_local_storage_map *)map, value, map_flags); + + err = PTR_ERR_OR_ZERO(sdata); +out: + put_pid(pid); + return err; +} + +static int task_storage_delete(struct task_struct *task, struct bpf_map *map) +{ + struct bpf_local_storage_data *sdata; + + sdata = task_storage_lookup(task, map, false); + if (!sdata) + return -ENOENT; + + bpf_selem_unlink(SELEM(sdata)); + + return 0; +} + +static int bpf_pid_task_storage_delete_elem(struct bpf_map *map, void *key) +{ + struct task_struct *task; + unsigned int f_flags; + struct pid *pid; + int fd, err; + + fd = *(int *)key; + pid = pidfd_get_pid(fd, &f_flags); + if (IS_ERR(pid)) + return PTR_ERR(pid); + + /* We should be in an RCU read side critical section, it should be safe + * to call pid_task. + */ + WARN_ON_ONCE(!rcu_read_lock_held()); + task = pid_task(pid, PIDTYPE_PID); + if (!task) { + err = -ENOENT; + goto out; + } + + err = task_storage_delete(task, map); +out: + put_pid(pid); + return err; +} + +BPF_CALL_4(bpf_task_storage_get, struct bpf_map *, map, struct task_struct *, + task, void *, value, u64, flags) +{ + struct bpf_local_storage_data *sdata; + + if (flags & ~(BPF_LOCAL_STORAGE_GET_F_CREATE)) + return (unsigned long)NULL; + + /* explicitly check that the task_storage_ptr is not + * NULL as task_storage_lookup returns NULL in this case and + * bpf_local_storage_update expects the owner to have a + * valid storage pointer. + */ + if (!task_storage_ptr(task)) + return (unsigned long)NULL; + + sdata = task_storage_lookup(task, map, true); + if (sdata) + return (unsigned long)sdata->data; + + /* This helper must only be called from places where the lifetime of the task + * is guaranteed. Either by being refcounted or by being protected + * by an RCU read-side critical section. 
+ */ + if (flags & BPF_LOCAL_STORAGE_GET_F_CREATE) { + sdata = bpf_local_storage_update( + task, (struct bpf_local_storage_map *)map, value, + BPF_NOEXIST); + return IS_ERR(sdata) ? (unsigned long)NULL : + (unsigned long)sdata->data; + } + + return (unsigned long)NULL; +} + +BPF_CALL_2(bpf_task_storage_delete, struct bpf_map *, map, struct task_struct *, + task) +{ + /* This helper must only be called from places where the lifetime of the task + * is guaranteed. Either by being refcounted or by being protected + * by an RCU read-side critical section. + */ + return task_storage_delete(task, map); +} + +static int notsupp_get_next_key(struct bpf_map *map, void *key, void *next_key) +{ + return -ENOTSUPP; +} + +static struct bpf_map *task_storage_map_alloc(union bpf_attr *attr) +{ + struct bpf_local_storage_map *smap; + + smap = bpf_local_storage_map_alloc(attr); + if (IS_ERR(smap)) + return ERR_CAST(smap); + + smap->cache_idx = bpf_local_storage_cache_idx_get(&task_cache); + return &smap->map; +} + +static void task_storage_map_free(struct bpf_map *map) +{ + struct bpf_local_storage_map *smap; + + smap = (struct bpf_local_storage_map *)map; + bpf_local_storage_cache_idx_free(&task_cache, smap->cache_idx); + bpf_local_storage_map_free(smap); +} + +static int task_storage_map_btf_id; +const struct bpf_map_ops task_storage_map_ops = { + .map_meta_equal = bpf_map_meta_equal, + .map_alloc_check = bpf_local_storage_map_alloc_check, + .map_alloc = task_storage_map_alloc, + .map_free = task_storage_map_free, + .map_get_next_key = notsupp_get_next_key, + .map_lookup_elem = bpf_pid_task_storage_lookup_elem, + .map_update_elem = bpf_pid_task_storage_update_elem, + .map_delete_elem = bpf_pid_task_storage_delete_elem, + .map_check_btf = bpf_local_storage_map_check_btf, + .map_btf_name = "bpf_local_storage_map", + .map_btf_id = &task_storage_map_btf_id, + .map_owner_storage_ptr = task_storage_ptr, +}; + +BTF_ID_LIST_SINGLE(bpf_task_storage_btf_ids, struct, task_struct) + +const struct bpf_func_proto bpf_task_storage_get_proto = { + .func = bpf_task_storage_get, + .gpl_only = false, + .ret_type = RET_PTR_TO_MAP_VALUE_OR_NULL, + .arg1_type = ARG_CONST_MAP_PTR, + .arg2_type = ARG_PTR_TO_BTF_ID, + .arg2_btf_id = &bpf_task_storage_btf_ids[0], + .arg3_type = ARG_PTR_TO_MAP_VALUE_OR_NULL, + .arg4_type = ARG_ANYTHING, +}; + +const struct bpf_func_proto bpf_task_storage_delete_proto = { + .func = bpf_task_storage_delete, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_CONST_MAP_PTR, + .arg2_type = ARG_PTR_TO_BTF_ID, + .arg2_btf_id = &bpf_task_storage_btf_ids[0], +}; diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index ed7d02e8bc93..8d6bdb4f4d61 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -22,7 +22,8 @@ #include <linux/skmsg.h> #include <linux/perf_event.h> #include <linux/bsearch.h> -#include <linux/btf_ids.h> +#include <linux/kobject.h> +#include <linux/sysfs.h> #include <net/sock.h> /* BTF (BPF Type Format) is the meta data format which describes @@ -204,12 +205,19 @@ struct btf { const char *strings; void *nohdr_data; struct btf_header hdr; - u32 nr_types; + u32 nr_types; /* includes VOID for base BTF */ u32 types_size; u32 data_size; refcount_t refcnt; u32 id; struct rcu_head rcu; + + /* split BTF support */ + struct btf *base_btf; + u32 start_id; /* first type ID in this BTF (0 for base BTF) */ + u32 start_str_off; /* first string offset (0 for base BTF) */ + char name[MODULE_NAME_LEN]; + bool kernel_btf; }; enum verifier_phase { @@ -450,14 +458,27 @@ static bool 
btf_type_is_datasec(const struct btf_type *t) return BTF_INFO_KIND(t->info) == BTF_KIND_DATASEC; } +static u32 btf_nr_types_total(const struct btf *btf) +{ + u32 total = 0; + + while (btf) { + total += btf->nr_types; + btf = btf->base_btf; + } + + return total; +} + s32 btf_find_by_name_kind(const struct btf *btf, const char *name, u8 kind) { const struct btf_type *t; const char *tname; - u32 i; + u32 i, total; - for (i = 1; i <= btf->nr_types; i++) { - t = btf->types[i]; + total = btf_nr_types_total(btf); + for (i = 1; i < total; i++) { + t = btf_type_by_id(btf, i); if (BTF_INFO_KIND(t->info) != kind) continue; @@ -600,8 +621,14 @@ static const struct btf_kind_operations *btf_type_ops(const struct btf_type *t) static bool btf_name_offset_valid(const struct btf *btf, u32 offset) { - return BTF_STR_OFFSET_VALID(offset) && - offset < btf->hdr.str_len; + if (!BTF_STR_OFFSET_VALID(offset)) + return false; + + while (offset < btf->start_str_off) + btf = btf->base_btf; + + offset -= btf->start_str_off; + return offset < btf->hdr.str_len; } static bool __btf_name_char_ok(char c, bool first, bool dot_ok) @@ -615,10 +642,22 @@ static bool __btf_name_char_ok(char c, bool first, bool dot_ok) return true; } +static const char *btf_str_by_offset(const struct btf *btf, u32 offset) +{ + while (offset < btf->start_str_off) + btf = btf->base_btf; + + offset -= btf->start_str_off; + if (offset < btf->hdr.str_len) + return &btf->strings[offset]; + + return NULL; +} + static bool __btf_name_valid(const struct btf *btf, u32 offset, bool dot_ok) { /* offset must be valid */ - const char *src = &btf->strings[offset]; + const char *src = btf_str_by_offset(btf, offset); const char *src_limit; if (!__btf_name_char_ok(*src, true, dot_ok)) @@ -651,27 +690,28 @@ static bool btf_name_valid_section(const struct btf *btf, u32 offset) static const char *__btf_name_by_offset(const struct btf *btf, u32 offset) { + const char *name; + if (!offset) return "(anon)"; - else if (offset < btf->hdr.str_len) - return &btf->strings[offset]; - else - return "(invalid-name-offset)"; + + name = btf_str_by_offset(btf, offset); + return name ?: "(invalid-name-offset)"; } const char *btf_name_by_offset(const struct btf *btf, u32 offset) { - if (offset < btf->hdr.str_len) - return &btf->strings[offset]; - - return NULL; + return btf_str_by_offset(btf, offset); } const struct btf_type *btf_type_by_id(const struct btf *btf, u32 type_id) { - if (type_id > btf->nr_types) - return NULL; + while (type_id < btf->start_id) + btf = btf->base_btf; + type_id -= btf->start_id; + if (type_id >= btf->nr_types) + return NULL; return btf->types[type_id]; } @@ -1391,17 +1431,13 @@ static int btf_add_type(struct btf_verifier_env *env, struct btf_type *t) { struct btf *btf = env->btf; - /* < 2 because +1 for btf_void which is always in btf->types[0]. - * btf_void is not accounted in btf->nr_types because btf_void - * does not come from the BTF file. 
- */ - if (btf->types_size - btf->nr_types < 2) { + if (btf->types_size == btf->nr_types) { /* Expand 'types' array */ struct btf_type **new_types; u32 expand_by, new_size; - if (btf->types_size == BTF_MAX_TYPE) { + if (btf->start_id + btf->types_size == BTF_MAX_TYPE) { btf_verifier_log(env, "Exceeded max num of types"); return -E2BIG; } @@ -1415,18 +1451,23 @@ static int btf_add_type(struct btf_verifier_env *env, struct btf_type *t) if (!new_types) return -ENOMEM; - if (btf->nr_types == 0) - new_types[0] = &btf_void; - else + if (btf->nr_types == 0) { + if (!btf->base_btf) { + /* lazily init VOID type */ + new_types[0] = &btf_void; + btf->nr_types++; + } + } else { memcpy(new_types, btf->types, - sizeof(*btf->types) * (btf->nr_types + 1)); + sizeof(*btf->types) * btf->nr_types); + } kvfree(btf->types); btf->types = new_types; btf->types_size = new_size; } - btf->types[++(btf->nr_types)] = t; + btf->types[btf->nr_types++] = t; return 0; } @@ -1483,6 +1524,11 @@ static void btf_free_rcu(struct rcu_head *rcu) btf_free(btf); } +void btf_get(struct btf *btf) +{ + refcount_inc(&btf->refcnt); +} + void btf_put(struct btf *btf) { if (btf && refcount_dec_and_test(&btf->refcnt)) { @@ -1499,18 +1545,17 @@ static int env_resolve_init(struct btf_verifier_env *env) u32 *resolved_ids = NULL; u8 *visit_states = NULL; - /* +1 for btf_void */ - resolved_sizes = kvcalloc(nr_types + 1, sizeof(*resolved_sizes), + resolved_sizes = kvcalloc(nr_types, sizeof(*resolved_sizes), GFP_KERNEL | __GFP_NOWARN); if (!resolved_sizes) goto nomem; - resolved_ids = kvcalloc(nr_types + 1, sizeof(*resolved_ids), + resolved_ids = kvcalloc(nr_types, sizeof(*resolved_ids), GFP_KERNEL | __GFP_NOWARN); if (!resolved_ids) goto nomem; - visit_states = kvcalloc(nr_types + 1, sizeof(*visit_states), + visit_states = kvcalloc(nr_types, sizeof(*visit_states), GFP_KERNEL | __GFP_NOWARN); if (!visit_states) goto nomem; @@ -1562,21 +1607,27 @@ static bool env_type_is_resolve_sink(const struct btf_verifier_env *env, static bool env_type_is_resolved(const struct btf_verifier_env *env, u32 type_id) { - return env->visit_states[type_id] == RESOLVED; + /* base BTF types should be resolved by now */ + if (type_id < env->btf->start_id) + return true; + + return env->visit_states[type_id - env->btf->start_id] == RESOLVED; } static int env_stack_push(struct btf_verifier_env *env, const struct btf_type *t, u32 type_id) { + const struct btf *btf = env->btf; struct resolve_vertex *v; if (env->top_stack == MAX_RESOLVE_DEPTH) return -E2BIG; - if (env->visit_states[type_id] != NOT_VISITED) + if (type_id < btf->start_id + || env->visit_states[type_id - btf->start_id] != NOT_VISITED) return -EEXIST; - env->visit_states[type_id] = VISITED; + env->visit_states[type_id - btf->start_id] = VISITED; v = &env->stack[env->top_stack++]; v->t = t; @@ -1606,6 +1657,7 @@ static void env_stack_pop_resolved(struct btf_verifier_env *env, u32 type_id = env->stack[--(env->top_stack)].type_id; struct btf *btf = env->btf; + type_id -= btf->start_id; /* adjust to local type id */ btf->resolved_sizes[type_id] = resolved_size; btf->resolved_ids[type_id] = resolved_type_id; env->visit_states[type_id] = RESOLVED; @@ -1710,14 +1762,30 @@ btf_resolve_size(const struct btf *btf, const struct btf_type *type, return __btf_resolve_size(btf, type, type_size, NULL, NULL, NULL, NULL); } +static u32 btf_resolved_type_id(const struct btf *btf, u32 type_id) +{ + while (type_id < btf->start_id) + btf = btf->base_btf; + + return btf->resolved_ids[type_id - btf->start_id]; +} + /* The input param 
"type_id" must point to a needs_resolve type */ static const struct btf_type *btf_type_id_resolve(const struct btf *btf, u32 *type_id) { - *type_id = btf->resolved_ids[*type_id]; + *type_id = btf_resolved_type_id(btf, *type_id); return btf_type_by_id(btf, *type_id); } +static u32 btf_resolved_type_size(const struct btf *btf, u32 type_id) +{ + while (type_id < btf->start_id) + btf = btf->base_btf; + + return btf->resolved_sizes[type_id - btf->start_id]; +} + const struct btf_type *btf_type_id_size(const struct btf *btf, u32 *type_id, u32 *ret_size) { @@ -1732,7 +1800,7 @@ const struct btf_type *btf_type_id_size(const struct btf *btf, if (btf_type_has_size(size_type)) { size = size_type->size; } else if (btf_type_is_array(size_type)) { - size = btf->resolved_sizes[size_type_id]; + size = btf_resolved_type_size(btf, size_type_id); } else if (btf_type_is_ptr(size_type)) { size = sizeof(void *); } else { @@ -1740,14 +1808,14 @@ const struct btf_type *btf_type_id_size(const struct btf *btf, !btf_type_is_var(size_type))) return NULL; - size_type_id = btf->resolved_ids[size_type_id]; + size_type_id = btf_resolved_type_id(btf, size_type_id); size_type = btf_type_by_id(btf, size_type_id); if (btf_type_nosize_or_null(size_type)) return NULL; else if (btf_type_has_size(size_type)) size = size_type->size; else if (btf_type_is_array(size_type)) - size = btf->resolved_sizes[size_type_id]; + size = btf_resolved_type_size(btf, size_type_id); else if (btf_type_is_ptr(size_type)) size = sizeof(void *); else @@ -3799,7 +3867,7 @@ static int btf_check_all_metas(struct btf_verifier_env *env) cur = btf->nohdr_data + hdr->type_off; end = cur + hdr->type_len; - env->log_type_id = 1; + env->log_type_id = btf->base_btf ? btf->start_id : 1; while (cur < end) { struct btf_type *t = cur; s32 meta_size; @@ -3826,8 +3894,8 @@ static bool btf_resolve_valid(struct btf_verifier_env *env, return false; if (btf_type_is_struct(t) || btf_type_is_datasec(t)) - return !btf->resolved_ids[type_id] && - !btf->resolved_sizes[type_id]; + return !btf_resolved_type_id(btf, type_id) && + !btf_resolved_type_size(btf, type_id); if (btf_type_is_modifier(t) || btf_type_is_ptr(t) || btf_type_is_var(t)) { @@ -3847,7 +3915,7 @@ static bool btf_resolve_valid(struct btf_verifier_env *env, elem_type = btf_type_id_size(btf, &elem_type_id, &elem_size); return elem_type && !btf_type_is_modifier(elem_type) && (array->nelems * elem_size == - btf->resolved_sizes[type_id]); + btf_resolved_type_size(btf, type_id)); } return false; @@ -3889,7 +3957,8 @@ static int btf_resolve(struct btf_verifier_env *env, static int btf_check_all_types(struct btf_verifier_env *env) { struct btf *btf = env->btf; - u32 type_id; + const struct btf_type *t; + u32 type_id, i; int err; err = env_resolve_init(env); @@ -3897,8 +3966,9 @@ static int btf_check_all_types(struct btf_verifier_env *env) return err; env->phase++; - for (type_id = 1; type_id <= btf->nr_types; type_id++) { - const struct btf_type *t = btf_type_by_id(btf, type_id); + for (i = btf->base_btf ? 
0 : 1; i < btf->nr_types; i++) { + type_id = btf->start_id + i; + t = btf_type_by_id(btf, type_id); env->log_type_id = type_id; if (btf_type_needs_resolve(t) && @@ -3935,7 +4005,7 @@ static int btf_parse_type_sec(struct btf_verifier_env *env) return -EINVAL; } - if (!hdr->type_len) { + if (!env->btf->base_btf && !hdr->type_len) { btf_verifier_log(env, "No type found"); return -EINVAL; } @@ -3962,13 +4032,18 @@ static int btf_parse_str_sec(struct btf_verifier_env *env) return -EINVAL; } - if (!hdr->str_len || hdr->str_len - 1 > BTF_MAX_NAME_OFFSET || - start[0] || end[-1]) { + btf->strings = start; + + if (btf->base_btf && !hdr->str_len) + return 0; + if (!hdr->str_len || hdr->str_len - 1 > BTF_MAX_NAME_OFFSET || end[-1]) { + btf_verifier_log(env, "Invalid string section"); + return -EINVAL; + } + if (!btf->base_btf && start[0]) { btf_verifier_log(env, "Invalid string section"); return -EINVAL; } - - btf->strings = start; return 0; } @@ -4363,6 +4438,8 @@ struct btf *btf_parse_vmlinux(void) btf->data = __start_BTF; btf->data_size = __stop_BTF - __start_BTF; + btf->kernel_btf = true; + snprintf(btf->name, sizeof(btf->name), "vmlinux"); err = btf_parse_hdr(env); if (err) @@ -4388,6 +4465,81 @@ struct btf *btf_parse_vmlinux(void) bpf_struct_ops_init(btf, log); + refcount_set(&btf->refcnt, 1); + + err = btf_alloc_id(btf); + if (err) + goto errout; + + btf_verifier_env_free(env); + return btf; + +errout: + btf_verifier_env_free(env); + if (btf) { + kvfree(btf->types); + kfree(btf); + } + return ERR_PTR(err); +} + +#ifdef CONFIG_DEBUG_INFO_BTF_MODULES + +static struct btf *btf_parse_module(const char *module_name, const void *data, unsigned int data_size) +{ + struct btf_verifier_env *env = NULL; + struct bpf_verifier_log *log; + struct btf *btf = NULL, *base_btf; + int err; + + base_btf = bpf_get_btf_vmlinux(); + if (IS_ERR(base_btf)) + return base_btf; + if (!base_btf) + return ERR_PTR(-EINVAL); + + env = kzalloc(sizeof(*env), GFP_KERNEL | __GFP_NOWARN); + if (!env) + return ERR_PTR(-ENOMEM); + + log = &env->log; + log->level = BPF_LOG_KERNEL; + + btf = kzalloc(sizeof(*btf), GFP_KERNEL | __GFP_NOWARN); + if (!btf) { + err = -ENOMEM; + goto errout; + } + env->btf = btf; + + btf->base_btf = base_btf; + btf->start_id = base_btf->nr_types; + btf->start_str_off = base_btf->hdr.str_len; + btf->kernel_btf = true; + snprintf(btf->name, sizeof(btf->name), "%s", module_name); + + btf->data = kvmalloc(data_size, GFP_KERNEL | __GFP_NOWARN); + if (!btf->data) { + err = -ENOMEM; + goto errout; + } + memcpy(btf->data, data, data_size); + btf->data_size = data_size; + + err = btf_parse_hdr(env); + if (err) + goto errout; + + btf->nohdr_data = btf->data + btf->hdr.hdr_len; + + err = btf_parse_str_sec(env); + if (err) + goto errout; + + err = btf_check_all_metas(env); + if (err) + goto errout; + btf_verifier_env_free(env); refcount_set(&btf->refcnt, 1); return btf; @@ -4395,21 +4547,23 @@ struct btf *btf_parse_vmlinux(void) errout: btf_verifier_env_free(env); if (btf) { + kvfree(btf->data); kvfree(btf->types); kfree(btf); } return ERR_PTR(err); } +#endif /* CONFIG_DEBUG_INFO_BTF_MODULES */ + struct btf *bpf_prog_get_target_btf(const struct bpf_prog *prog) { struct bpf_prog *tgt_prog = prog->aux->dst_prog; - if (tgt_prog) { + if (tgt_prog) return tgt_prog->aux->btf; - } else { - return btf_vmlinux; - } + else + return prog->aux->attach_btf; } static bool is_string_ptr(struct btf *btf, const struct btf_type *t) @@ -4550,6 +4704,7 @@ bool btf_ctx_access(int off, int size, enum bpf_access_type type, if 
(ctx_arg_info->offset == off) { info->reg_type = ctx_arg_info->reg_type; + info->btf = btf_vmlinux; info->btf_id = ctx_arg_info->btf_id; return true; } @@ -4566,6 +4721,7 @@ bool btf_ctx_access(int off, int size, enum bpf_access_type type, ret = btf_translate_to_vmlinux(log, btf, t, tgt_type, arg); if (ret > 0) { + info->btf = btf_vmlinux; info->btf_id = ret; return true; } else { @@ -4573,6 +4729,7 @@ bool btf_ctx_access(int off, int size, enum bpf_access_type type, } } + info->btf = btf; info->btf_id = t->type; t = btf_type_by_id(btf, t->type); /* skip modifiers */ @@ -4599,7 +4756,7 @@ enum bpf_struct_walk_result { WALK_STRUCT, }; -static int btf_struct_walk(struct bpf_verifier_log *log, +static int btf_struct_walk(struct bpf_verifier_log *log, const struct btf *btf, const struct btf_type *t, int off, int size, u32 *next_btf_id) { @@ -4610,7 +4767,7 @@ static int btf_struct_walk(struct bpf_verifier_log *log, u32 vlen, elem_id, mid; again: - tname = __btf_name_by_offset(btf_vmlinux, t->name_off); + tname = __btf_name_by_offset(btf, t->name_off); if (!btf_type_is_struct(t)) { bpf_log(log, "Type '%s' is not a struct\n", tname); return -EINVAL; @@ -4627,7 +4784,7 @@ again: goto error; member = btf_type_member(t) + vlen - 1; - mtype = btf_type_skip_modifiers(btf_vmlinux, member->type, + mtype = btf_type_skip_modifiers(btf, member->type, NULL); if (!btf_type_is_array(mtype)) goto error; @@ -4643,7 +4800,7 @@ again: /* Only allow structure for now, can be relaxed for * other types later. */ - t = btf_type_skip_modifiers(btf_vmlinux, array_elem->type, + t = btf_type_skip_modifiers(btf, array_elem->type, NULL); if (!btf_type_is_struct(t)) goto error; @@ -4701,10 +4858,10 @@ error: /* type of the field */ mid = member->type; - mtype = btf_type_by_id(btf_vmlinux, member->type); - mname = __btf_name_by_offset(btf_vmlinux, member->name_off); + mtype = btf_type_by_id(btf, member->type); + mname = __btf_name_by_offset(btf, member->name_off); - mtype = __btf_resolve_size(btf_vmlinux, mtype, &msize, + mtype = __btf_resolve_size(btf, mtype, &msize, &elem_type, &elem_id, &total_nelems, &mid); if (IS_ERR(mtype)) { @@ -4799,7 +4956,7 @@ error: mname, moff, tname, off, size); return -EACCES; } - stype = btf_type_skip_modifiers(btf_vmlinux, mtype->type, &id); + stype = btf_type_skip_modifiers(btf, mtype->type, &id); if (btf_type_is_struct(stype)) { *next_btf_id = id; return WALK_PTR; @@ -4825,7 +4982,7 @@ error: return -EINVAL; } -int btf_struct_access(struct bpf_verifier_log *log, +int btf_struct_access(struct bpf_verifier_log *log, const struct btf *btf, const struct btf_type *t, int off, int size, enum bpf_access_type atype __maybe_unused, u32 *next_btf_id) @@ -4834,7 +4991,7 @@ int btf_struct_access(struct bpf_verifier_log *log, u32 id; do { - err = btf_struct_walk(log, t, off, size, &id); + err = btf_struct_walk(log, btf, t, off, size, &id); switch (err) { case WALK_PTR: @@ -4850,7 +5007,7 @@ int btf_struct_access(struct bpf_verifier_log *log, * by diving in it. At this point the offset is * aligned with the new type, so set it to 0. */ - t = btf_type_by_id(btf_vmlinux, id); + t = btf_type_by_id(btf, id); off = 0; break; default: @@ -4866,21 +5023,37 @@ int btf_struct_access(struct bpf_verifier_log *log, return -EINVAL; } +/* Check that two BTF types, each specified as an BTF object + id, are exactly + * the same. Trivial ID check is not enough due to module BTFs, because we can + * end up with two different module BTFs, but IDs point to the common type in + * vmlinux BTF. 
+ */ +static bool btf_types_are_same(const struct btf *btf1, u32 id1, + const struct btf *btf2, u32 id2) +{ + if (id1 != id2) + return false; + if (btf1 == btf2) + return true; + return btf_type_by_id(btf1, id1) == btf_type_by_id(btf2, id2); +} + bool btf_struct_ids_match(struct bpf_verifier_log *log, - int off, u32 id, u32 need_type_id) + const struct btf *btf, u32 id, int off, + const struct btf *need_btf, u32 need_type_id) { const struct btf_type *type; int err; /* Are we already done? */ - if (need_type_id == id && off == 0) + if (off == 0 && btf_types_are_same(btf, id, need_btf, need_type_id)) return true; again: - type = btf_type_by_id(btf_vmlinux, id); + type = btf_type_by_id(btf, id); if (!type) return false; - err = btf_struct_walk(log, type, off, 1, &id); + err = btf_struct_walk(log, btf, type, off, 1, &id); if (err != WALK_STRUCT) return false; @@ -4889,7 +5062,7 @@ again: * continue the search with offset 0 in the new * type. */ - if (need_type_id != id) { + if (!btf_types_are_same(btf, id, need_btf, need_type_id)) { off = 0; goto again; } @@ -4909,7 +5082,7 @@ static int __get_type_size(struct btf *btf, u32 btf_id, while (t && btf_type_is_modifier(t)) t = btf_type_by_id(btf, t->type); if (!t) { - *bad_type = btf->types[0]; + *bad_type = btf_type_by_id(btf, 0); return -EINVAL; } if (btf_type_is_ptr(t)) @@ -5487,7 +5660,9 @@ int btf_get_info_by_fd(const struct btf *btf, struct bpf_btf_info info; u32 info_copy, btf_copy; void __user *ubtf; - u32 uinfo_len; + char __user *uname; + u32 uinfo_len, uname_len, name_len; + int ret = 0; uinfo = u64_to_user_ptr(attr->info.info); uinfo_len = attr->info.info_len; @@ -5504,11 +5679,37 @@ int btf_get_info_by_fd(const struct btf *btf, return -EFAULT; info.btf_size = btf->data_size; + info.kernel_btf = btf->kernel_btf; + + uname = u64_to_user_ptr(info.name); + uname_len = info.name_len; + if (!uname ^ !uname_len) + return -EINVAL; + + name_len = strlen(btf->name); + info.name_len = name_len; + + if (uname) { + if (uname_len >= name_len + 1) { + if (copy_to_user(uname, btf->name, name_len + 1)) + return -EFAULT; + } else { + char zero = '\0'; + + if (copy_to_user(uname, btf->name, uname_len - 1)) + return -EFAULT; + if (put_user(zero, uname + uname_len - 1)) + return -EFAULT; + /* let user-space know about too short buffer */ + ret = -ENOSPC; + } + } + if (copy_to_user(uinfo, &info, info_copy) || put_user(info_copy, &uattr->info.info_len)) return -EFAULT; - return 0; + return ret; } int btf_get_fd_by_id(u32 id) @@ -5532,11 +5733,16 @@ int btf_get_fd_by_id(u32 id) return fd; } -u32 btf_id(const struct btf *btf) +u32 btf_obj_id(const struct btf *btf) { return btf->id; } +bool btf_is_kernel(const struct btf *btf) +{ + return btf->kernel_btf; +} + static int btf_id_cmp_func(const void *a, const void *b) { const int *pa = a, *pb = b; @@ -5548,3 +5754,126 @@ bool btf_id_set_contains(const struct btf_id_set *set, u32 id) { return bsearch(&id, set->ids, set->cnt, sizeof(u32), btf_id_cmp_func) != NULL; } + +#ifdef CONFIG_DEBUG_INFO_BTF_MODULES +struct btf_module { + struct list_head list; + struct module *module; + struct btf *btf; + struct bin_attribute *sysfs_attr; +}; + +static LIST_HEAD(btf_modules); +static DEFINE_MUTEX(btf_module_mutex); + +static ssize_t +btf_module_read(struct file *file, struct kobject *kobj, + struct bin_attribute *bin_attr, + char *buf, loff_t off, size_t len) +{ + const struct btf *btf = bin_attr->private; + + memcpy(buf, btf->data + off, len); + return len; +} + +static int btf_module_notify(struct notifier_block *nb, 
unsigned long op, + void *module) +{ + struct btf_module *btf_mod, *tmp; + struct module *mod = module; + struct btf *btf; + int err = 0; + + if (mod->btf_data_size == 0 || + (op != MODULE_STATE_COMING && op != MODULE_STATE_GOING)) + goto out; + + switch (op) { + case MODULE_STATE_COMING: + btf_mod = kzalloc(sizeof(*btf_mod), GFP_KERNEL); + if (!btf_mod) { + err = -ENOMEM; + goto out; + } + btf = btf_parse_module(mod->name, mod->btf_data, mod->btf_data_size); + if (IS_ERR(btf)) { + pr_warn("failed to validate module [%s] BTF: %ld\n", + mod->name, PTR_ERR(btf)); + kfree(btf_mod); + err = PTR_ERR(btf); + goto out; + } + err = btf_alloc_id(btf); + if (err) { + btf_free(btf); + kfree(btf_mod); + goto out; + } + + mutex_lock(&btf_module_mutex); + btf_mod->module = module; + btf_mod->btf = btf; + list_add(&btf_mod->list, &btf_modules); + mutex_unlock(&btf_module_mutex); + + if (IS_ENABLED(CONFIG_SYSFS)) { + struct bin_attribute *attr; + + attr = kzalloc(sizeof(*attr), GFP_KERNEL); + if (!attr) + goto out; + + sysfs_bin_attr_init(attr); + attr->attr.name = btf->name; + attr->attr.mode = 0444; + attr->size = btf->data_size; + attr->private = btf; + attr->read = btf_module_read; + + err = sysfs_create_bin_file(btf_kobj, attr); + if (err) { + pr_warn("failed to register module [%s] BTF in sysfs: %d\n", + mod->name, err); + kfree(attr); + err = 0; + goto out; + } + + btf_mod->sysfs_attr = attr; + } + + break; + case MODULE_STATE_GOING: + mutex_lock(&btf_module_mutex); + list_for_each_entry_safe(btf_mod, tmp, &btf_modules, list) { + if (btf_mod->module != module) + continue; + + list_del(&btf_mod->list); + if (btf_mod->sysfs_attr) + sysfs_remove_bin_file(btf_kobj, btf_mod->sysfs_attr); + btf_put(btf_mod->btf); + kfree(btf_mod->sysfs_attr); + kfree(btf_mod); + break; + } + mutex_unlock(&btf_module_mutex); + break; + } +out: + return notifier_from_errno(err); +} + +static struct notifier_block btf_module_nb = { + .notifier_call = btf_module_notify, +}; + +static int __init btf_module_init(void) +{ + register_module_notifier(&btf_module_nb); + return 0; +} + +fs_initcall(btf_module_init); +#endif /* CONFIG_DEBUG_INFO_BTF_MODULES */ diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 9268d77898b7..261f8692d0d2 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -77,7 +77,7 @@ void *bpf_internal_load_pointer_neg_helper(const struct sk_buff *skb, int k, uns struct bpf_prog *bpf_prog_alloc_no_stats(unsigned int size, gfp_t gfp_extra_flags) { - gfp_t gfp_flags = GFP_KERNEL | __GFP_ZERO | gfp_extra_flags; + gfp_t gfp_flags = GFP_KERNEL_ACCOUNT | __GFP_ZERO | gfp_extra_flags; struct bpf_prog_aux *aux; struct bpf_prog *fp; @@ -86,7 +86,7 @@ struct bpf_prog *bpf_prog_alloc_no_stats(unsigned int size, gfp_t gfp_extra_flag if (fp == NULL) return NULL; - aux = kzalloc(sizeof(*aux), GFP_KERNEL | gfp_extra_flags); + aux = kzalloc(sizeof(*aux), GFP_KERNEL_ACCOUNT | gfp_extra_flags); if (aux == NULL) { vfree(fp); return NULL; @@ -106,7 +106,7 @@ struct bpf_prog *bpf_prog_alloc_no_stats(unsigned int size, gfp_t gfp_extra_flag struct bpf_prog *bpf_prog_alloc(unsigned int size, gfp_t gfp_extra_flags) { - gfp_t gfp_flags = GFP_KERNEL | __GFP_ZERO | gfp_extra_flags; + gfp_t gfp_flags = GFP_KERNEL_ACCOUNT | __GFP_ZERO | gfp_extra_flags; struct bpf_prog *prog; int cpu; @@ -138,7 +138,7 @@ int bpf_prog_alloc_jited_linfo(struct bpf_prog *prog) prog->aux->jited_linfo = kcalloc(prog->aux->nr_linfo, sizeof(*prog->aux->jited_linfo), - GFP_KERNEL | __GFP_NOWARN); + GFP_KERNEL_ACCOUNT | __GFP_NOWARN); if 
(!prog->aux->jited_linfo) return -ENOMEM; @@ -219,25 +219,17 @@ void bpf_prog_free_linfo(struct bpf_prog *prog) struct bpf_prog *bpf_prog_realloc(struct bpf_prog *fp_old, unsigned int size, gfp_t gfp_extra_flags) { - gfp_t gfp_flags = GFP_KERNEL | __GFP_ZERO | gfp_extra_flags; + gfp_t gfp_flags = GFP_KERNEL_ACCOUNT | __GFP_ZERO | gfp_extra_flags; struct bpf_prog *fp; - u32 pages, delta; - int ret; + u32 pages; size = round_up(size, PAGE_SIZE); pages = size / PAGE_SIZE; if (pages <= fp_old->pages) return fp_old; - delta = pages - fp_old->pages; - ret = __bpf_prog_charge(fp_old->aux->user, delta); - if (ret) - return NULL; - fp = __vmalloc(size, gfp_flags); - if (fp == NULL) { - __bpf_prog_uncharge(fp_old->aux->user, delta); - } else { + if (fp) { memcpy(fp, fp_old, fp_old->pages * PAGE_SIZE); fp->pages = pages; fp->aux->prog = fp; @@ -1369,7 +1361,7 @@ u64 __weak bpf_probe_read_kernel(void *dst, u32 size, const void *unsafe_ptr) * * Decode and execute eBPF instructions. */ -static u64 __no_fgcse ___bpf_prog_run(u64 *regs, const struct bpf_insn *insn, u64 *stack) +static u64 ___bpf_prog_run(u64 *regs, const struct bpf_insn *insn, u64 *stack) { #define BPF_INSN_2_LBL(x, y) [BPF_##x | BPF_##y] = &&x##_##y #define BPF_INSN_3_LBL(x, y, z) [BPF_##x | BPF_##y | BPF_##z] = &&x##_##y##_##z @@ -2211,6 +2203,7 @@ const struct bpf_func_proto bpf_get_smp_processor_id_proto __weak; const struct bpf_func_proto bpf_get_numa_node_id_proto __weak; const struct bpf_func_proto bpf_ktime_get_ns_proto __weak; const struct bpf_func_proto bpf_ktime_get_boot_ns_proto __weak; +const struct bpf_func_proto bpf_ktime_get_coarse_ns_proto __weak; const struct bpf_func_proto bpf_get_current_pid_tgid_proto __weak; const struct bpf_func_proto bpf_get_current_uid_gid_proto __weak; diff --git a/kernel/bpf/cpumap.c b/kernel/bpf/cpumap.c index c61a23b564aa..747313698178 100644 --- a/kernel/bpf/cpumap.c +++ b/kernel/bpf/cpumap.c @@ -84,8 +84,6 @@ static struct bpf_map *cpu_map_alloc(union bpf_attr *attr) u32 value_size = attr->value_size; struct bpf_cpu_map *cmap; int err = -ENOMEM; - u64 cost; - int ret; if (!bpf_capable()) return ERR_PTR(-EPERM); @@ -97,7 +95,7 @@ static struct bpf_map *cpu_map_alloc(union bpf_attr *attr) attr->map_flags & ~BPF_F_NUMA_NODE) return ERR_PTR(-EINVAL); - cmap = kzalloc(sizeof(*cmap), GFP_USER); + cmap = kzalloc(sizeof(*cmap), GFP_USER | __GFP_ACCOUNT); if (!cmap) return ERR_PTR(-ENOMEM); @@ -109,26 +107,14 @@ static struct bpf_map *cpu_map_alloc(union bpf_attr *attr) goto free_cmap; } - /* make sure page count doesn't overflow */ - cost = (u64) cmap->map.max_entries * sizeof(struct bpf_cpu_map_entry *); - - /* Notice returns -EPERM on if map size is larger than memlock limit */ - ret = bpf_map_charge_init(&cmap->map.memory, cost); - if (ret) { - err = ret; - goto free_cmap; - } - /* Alloc array for possible remote "destination" CPUs */ cmap->cpu_map = bpf_map_area_alloc(cmap->map.max_entries * sizeof(struct bpf_cpu_map_entry *), cmap->map.numa_node); if (!cmap->cpu_map) - goto free_charge; + goto free_cmap; return &cmap->map; -free_charge: - bpf_map_charge_finish(&cmap->map.memory); free_cmap: kfree(cmap); return ERR_PTR(err); @@ -412,7 +398,8 @@ static int __cpu_map_load_bpf_program(struct bpf_cpu_map_entry *rcpu, int fd) } static struct bpf_cpu_map_entry * -__cpu_map_entry_alloc(struct bpf_cpumap_val *value, u32 cpu, int map_id) +__cpu_map_entry_alloc(struct bpf_map *map, struct bpf_cpumap_val *value, + u32 cpu) { int numa, err, i, fd = value->bpf_prog.fd; gfp_t gfp = GFP_KERNEL | __GFP_NOWARN; 
@@ -422,13 +409,13 @@ __cpu_map_entry_alloc(struct bpf_cpumap_val *value, u32 cpu, int map_id) /* Have map->numa_node, but choose node of redirect target CPU */ numa = cpu_to_node(cpu); - rcpu = kzalloc_node(sizeof(*rcpu), gfp, numa); + rcpu = bpf_map_kmalloc_node(map, sizeof(*rcpu), gfp | __GFP_ZERO, numa); if (!rcpu) return NULL; /* Alloc percpu bulkq */ - rcpu->bulkq = __alloc_percpu_gfp(sizeof(*rcpu->bulkq), - sizeof(void *), gfp); + rcpu->bulkq = bpf_map_alloc_percpu(map, sizeof(*rcpu->bulkq), + sizeof(void *), gfp); if (!rcpu->bulkq) goto free_rcu; @@ -438,7 +425,8 @@ __cpu_map_entry_alloc(struct bpf_cpumap_val *value, u32 cpu, int map_id) } /* Alloc queue */ - rcpu->queue = kzalloc_node(sizeof(*rcpu->queue), gfp, numa); + rcpu->queue = bpf_map_kmalloc_node(map, sizeof(*rcpu->queue), gfp, + numa); if (!rcpu->queue) goto free_bulkq; @@ -447,7 +435,7 @@ __cpu_map_entry_alloc(struct bpf_cpumap_val *value, u32 cpu, int map_id) goto free_queue; rcpu->cpu = cpu; - rcpu->map_id = map_id; + rcpu->map_id = map->id; rcpu->value.qsize = value->qsize; if (fd > 0 && __cpu_map_load_bpf_program(rcpu, fd)) @@ -455,7 +443,8 @@ __cpu_map_entry_alloc(struct bpf_cpumap_val *value, u32 cpu, int map_id) /* Setup kthread */ rcpu->kthread = kthread_create_on_node(cpu_map_kthread_run, rcpu, numa, - "cpumap/%d/map:%d", cpu, map_id); + "cpumap/%d/map:%d", cpu, + map->id); if (IS_ERR(rcpu->kthread)) goto free_prog; @@ -571,7 +560,7 @@ static int cpu_map_update_elem(struct bpf_map *map, void *key, void *value, rcpu = NULL; /* Same as deleting */ } else { /* Updating qsize cause re-allocation of bpf_cpu_map_entry */ - rcpu = __cpu_map_entry_alloc(&cpumap_value, key_cpu, map->id); + rcpu = __cpu_map_entry_alloc(map, &cpumap_value, key_cpu); if (!rcpu) return -ENOMEM; rcpu->cmap = cmap; diff --git a/kernel/bpf/devmap.c b/kernel/bpf/devmap.c index 2b5ca93c17de..f6e9c68afdd4 100644 --- a/kernel/bpf/devmap.c +++ b/kernel/bpf/devmap.c @@ -109,8 +109,6 @@ static inline struct hlist_head *dev_map_index_hash(struct bpf_dtab *dtab, static int dev_map_init_map(struct bpf_dtab *dtab, union bpf_attr *attr) { u32 valsize = attr->value_size; - u64 cost = 0; - int err; /* check sanity of attributes. 
2 value sizes supported: * 4 bytes: ifindex @@ -135,21 +133,13 @@ static int dev_map_init_map(struct bpf_dtab *dtab, union bpf_attr *attr) if (!dtab->n_buckets) /* Overflow check */ return -EINVAL; - cost += (u64) sizeof(struct hlist_head) * dtab->n_buckets; - } else { - cost += (u64) dtab->map.max_entries * sizeof(struct bpf_dtab_netdev *); } - /* if map size is larger than memlock limit, reject it */ - err = bpf_map_charge_init(&dtab->map.memory, cost); - if (err) - return -EINVAL; - if (attr->map_type == BPF_MAP_TYPE_DEVMAP_HASH) { dtab->dev_index_head = dev_map_create_hash(dtab->n_buckets, dtab->map.numa_node); if (!dtab->dev_index_head) - goto free_charge; + return -ENOMEM; spin_lock_init(&dtab->index_lock); } else { @@ -157,14 +147,10 @@ static int dev_map_init_map(struct bpf_dtab *dtab, union bpf_attr *attr) sizeof(struct bpf_dtab_netdev *), dtab->map.numa_node); if (!dtab->netdev_map) - goto free_charge; + return -ENOMEM; } return 0; - -free_charge: - bpf_map_charge_finish(&dtab->map.memory); - return -ENOMEM; } static struct bpf_map *dev_map_alloc(union bpf_attr *attr) @@ -175,7 +161,7 @@ static struct bpf_map *dev_map_alloc(union bpf_attr *attr) if (!capable(CAP_NET_ADMIN)) return ERR_PTR(-EPERM); - dtab = kzalloc(sizeof(*dtab), GFP_USER); + dtab = kzalloc(sizeof(*dtab), GFP_USER | __GFP_ACCOUNT); if (!dtab) return ERR_PTR(-ENOMEM); @@ -602,8 +588,9 @@ static struct bpf_dtab_netdev *__dev_map_alloc_node(struct net *net, struct bpf_prog *prog = NULL; struct bpf_dtab_netdev *dev; - dev = kmalloc_node(sizeof(*dev), GFP_ATOMIC | __GFP_NOWARN, - dtab->map.numa_node); + dev = bpf_map_kmalloc_node(&dtab->map, sizeof(*dev), + GFP_ATOMIC | __GFP_NOWARN, + dtab->map.numa_node); if (!dev) return ERR_PTR(-ENOMEM); diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c index 1815e97d4c9c..7e848200cd26 100644 --- a/kernel/bpf/hashtab.c +++ b/kernel/bpf/hashtab.c @@ -86,6 +86,9 @@ struct bucket { }; }; +#define HASHTAB_MAP_LOCK_COUNT 8 +#define HASHTAB_MAP_LOCK_MASK (HASHTAB_MAP_LOCK_COUNT - 1) + struct bpf_htab { struct bpf_map map; struct bucket *buckets; @@ -99,6 +102,8 @@ struct bpf_htab { u32 n_buckets; /* number of hash buckets */ u32 elem_size; /* size of each element in bytes */ u32 hashrnd; + struct lock_class_key lockdep_key; + int __percpu *map_locked[HASHTAB_MAP_LOCK_COUNT]; }; /* each htab element is struct htab_elem + key + value */ @@ -138,33 +143,53 @@ static void htab_init_buckets(struct bpf_htab *htab) for (i = 0; i < htab->n_buckets; i++) { INIT_HLIST_NULLS_HEAD(&htab->buckets[i].head, i); - if (htab_use_raw_lock(htab)) + if (htab_use_raw_lock(htab)) { raw_spin_lock_init(&htab->buckets[i].raw_lock); - else + lockdep_set_class(&htab->buckets[i].raw_lock, + &htab->lockdep_key); + } else { spin_lock_init(&htab->buckets[i].lock); + lockdep_set_class(&htab->buckets[i].lock, + &htab->lockdep_key); + } } } -static inline unsigned long htab_lock_bucket(const struct bpf_htab *htab, - struct bucket *b) +static inline int htab_lock_bucket(const struct bpf_htab *htab, + struct bucket *b, u32 hash, + unsigned long *pflags) { unsigned long flags; + hash = hash & HASHTAB_MAP_LOCK_MASK; + + migrate_disable(); + if (unlikely(__this_cpu_inc_return(*(htab->map_locked[hash])) != 1)) { + __this_cpu_dec(*(htab->map_locked[hash])); + migrate_enable(); + return -EBUSY; + } + if (htab_use_raw_lock(htab)) raw_spin_lock_irqsave(&b->raw_lock, flags); else spin_lock_irqsave(&b->lock, flags); - return flags; + *pflags = flags; + + return 0; } static inline void htab_unlock_bucket(const struct bpf_htab 
*htab, - struct bucket *b, + struct bucket *b, u32 hash, unsigned long flags) { + hash = hash & HASHTAB_MAP_LOCK_MASK; if (htab_use_raw_lock(htab)) raw_spin_unlock_irqrestore(&b->raw_lock, flags); else spin_unlock_irqrestore(&b->lock, flags); + __this_cpu_dec(*(htab->map_locked[hash])); + migrate_enable(); } static bool htab_lru_map_delete_node(void *arg, struct bpf_lru_node *node); @@ -199,7 +224,7 @@ static void *fd_htab_map_get_ptr(const struct bpf_map *map, struct htab_elem *l) static struct htab_elem *get_htab_elem(struct bpf_htab *htab, int i) { - return (struct htab_elem *) (htab->elems + i * htab->elem_size); + return (struct htab_elem *) (htab->elems + i * (u64)htab->elem_size); } static void htab_free_elems(struct bpf_htab *htab) @@ -255,7 +280,7 @@ static int prealloc_init(struct bpf_htab *htab) if (!htab_is_percpu(htab) && !htab_is_lru(htab)) num_entries += num_possible_cpus(); - htab->elems = bpf_map_area_alloc(htab->elem_size * num_entries, + htab->elems = bpf_map_area_alloc((u64)htab->elem_size * num_entries, htab->map.numa_node); if (!htab->elems) return -ENOMEM; @@ -267,7 +292,8 @@ static int prealloc_init(struct bpf_htab *htab) u32 size = round_up(htab->map.value_size, 8); void __percpu *pptr; - pptr = __alloc_percpu_gfp(size, 8, GFP_USER | __GFP_NOWARN); + pptr = bpf_map_alloc_percpu(&htab->map, size, 8, + GFP_USER | __GFP_NOWARN); if (!pptr) goto free_elems; htab_elem_set_ptr(get_htab_elem(htab, i), htab->map.key_size, @@ -321,8 +347,8 @@ static int alloc_extra_elems(struct bpf_htab *htab) struct pcpu_freelist_node *l; int cpu; - pptr = __alloc_percpu_gfp(sizeof(struct htab_elem *), 8, - GFP_USER | __GFP_NOWARN); + pptr = bpf_map_alloc_percpu(&htab->map, sizeof(struct htab_elem *), 8, + GFP_USER | __GFP_NOWARN); if (!pptr) return -ENOMEM; @@ -390,17 +416,11 @@ static int htab_map_alloc_check(union bpf_attr *attr) attr->value_size == 0) return -EINVAL; - if (attr->key_size > MAX_BPF_STACK) - /* eBPF programs initialize keys on stack, so they cannot be - * larger than max stack size - */ - return -E2BIG; - - if (attr->value_size >= KMALLOC_MAX_SIZE - - MAX_BPF_STACK - sizeof(struct htab_elem)) - /* if value_size is bigger, the user space won't be able to - * access the elements via bpf syscall. This check also makes - * sure that the elem_size doesn't overflow and it's + if ((u64)attr->key_size + attr->value_size >= KMALLOC_MAX_SIZE - + sizeof(struct htab_elem)) + /* if key_size + value_size is bigger, the user space won't be + * able to access the elements via bpf syscall. 
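 * (Editor's note, not part of the patch: with the per-CPU map_locked
 * counters introduced above, htab_lock_bucket() now fails with -EBUSY
 * instead of deadlocking when a program on the same CPU -- e.g. a
 * tracing program interrupting another map operation -- re-enters a
 * bucket lock of the same class, so the callers in the hunks below are
 * all updated to check its return value.  A minimal sketch of the
 * resulting calling pattern:
 *
 *	unsigned long flags;
 *	int ret;
 *
 *	ret = htab_lock_bucket(htab, b, hash, &flags);
 *	if (ret)
 *		return ret;	(lock class already held on this CPU)
 *	... update the bucket ...
 *	htab_unlock_bucket(htab, b, hash, flags);
 *
 * The hash is masked with HASHTAB_MAP_LOCK_MASK, so one of eight
 * per-CPU counters guards each bucket-lock class.)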
This check + * also makes sure that the elem_size doesn't overflow and it's * kmalloc-able later in htab_map_update_elem() */ return -E2BIG; @@ -422,13 +442,14 @@ static struct bpf_map *htab_map_alloc(union bpf_attr *attr) bool percpu_lru = (attr->map_flags & BPF_F_NO_COMMON_LRU); bool prealloc = !(attr->map_flags & BPF_F_NO_PREALLOC); struct bpf_htab *htab; - u64 cost; - int err; + int err, i; - htab = kzalloc(sizeof(*htab), GFP_USER); + htab = kzalloc(sizeof(*htab), GFP_USER | __GFP_ACCOUNT); if (!htab) return ERR_PTR(-ENOMEM); + lockdep_register_key(&htab->lockdep_key); + bpf_map_init_from_attr(&htab->map, attr); if (percpu_lru) { @@ -459,26 +480,21 @@ static struct bpf_map *htab_map_alloc(union bpf_attr *attr) htab->n_buckets > U32_MAX / sizeof(struct bucket)) goto free_htab; - cost = (u64) htab->n_buckets * sizeof(struct bucket) + - (u64) htab->elem_size * htab->map.max_entries; - - if (percpu) - cost += (u64) round_up(htab->map.value_size, 8) * - num_possible_cpus() * htab->map.max_entries; - else - cost += (u64) htab->elem_size * num_possible_cpus(); - - /* if map size is larger than memlock limit, reject it */ - err = bpf_map_charge_init(&htab->map.memory, cost); - if (err) - goto free_htab; - err = -ENOMEM; htab->buckets = bpf_map_area_alloc(htab->n_buckets * sizeof(struct bucket), htab->map.numa_node); if (!htab->buckets) - goto free_charge; + goto free_htab; + + for (i = 0; i < HASHTAB_MAP_LOCK_COUNT; i++) { + htab->map_locked[i] = bpf_map_alloc_percpu(&htab->map, + sizeof(int), + sizeof(int), + GFP_USER); + if (!htab->map_locked[i]) + goto free_map_locked; + } if (htab->map.map_flags & BPF_F_ZERO_SEED) htab->hashrnd = 0; @@ -490,7 +506,7 @@ static struct bpf_map *htab_map_alloc(union bpf_attr *attr) if (prealloc) { err = prealloc_init(htab); if (err) - goto free_buckets; + goto free_map_locked; if (!percpu && !lru) { /* lru itself can remove the least used element, so @@ -506,11 +522,12 @@ static struct bpf_map *htab_map_alloc(union bpf_attr *attr) free_prealloc: prealloc_destroy(htab); -free_buckets: +free_map_locked: + for (i = 0; i < HASHTAB_MAP_LOCK_COUNT; i++) + free_percpu(htab->map_locked[i]); bpf_map_area_free(htab->buckets); -free_charge: - bpf_map_charge_finish(&htab->map.memory); free_htab: + lockdep_unregister_key(&htab->lockdep_key); kfree(htab); return ERR_PTR(err); } @@ -687,12 +704,15 @@ static bool htab_lru_map_delete_node(void *arg, struct bpf_lru_node *node) struct hlist_nulls_node *n; unsigned long flags; struct bucket *b; + int ret; tgt_l = container_of(node, struct htab_elem, lru_node); b = __select_bucket(htab, tgt_l->hash); head = &b->head; - flags = htab_lock_bucket(htab, b); + ret = htab_lock_bucket(htab, b, tgt_l->hash, &flags); + if (ret) + return false; hlist_nulls_for_each_entry_rcu(l, n, head, hash_node) if (l == tgt_l) { @@ -700,7 +720,7 @@ static bool htab_lru_map_delete_node(void *arg, struct bpf_lru_node *node) break; } - htab_unlock_bucket(htab, b, flags); + htab_unlock_bucket(htab, b, tgt_l->hash, flags); return l == tgt_l; } @@ -821,6 +841,32 @@ static void pcpu_copy_value(struct bpf_htab *htab, void __percpu *pptr, } } +static void pcpu_init_value(struct bpf_htab *htab, void __percpu *pptr, + void *value, bool onallcpus) +{ + /* When using prealloc and not setting the initial value on all cpus, + * zero-fill element values for other cpus (just as what happens when + * not using prealloc). 
Otherwise, bpf program has no way to ensure + * known initial values for cpus other than current one + * (onallcpus=false always when coming from bpf prog). + */ + if (htab_is_prealloc(htab) && !onallcpus) { + u32 size = round_up(htab->map.value_size, 8); + int current_cpu = raw_smp_processor_id(); + int cpu; + + for_each_possible_cpu(cpu) { + if (cpu == current_cpu) + bpf_long_memcpy(per_cpu_ptr(pptr, cpu), value, + size); + else + memset(per_cpu_ptr(pptr, cpu), 0, size); + } + } else { + pcpu_copy_value(htab, pptr, value, onallcpus); + } +} + static bool fd_htab_map_needs_adjust(const struct bpf_htab *htab) { return htab->map.map_type == BPF_MAP_TYPE_HASH_OF_MAPS && @@ -865,8 +911,9 @@ static struct htab_elem *alloc_htab_elem(struct bpf_htab *htab, void *key, l_new = ERR_PTR(-E2BIG); goto dec_count; } - l_new = kmalloc_node(htab->elem_size, GFP_ATOMIC | __GFP_NOWARN, - htab->map.numa_node); + l_new = bpf_map_kmalloc_node(&htab->map, htab->elem_size, + GFP_ATOMIC | __GFP_NOWARN, + htab->map.numa_node); if (!l_new) { l_new = ERR_PTR(-ENOMEM); goto dec_count; @@ -882,8 +929,8 @@ static struct htab_elem *alloc_htab_elem(struct bpf_htab *htab, void *key, pptr = htab_elem_get_ptr(l_new, key_size); } else { /* alloc_percpu zero-fills */ - pptr = __alloc_percpu_gfp(size, 8, - GFP_ATOMIC | __GFP_NOWARN); + pptr = bpf_map_alloc_percpu(&htab->map, size, 8, + GFP_ATOMIC | __GFP_NOWARN); if (!pptr) { kfree(l_new); l_new = ERR_PTR(-ENOMEM); @@ -891,7 +938,7 @@ static struct htab_elem *alloc_htab_elem(struct bpf_htab *htab, void *key, } } - pcpu_copy_value(htab, pptr, value, onallcpus); + pcpu_init_value(htab, pptr, value, onallcpus); if (!prealloc) htab_elem_set_ptr(l_new, key_size, pptr); @@ -972,7 +1019,9 @@ static int htab_map_update_elem(struct bpf_map *map, void *key, void *value, */ } - flags = htab_lock_bucket(htab, b); + ret = htab_lock_bucket(htab, b, hash, &flags); + if (ret) + return ret; l_old = lookup_elem_raw(head, hash, key, key_size); @@ -1013,7 +1062,7 @@ static int htab_map_update_elem(struct bpf_map *map, void *key, void *value, } ret = 0; err: - htab_unlock_bucket(htab, b, flags); + htab_unlock_bucket(htab, b, hash, flags); return ret; } @@ -1051,7 +1100,9 @@ static int htab_lru_map_update_elem(struct bpf_map *map, void *key, void *value, return -ENOMEM; memcpy(l_new->key + round_up(map->key_size, 8), value, map->value_size); - flags = htab_lock_bucket(htab, b); + ret = htab_lock_bucket(htab, b, hash, &flags); + if (ret) + return ret; l_old = lookup_elem_raw(head, hash, key, key_size); @@ -1070,7 +1121,7 @@ static int htab_lru_map_update_elem(struct bpf_map *map, void *key, void *value, ret = 0; err: - htab_unlock_bucket(htab, b, flags); + htab_unlock_bucket(htab, b, hash, flags); if (ret) bpf_lru_push_free(&htab->lru, &l_new->lru_node); @@ -1105,7 +1156,9 @@ static int __htab_percpu_map_update_elem(struct bpf_map *map, void *key, b = __select_bucket(htab, hash); head = &b->head; - flags = htab_lock_bucket(htab, b); + ret = htab_lock_bucket(htab, b, hash, &flags); + if (ret) + return ret; l_old = lookup_elem_raw(head, hash, key, key_size); @@ -1128,7 +1181,7 @@ static int __htab_percpu_map_update_elem(struct bpf_map *map, void *key, } ret = 0; err: - htab_unlock_bucket(htab, b, flags); + htab_unlock_bucket(htab, b, hash, flags); return ret; } @@ -1168,7 +1221,9 @@ static int __htab_lru_percpu_map_update_elem(struct bpf_map *map, void *key, return -ENOMEM; } - flags = htab_lock_bucket(htab, b); + ret = htab_lock_bucket(htab, b, hash, &flags); + if (ret) + return ret; l_old = 
lookup_elem_raw(head, hash, key, key_size); @@ -1183,14 +1238,14 @@ static int __htab_lru_percpu_map_update_elem(struct bpf_map *map, void *key, pcpu_copy_value(htab, htab_elem_get_ptr(l_old, key_size), value, onallcpus); } else { - pcpu_copy_value(htab, htab_elem_get_ptr(l_new, key_size), + pcpu_init_value(htab, htab_elem_get_ptr(l_new, key_size), value, onallcpus); hlist_nulls_add_head_rcu(&l_new->hash_node, head); l_new = NULL; } ret = 0; err: - htab_unlock_bucket(htab, b, flags); + htab_unlock_bucket(htab, b, hash, flags); if (l_new) bpf_lru_push_free(&htab->lru, &l_new->lru_node); return ret; @@ -1218,7 +1273,7 @@ static int htab_map_delete_elem(struct bpf_map *map, void *key) struct htab_elem *l; unsigned long flags; u32 hash, key_size; - int ret = -ENOENT; + int ret; WARN_ON_ONCE(!rcu_read_lock_held() && !rcu_read_lock_trace_held()); @@ -1228,17 +1283,20 @@ static int htab_map_delete_elem(struct bpf_map *map, void *key) b = __select_bucket(htab, hash); head = &b->head; - flags = htab_lock_bucket(htab, b); + ret = htab_lock_bucket(htab, b, hash, &flags); + if (ret) + return ret; l = lookup_elem_raw(head, hash, key, key_size); if (l) { hlist_nulls_del_rcu(&l->hash_node); free_htab_elem(htab, l); - ret = 0; + } else { + ret = -ENOENT; } - htab_unlock_bucket(htab, b, flags); + htab_unlock_bucket(htab, b, hash, flags); return ret; } @@ -1250,7 +1308,7 @@ static int htab_lru_map_delete_elem(struct bpf_map *map, void *key) struct htab_elem *l; unsigned long flags; u32 hash, key_size; - int ret = -ENOENT; + int ret; WARN_ON_ONCE(!rcu_read_lock_held() && !rcu_read_lock_trace_held()); @@ -1260,16 +1318,18 @@ static int htab_lru_map_delete_elem(struct bpf_map *map, void *key) b = __select_bucket(htab, hash); head = &b->head; - flags = htab_lock_bucket(htab, b); + ret = htab_lock_bucket(htab, b, hash, &flags); + if (ret) + return ret; l = lookup_elem_raw(head, hash, key, key_size); - if (l) { + if (l) hlist_nulls_del_rcu(&l->hash_node); - ret = 0; - } + else + ret = -ENOENT; - htab_unlock_bucket(htab, b, flags); + htab_unlock_bucket(htab, b, hash, flags); if (l) bpf_lru_push_free(&htab->lru, &l->lru_node); return ret; @@ -1295,6 +1355,7 @@ static void delete_all_elements(struct bpf_htab *htab) static void htab_map_free(struct bpf_map *map) { struct bpf_htab *htab = container_of(map, struct bpf_htab, map); + int i; /* bpf_free_used_maps() or close(map_fd) will trigger this map_free callback. * bpf_free_used_maps() is called after bpf prog is no longer executing. @@ -1312,6 +1373,9 @@ static void htab_map_free(struct bpf_map *map) free_percpu(htab->extra_elems); bpf_map_area_free(htab->buckets); + for (i = 0; i < HASHTAB_MAP_LOCK_COUNT; i++) + free_percpu(htab->map_locked[i]); + lockdep_unregister_key(&htab->lockdep_key); kfree(htab); } @@ -1348,7 +1412,7 @@ __htab_map_lookup_and_delete_batch(struct bpf_map *map, void *keys = NULL, *values = NULL, *value, *dst_key, *dst_val; void __user *uvalues = u64_to_user_ptr(attr->batch.values); void __user *ukeys = u64_to_user_ptr(attr->batch.keys); - void *ubatch = u64_to_user_ptr(attr->batch.in_batch); + void __user *ubatch = u64_to_user_ptr(attr->batch.in_batch); u32 batch, max_count, size, bucket_size; struct htab_elem *node_to_free = NULL; u64 elem_map_flags, map_flags; @@ -1415,8 +1479,11 @@ again_nocopy: b = &htab->buckets[batch]; head = &b->head; /* do not grab the lock unless need it (bucket_cnt > 0). 
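 * (Editor's note, not part of the patch: the batch-lookup path below is
 * one of the converted htab_lock_bucket() callers; it passes the bucket
 * index "batch" as the hash that selects the per-CPU lock-class
 * counter, and on -EBUSY it branches to next_batch instead of risking
 * the re-entrancy deadlock the map_locked counters guard against.)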
*/ - if (locked) - flags = htab_lock_bucket(htab, b); + if (locked) { + ret = htab_lock_bucket(htab, b, batch, &flags); + if (ret) + goto next_batch; + } bucket_cnt = 0; hlist_nulls_for_each_entry_rcu(l, n, head, hash_node) @@ -1433,7 +1500,7 @@ again_nocopy: /* Note that since bucket_cnt > 0 here, it is implicit * that the locked was grabbed, so release it. */ - htab_unlock_bucket(htab, b, flags); + htab_unlock_bucket(htab, b, batch, flags); rcu_read_unlock(); bpf_enable_instrumentation(); goto after_loop; @@ -1444,7 +1511,7 @@ again_nocopy: /* Note that since bucket_cnt > 0 here, it is implicit * that the locked was grabbed, so release it. */ - htab_unlock_bucket(htab, b, flags); + htab_unlock_bucket(htab, b, batch, flags); rcu_read_unlock(); bpf_enable_instrumentation(); kvfree(keys); @@ -1497,7 +1564,7 @@ again_nocopy: dst_val += value_size; } - htab_unlock_bucket(htab, b, flags); + htab_unlock_bucket(htab, b, batch, flags); locked = false; while (node_to_free) { diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index 25520f5eeaf6..bd8a3183d030 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -167,6 +167,17 @@ const struct bpf_func_proto bpf_ktime_get_boot_ns_proto = { .ret_type = RET_INTEGER, }; +BPF_CALL_0(bpf_ktime_get_coarse_ns) +{ + return ktime_get_coarse_ns(); +} + +const struct bpf_func_proto bpf_ktime_get_coarse_ns_proto = { + .func = bpf_ktime_get_coarse_ns, + .gpl_only = false, + .ret_type = RET_INTEGER, +}; + BPF_CALL_0(bpf_get_current_pid_tgid) { struct task_struct *task = current; @@ -685,6 +696,8 @@ bpf_base_func_proto(enum bpf_func_id func_id) return &bpf_ktime_get_ns_proto; case BPF_FUNC_ktime_get_boot_ns: return &bpf_ktime_get_boot_ns_proto; + case BPF_FUNC_ktime_get_coarse_ns: + return &bpf_ktime_get_coarse_ns_proto; case BPF_FUNC_ringbuf_output: return &bpf_ringbuf_output_proto; case BPF_FUNC_ringbuf_reserve: @@ -717,9 +730,9 @@ bpf_base_func_proto(enum bpf_func_id func_id) return &bpf_snprintf_btf_proto; case BPF_FUNC_jiffies64: return &bpf_jiffies64_proto; - case BPF_FUNC_bpf_per_cpu_ptr: + case BPF_FUNC_per_cpu_ptr: return &bpf_per_cpu_ptr_proto; - case BPF_FUNC_bpf_this_cpu_ptr: + case BPF_FUNC_this_cpu_ptr: return &bpf_this_cpu_ptr_proto; default: break; diff --git a/kernel/bpf/local_storage.c b/kernel/bpf/local_storage.c index 571bb351ed3b..2d4f9ac12377 100644 --- a/kernel/bpf/local_storage.c +++ b/kernel/bpf/local_storage.c @@ -164,10 +164,10 @@ static int cgroup_storage_update_elem(struct bpf_map *map, void *key, return 0; } - new = kmalloc_node(sizeof(struct bpf_storage_buffer) + - map->value_size, - __GFP_ZERO | GFP_ATOMIC | __GFP_NOWARN, - map->numa_node); + new = bpf_map_kmalloc_node(map, sizeof(struct bpf_storage_buffer) + + map->value_size, + __GFP_ZERO | GFP_ATOMIC | __GFP_NOWARN, + map->numa_node); if (!new) return -ENOMEM; @@ -287,8 +287,6 @@ static struct bpf_map *cgroup_storage_map_alloc(union bpf_attr *attr) { int numa_node = bpf_map_attr_numa_node(attr); struct bpf_cgroup_storage_map *map; - struct bpf_map_memory mem; - int ret; if (attr->key_size != sizeof(struct bpf_cgroup_storage_key) && attr->key_size != sizeof(__u64)) @@ -308,18 +306,10 @@ static struct bpf_map *cgroup_storage_map_alloc(union bpf_attr *attr) /* max_entries is not used and enforced to be 0 */ return ERR_PTR(-EINVAL); - ret = bpf_map_charge_init(&mem, sizeof(struct bpf_cgroup_storage_map)); - if (ret < 0) - return ERR_PTR(ret); - map = kmalloc_node(sizeof(struct bpf_cgroup_storage_map), - __GFP_ZERO | GFP_USER, numa_node); - if (!map) { - 
bpf_map_charge_finish(&mem); + __GFP_ZERO | GFP_USER | __GFP_ACCOUNT, numa_node); + if (!map) return ERR_PTR(-ENOMEM); - } - - bpf_map_charge_move(&map->map.memory, &mem); /* copy mandatory map attributes */ bpf_map_init_from_attr(&map->map, attr); @@ -496,9 +486,9 @@ static size_t bpf_cgroup_storage_calculate_size(struct bpf_map *map, u32 *pages) struct bpf_cgroup_storage *bpf_cgroup_storage_alloc(struct bpf_prog *prog, enum bpf_cgroup_storage_type stype) { + const gfp_t gfp = __GFP_ZERO | GFP_USER; struct bpf_cgroup_storage *storage; struct bpf_map *map; - gfp_t flags; size_t size; u32 pages; @@ -508,23 +498,19 @@ struct bpf_cgroup_storage *bpf_cgroup_storage_alloc(struct bpf_prog *prog, size = bpf_cgroup_storage_calculate_size(map, &pages); - if (bpf_map_charge_memlock(map, pages)) - return ERR_PTR(-EPERM); - - storage = kmalloc_node(sizeof(struct bpf_cgroup_storage), - __GFP_ZERO | GFP_USER, map->numa_node); + storage = bpf_map_kmalloc_node(map, sizeof(struct bpf_cgroup_storage), + gfp, map->numa_node); if (!storage) goto enomem; - flags = __GFP_ZERO | GFP_USER; - if (stype == BPF_CGROUP_STORAGE_SHARED) { - storage->buf = kmalloc_node(size, flags, map->numa_node); + storage->buf = bpf_map_kmalloc_node(map, size, gfp, + map->numa_node); if (!storage->buf) goto enomem; check_and_init_map_lock(map, storage->buf->data); } else { - storage->percpu_buf = __alloc_percpu_gfp(size, 8, flags); + storage->percpu_buf = bpf_map_alloc_percpu(map, size, 8, gfp); if (!storage->percpu_buf) goto enomem; } @@ -534,7 +520,6 @@ struct bpf_cgroup_storage *bpf_cgroup_storage_alloc(struct bpf_prog *prog, return storage; enomem: - bpf_map_uncharge_memlock(map, pages); kfree(storage); return ERR_PTR(-ENOMEM); } @@ -561,16 +546,11 @@ void bpf_cgroup_storage_free(struct bpf_cgroup_storage *storage) { enum bpf_cgroup_storage_type stype; struct bpf_map *map; - u32 pages; if (!storage) return; map = &storage->map->map; - - bpf_cgroup_storage_calculate_size(map, &pages); - bpf_map_uncharge_memlock(map, pages); - stype = cgroup_storage_type(map); if (stype == BPF_CGROUP_STORAGE_SHARED) call_rcu(&storage->rcu, free_shared_cgroup_storage_rcu); diff --git a/kernel/bpf/lpm_trie.c b/kernel/bpf/lpm_trie.c index 00e32f2ec3e6..cec792a17e5f 100644 --- a/kernel/bpf/lpm_trie.c +++ b/kernel/bpf/lpm_trie.c @@ -282,8 +282,8 @@ static struct lpm_trie_node *lpm_trie_node_alloc(const struct lpm_trie *trie, if (value) size += trie->map.value_size; - node = kmalloc_node(size, GFP_ATOMIC | __GFP_NOWARN, - trie->map.numa_node); + node = bpf_map_kmalloc_node(&trie->map, size, GFP_ATOMIC | __GFP_NOWARN, + trie->map.numa_node); if (!node) return NULL; @@ -540,8 +540,6 @@ out: static struct bpf_map *trie_alloc(union bpf_attr *attr) { struct lpm_trie *trie; - u64 cost = sizeof(*trie), cost_per_node; - int ret; if (!bpf_capable()) return ERR_PTR(-EPERM); @@ -557,7 +555,7 @@ static struct bpf_map *trie_alloc(union bpf_attr *attr) attr->value_size > LPM_VAL_SIZE_MAX) return ERR_PTR(-EINVAL); - trie = kzalloc(sizeof(*trie), GFP_USER | __GFP_NOWARN); + trie = kzalloc(sizeof(*trie), GFP_USER | __GFP_NOWARN | __GFP_ACCOUNT); if (!trie) return ERR_PTR(-ENOMEM); @@ -567,20 +565,9 @@ static struct bpf_map *trie_alloc(union bpf_attr *attr) offsetof(struct bpf_lpm_trie_key, data); trie->max_prefixlen = trie->data_size * 8; - cost_per_node = sizeof(struct lpm_trie_node) + - attr->value_size + trie->data_size; - cost += (u64) attr->max_entries * cost_per_node; - - ret = bpf_map_charge_init(&trie->map.memory, cost); - if (ret) - goto out_err; - 
spin_lock_init(&trie->lock); return &trie->map; -out_err: - kfree(trie); - return ERR_PTR(ret); } static void trie_free(struct bpf_map *map) diff --git a/kernel/bpf/preload/Kconfig b/kernel/bpf/preload/Kconfig index ace49111d3a3..26bced262473 100644 --- a/kernel/bpf/preload/Kconfig +++ b/kernel/bpf/preload/Kconfig @@ -6,6 +6,7 @@ config USERMODE_DRIVER menuconfig BPF_PRELOAD bool "Preload BPF file system with kernel specific program and map iterators" depends on BPF + depends on BPF_SYSCALL # The dependency on !COMPILE_TEST prevents it from being enabled # in allmodconfig or allyesconfig configurations depends on !COMPILE_TEST diff --git a/kernel/bpf/queue_stack_maps.c b/kernel/bpf/queue_stack_maps.c index 0ee2347ba510..f9c734aaa990 100644 --- a/kernel/bpf/queue_stack_maps.c +++ b/kernel/bpf/queue_stack_maps.c @@ -66,29 +66,21 @@ static int queue_stack_map_alloc_check(union bpf_attr *attr) static struct bpf_map *queue_stack_map_alloc(union bpf_attr *attr) { - int ret, numa_node = bpf_map_attr_numa_node(attr); - struct bpf_map_memory mem = {0}; + int numa_node = bpf_map_attr_numa_node(attr); struct bpf_queue_stack *qs; - u64 size, queue_size, cost; + u64 size, queue_size; size = (u64) attr->max_entries + 1; - cost = queue_size = sizeof(*qs) + size * attr->value_size; - - ret = bpf_map_charge_init(&mem, cost); - if (ret < 0) - return ERR_PTR(ret); + queue_size = sizeof(*qs) + size * attr->value_size; qs = bpf_map_area_alloc(queue_size, numa_node); - if (!qs) { - bpf_map_charge_finish(&mem); + if (!qs) return ERR_PTR(-ENOMEM); - } memset(qs, 0, sizeof(*qs)); bpf_map_init_from_attr(&qs->map, attr); - bpf_map_charge_move(&qs->map.memory, &mem); qs->size = size; raw_spin_lock_init(&qs->lock); diff --git a/kernel/bpf/reuseport_array.c b/kernel/bpf/reuseport_array.c index a55cd542f2ce..4838922f723d 100644 --- a/kernel/bpf/reuseport_array.c +++ b/kernel/bpf/reuseport_array.c @@ -150,9 +150,8 @@ static void reuseport_array_free(struct bpf_map *map) static struct bpf_map *reuseport_array_alloc(union bpf_attr *attr) { - int err, numa_node = bpf_map_attr_numa_node(attr); + int numa_node = bpf_map_attr_numa_node(attr); struct reuseport_array *array; - struct bpf_map_memory mem; u64 array_size; if (!bpf_capable()) @@ -161,20 +160,13 @@ static struct bpf_map *reuseport_array_alloc(union bpf_attr *attr) array_size = sizeof(*array); array_size += (u64)attr->max_entries * sizeof(struct sock *); - err = bpf_map_charge_init(&mem, array_size); - if (err) - return ERR_PTR(err); - /* allocate all map elements and zero-initialize them */ array = bpf_map_area_alloc(array_size, numa_node); - if (!array) { - bpf_map_charge_finish(&mem); + if (!array) return ERR_PTR(-ENOMEM); - } /* copy mandatory map attributes */ bpf_map_init_from_attr(&array->map, attr); - bpf_map_charge_move(&array->map.memory, &mem); return &array->map; } diff --git a/kernel/bpf/ringbuf.c b/kernel/bpf/ringbuf.c index 31cb04a4dd2d..f25b719ac786 100644 --- a/kernel/bpf/ringbuf.c +++ b/kernel/bpf/ringbuf.c @@ -48,7 +48,6 @@ struct bpf_ringbuf { struct bpf_ringbuf_map { struct bpf_map map; - struct bpf_map_memory memory; struct bpf_ringbuf *rb; }; @@ -60,8 +59,8 @@ struct bpf_ringbuf_hdr { static struct bpf_ringbuf *bpf_ringbuf_area_alloc(size_t data_sz, int numa_node) { - const gfp_t flags = GFP_KERNEL | __GFP_RETRY_MAYFAIL | __GFP_NOWARN | - __GFP_ZERO; + const gfp_t flags = GFP_KERNEL_ACCOUNT | __GFP_RETRY_MAYFAIL | + __GFP_NOWARN | __GFP_ZERO; int nr_meta_pages = RINGBUF_PGOFF + RINGBUF_POS_PAGES; int nr_data_pages = data_sz >> PAGE_SHIFT; int 
nr_pages = nr_meta_pages + nr_data_pages; @@ -88,10 +87,7 @@ static struct bpf_ringbuf *bpf_ringbuf_area_alloc(size_t data_sz, int numa_node) * user-space implementations significantly. */ array_size = (nr_meta_pages + 2 * nr_data_pages) * sizeof(*pages); - if (array_size > PAGE_SIZE) - pages = vmalloc_node(array_size, numa_node); - else - pages = kmalloc_node(array_size, flags, numa_node); + pages = bpf_map_area_alloc(array_size, numa_node); if (!pages) return NULL; @@ -134,7 +130,7 @@ static struct bpf_ringbuf *bpf_ringbuf_alloc(size_t data_sz, int numa_node) rb = bpf_ringbuf_area_alloc(data_sz, numa_node); if (!rb) - return ERR_PTR(-ENOMEM); + return NULL; spin_lock_init(&rb->spinlock); init_waitqueue_head(&rb->waitq); @@ -150,8 +146,6 @@ static struct bpf_ringbuf *bpf_ringbuf_alloc(size_t data_sz, int numa_node) static struct bpf_map *ringbuf_map_alloc(union bpf_attr *attr) { struct bpf_ringbuf_map *rb_map; - u64 cost; - int err; if (attr->map_flags & ~RINGBUF_CREATE_FLAG_MASK) return ERR_PTR(-EINVAL); @@ -167,32 +161,19 @@ static struct bpf_map *ringbuf_map_alloc(union bpf_attr *attr) return ERR_PTR(-E2BIG); #endif - rb_map = kzalloc(sizeof(*rb_map), GFP_USER); + rb_map = kzalloc(sizeof(*rb_map), GFP_USER | __GFP_ACCOUNT); if (!rb_map) return ERR_PTR(-ENOMEM); bpf_map_init_from_attr(&rb_map->map, attr); - cost = sizeof(struct bpf_ringbuf_map) + - sizeof(struct bpf_ringbuf) + - attr->max_entries; - err = bpf_map_charge_init(&rb_map->map.memory, cost); - if (err) - goto err_free_map; - rb_map->rb = bpf_ringbuf_alloc(attr->max_entries, rb_map->map.numa_node); - if (IS_ERR(rb_map->rb)) { - err = PTR_ERR(rb_map->rb); - goto err_uncharge; + if (!rb_map->rb) { + kfree(rb_map); + return ERR_PTR(-ENOMEM); } return &rb_map->map; - -err_uncharge: - bpf_map_charge_finish(&rb_map->map.memory); -err_free_map: - kfree(rb_map); - return ERR_PTR(err); } static void bpf_ringbuf_free(struct bpf_ringbuf *rb) diff --git a/kernel/bpf/stackmap.c b/kernel/bpf/stackmap.c index 06065fa27124..aea96b638473 100644 --- a/kernel/bpf/stackmap.c +++ b/kernel/bpf/stackmap.c @@ -90,7 +90,6 @@ static struct bpf_map *stack_map_alloc(union bpf_attr *attr) { u32 value_size = attr->value_size; struct bpf_stack_map *smap; - struct bpf_map_memory mem; u64 cost, n_buckets; int err; @@ -119,15 +118,9 @@ static struct bpf_map *stack_map_alloc(union bpf_attr *attr) cost = n_buckets * sizeof(struct stack_map_bucket *) + sizeof(*smap); cost += n_buckets * (value_size + sizeof(struct stack_map_bucket)); - err = bpf_map_charge_init(&mem, cost); - if (err) - return ERR_PTR(err); - smap = bpf_map_area_alloc(cost, bpf_map_attr_numa_node(attr)); - if (!smap) { - bpf_map_charge_finish(&mem); + if (!smap) return ERR_PTR(-ENOMEM); - } bpf_map_init_from_attr(&smap->map, attr); smap->map.value_size = value_size; @@ -135,20 +128,17 @@ static struct bpf_map *stack_map_alloc(union bpf_attr *attr) err = get_callchain_buffers(sysctl_perf_event_max_stack); if (err) - goto free_charge; + goto free_smap; err = prealloc_elems_and_freelist(smap); if (err) goto put_buffers; - bpf_map_charge_move(&smap->map.memory, &mem); - return &smap->map; put_buffers: put_callchain_buffers(); -free_charge: - bpf_map_charge_finish(&mem); +free_smap: bpf_map_area_free(smap); return ERR_PTR(err); } @@ -298,7 +288,7 @@ static void stack_map_get_build_id_offset(struct bpf_stack_build_id *id_offs, if (irqs_disabled()) { if (!IS_ENABLED(CONFIG_PREEMPT_RT)) { work = this_cpu_ptr(&up_read_work); - if (atomic_read(&work->irq_work.flags) & IRQ_WORK_BUSY) { + if 
(irq_work_is_busy(&work->irq_work)) { /* cannot queue more up_read, fallback */ irq_work_busy = true; } diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 6d49c2e1634c..4caf06fe4152 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -31,6 +31,7 @@ #include <linux/poll.h> #include <linux/bpf-netns.h> #include <linux/rcupdate_trace.h> +#include <linux/memcontrol.h> #define IS_FD_ARRAY(map) ((map)->map_type == BPF_MAP_TYPE_PERF_EVENT_ARRAY || \ (map)->map_type == BPF_MAP_TYPE_CGROUP_ARRAY || \ @@ -127,7 +128,7 @@ static struct bpf_map *find_and_alloc_map(union bpf_attr *attr) return map; } -static u32 bpf_map_value_size(struct bpf_map *map) +static u32 bpf_map_value_size(const struct bpf_map *map) { if (map->map_type == BPF_MAP_TYPE_PERCPU_HASH || map->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH || @@ -267,6 +268,10 @@ static int bpf_map_copy_value(struct bpf_map *map, void *key, void *value, return err; } +/* Please, do not use this function outside from the map creation path + * (e.g. in map update path) without taking care of setting the active + * memory cgroup (see at bpf_map_kmalloc_node() for example). + */ static void *__bpf_map_area_alloc(u64 size, int numa_node, bool mmapable) { /* We really just want to fail instead of triggering OOM killer @@ -279,7 +284,7 @@ static void *__bpf_map_area_alloc(u64 size, int numa_node, bool mmapable) * __GFP_RETRY_MAYFAIL to avoid such situations. */ - const gfp_t gfp = __GFP_NOWARN | __GFP_ZERO; + const gfp_t gfp = __GFP_NOWARN | __GFP_ZERO | __GFP_ACCOUNT; unsigned int flags = 0; unsigned long align = 1; void *area; @@ -341,77 +346,6 @@ void bpf_map_init_from_attr(struct bpf_map *map, union bpf_attr *attr) map->numa_node = bpf_map_attr_numa_node(attr); } -static int bpf_charge_memlock(struct user_struct *user, u32 pages) -{ - unsigned long memlock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT; - - if (atomic_long_add_return(pages, &user->locked_vm) > memlock_limit) { - atomic_long_sub(pages, &user->locked_vm); - return -EPERM; - } - return 0; -} - -static void bpf_uncharge_memlock(struct user_struct *user, u32 pages) -{ - if (user) - atomic_long_sub(pages, &user->locked_vm); -} - -int bpf_map_charge_init(struct bpf_map_memory *mem, u64 size) -{ - u32 pages = round_up(size, PAGE_SIZE) >> PAGE_SHIFT; - struct user_struct *user; - int ret; - - if (size >= U32_MAX - PAGE_SIZE) - return -E2BIG; - - user = get_current_user(); - ret = bpf_charge_memlock(user, pages); - if (ret) { - free_uid(user); - return ret; - } - - mem->pages = pages; - mem->user = user; - - return 0; -} - -void bpf_map_charge_finish(struct bpf_map_memory *mem) -{ - bpf_uncharge_memlock(mem->user, mem->pages); - free_uid(mem->user); -} - -void bpf_map_charge_move(struct bpf_map_memory *dst, - struct bpf_map_memory *src) -{ - *dst = *src; - - /* Make sure src will not be used for the redundant uncharging. 
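 * (Editor's note, not part of the patch: this removed block, together
 * with bpf_charge_memlock()/bpf_uncharge_memlock() above, was the
 * rlimit(RLIMIT_MEMLOCK)-based accounting that every map type charged
 * against at creation time.  The series replaces it with cgroup memory
 * accounting: allocations pass __GFP_ACCOUNT and are attributed to the
 * map's memory cgroup through the bpf_map_*alloc*() helpers added
 * further down in this file.)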
*/ - memset(src, 0, sizeof(struct bpf_map_memory)); -} - -int bpf_map_charge_memlock(struct bpf_map *map, u32 pages) -{ - int ret; - - ret = bpf_charge_memlock(map->memory.user, pages); - if (ret) - return ret; - map->memory.pages += pages; - return ret; -} - -void bpf_map_uncharge_memlock(struct bpf_map *map, u32 pages) -{ - bpf_uncharge_memlock(map->memory.user, pages); - map->memory.pages -= pages; -} - static int bpf_map_alloc_id(struct bpf_map *map) { int id; @@ -456,17 +390,74 @@ void bpf_map_free_id(struct bpf_map *map, bool do_idr_lock) __release(&map_idr_lock); } +#ifdef CONFIG_MEMCG_KMEM +static void bpf_map_save_memcg(struct bpf_map *map) +{ + map->memcg = get_mem_cgroup_from_mm(current->mm); +} + +static void bpf_map_release_memcg(struct bpf_map *map) +{ + mem_cgroup_put(map->memcg); +} + +void *bpf_map_kmalloc_node(const struct bpf_map *map, size_t size, gfp_t flags, + int node) +{ + struct mem_cgroup *old_memcg; + void *ptr; + + old_memcg = set_active_memcg(map->memcg); + ptr = kmalloc_node(size, flags | __GFP_ACCOUNT, node); + set_active_memcg(old_memcg); + + return ptr; +} + +void *bpf_map_kzalloc(const struct bpf_map *map, size_t size, gfp_t flags) +{ + struct mem_cgroup *old_memcg; + void *ptr; + + old_memcg = set_active_memcg(map->memcg); + ptr = kzalloc(size, flags | __GFP_ACCOUNT); + set_active_memcg(old_memcg); + + return ptr; +} + +void __percpu *bpf_map_alloc_percpu(const struct bpf_map *map, size_t size, + size_t align, gfp_t flags) +{ + struct mem_cgroup *old_memcg; + void __percpu *ptr; + + old_memcg = set_active_memcg(map->memcg); + ptr = __alloc_percpu_gfp(size, align, flags | __GFP_ACCOUNT); + set_active_memcg(old_memcg); + + return ptr; +} + +#else +static void bpf_map_save_memcg(struct bpf_map *map) +{ +} + +static void bpf_map_release_memcg(struct bpf_map *map) +{ +} +#endif + /* called from workqueue */ static void bpf_map_free_deferred(struct work_struct *work) { struct bpf_map *map = container_of(work, struct bpf_map, work); - struct bpf_map_memory mem; - bpf_map_charge_move(&mem, &map->memory); security_bpf_map_free(map); + bpf_map_release_memcg(map); /* implementation dependent freeing */ map->ops->map_free(map); - bpf_map_charge_finish(&mem); } static void bpf_map_put_uref(struct bpf_map *map) @@ -527,6 +518,19 @@ static fmode_t map_get_sys_perms(struct bpf_map *map, struct fd f) } #ifdef CONFIG_PROC_FS +/* Provides an approximation of the map's memory footprint. + * Used only to provide a backward compatibility and display + * a reasonable "memlock" info. 
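 * (Editor's note, not part of the patch: a worked example of the
 * function below, assuming a hypothetical plain hash map with
 * key_size=4, value_size=8 and max_entries=10000 on a 4 KiB-page
 * system:
 *
 *	size = round_up(4 + 8, 8)  = 16 bytes per element
 *	16 * 10000                 = 160000 bytes
 *	round_up(160000, 4096)     = 163840 bytes
 *
 * so fdinfo's "memlock" field now reports an estimate derived from the
 * map attributes rather than the pages actually charged.)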
+ */ +static unsigned long bpf_map_memory_footprint(const struct bpf_map *map) +{ + unsigned long size; + + size = round_up(map->key_size + bpf_map_value_size(map), 8); + + return round_up(map->max_entries * size, PAGE_SIZE); +} + static void bpf_map_show_fdinfo(struct seq_file *m, struct file *filp) { const struct bpf_map *map = filp->private_data; @@ -545,7 +549,7 @@ static void bpf_map_show_fdinfo(struct seq_file *m, struct file *filp) "value_size:\t%u\n" "max_entries:\t%u\n" "map_flags:\t%#x\n" - "memlock:\t%llu\n" + "memlock:\t%lu\n" "map_id:\t%u\n" "frozen:\t%u\n", map->map_type, @@ -553,7 +557,7 @@ static void bpf_map_show_fdinfo(struct seq_file *m, struct file *filp) map->value_size, map->max_entries, map->map_flags, - map->memory.pages * 1ULL << PAGE_SHIFT, + bpf_map_memory_footprint(map), map->id, READ_ONCE(map->frozen)); if (type) { @@ -773,7 +777,8 @@ static int map_check_btf(struct bpf_map *map, const struct btf *btf, map->map_type != BPF_MAP_TYPE_ARRAY && map->map_type != BPF_MAP_TYPE_CGROUP_STORAGE && map->map_type != BPF_MAP_TYPE_SK_STORAGE && - map->map_type != BPF_MAP_TYPE_INODE_STORAGE) + map->map_type != BPF_MAP_TYPE_INODE_STORAGE && + map->map_type != BPF_MAP_TYPE_TASK_STORAGE) return -ENOTSUPP; if (map->spin_lock_off + sizeof(struct bpf_spin_lock) > map->value_size) { @@ -795,7 +800,6 @@ static int map_check_btf(struct bpf_map *map, const struct btf *btf, static int map_create(union bpf_attr *attr) { int numa_node = bpf_map_attr_numa_node(attr); - struct bpf_map_memory mem; struct bpf_map *map; int f_flags; int err; @@ -874,6 +878,8 @@ static int map_create(union bpf_attr *attr) if (err) goto free_map_sec; + bpf_map_save_memcg(map); + err = bpf_map_new_fd(map, f_flags); if (err < 0) { /* failed to allocate fd. @@ -892,9 +898,7 @@ free_map_sec: security_bpf_map_free(map); free_map: btf_put(map->btf); - bpf_map_charge_move(&mem, &map->memory); map->ops->map_free(map); - bpf_map_charge_finish(&mem); return err; } @@ -1628,51 +1632,6 @@ static void bpf_audit_prog(const struct bpf_prog *prog, unsigned int op) audit_log_end(ab); } -int __bpf_prog_charge(struct user_struct *user, u32 pages) -{ - unsigned long memlock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT; - unsigned long user_bufs; - - if (user) { - user_bufs = atomic_long_add_return(pages, &user->locked_vm); - if (user_bufs > memlock_limit) { - atomic_long_sub(pages, &user->locked_vm); - return -EPERM; - } - } - - return 0; -} - -void __bpf_prog_uncharge(struct user_struct *user, u32 pages) -{ - if (user) - atomic_long_sub(pages, &user->locked_vm); -} - -static int bpf_prog_charge_memlock(struct bpf_prog *prog) -{ - struct user_struct *user = get_current_user(); - int ret; - - ret = __bpf_prog_charge(user, prog->pages); - if (ret) { - free_uid(user); - return ret; - } - - prog->aux->user = user; - return 0; -} - -static void bpf_prog_uncharge_memlock(struct bpf_prog *prog) -{ - struct user_struct *user = prog->aux->user; - - __bpf_prog_uncharge(user, prog->pages); - free_uid(user); -} - static int bpf_prog_alloc_id(struct bpf_prog *prog) { int id; @@ -1722,7 +1681,7 @@ static void __bpf_prog_put_rcu(struct rcu_head *rcu) kvfree(aux->func_info); kfree(aux->func_info_aux); - bpf_prog_uncharge_memlock(aux->prog); + free_uid(aux->user); security_bpf_prog_free(aux); bpf_prog_free(aux->prog); } @@ -1732,6 +1691,8 @@ static void __bpf_prog_put_noref(struct bpf_prog *prog, bool deferred) bpf_prog_kallsyms_del_all(prog); btf_put(prog->aux->btf); bpf_prog_free_linfo(prog); + if (prog->aux->attach_btf) + 
btf_put(prog->aux->attach_btf); if (deferred) { if (prog->aux->sleepable) @@ -1965,12 +1926,16 @@ static void bpf_prog_load_fixup_attach_type(union bpf_attr *attr) static int bpf_prog_load_check_attach(enum bpf_prog_type prog_type, enum bpf_attach_type expected_attach_type, - u32 btf_id, u32 prog_fd) + struct btf *attach_btf, u32 btf_id, + struct bpf_prog *dst_prog) { if (btf_id) { if (btf_id > BTF_MAX_TYPE) return -EINVAL; + if (!attach_btf && !dst_prog) + return -EINVAL; + switch (prog_type) { case BPF_PROG_TYPE_TRACING: case BPF_PROG_TYPE_LSM: @@ -1982,7 +1947,10 @@ bpf_prog_load_check_attach(enum bpf_prog_type prog_type, } } - if (prog_fd && prog_type != BPF_PROG_TYPE_TRACING && + if (attach_btf && (!btf_id || dst_prog)) + return -EINVAL; + + if (dst_prog && prog_type != BPF_PROG_TYPE_TRACING && prog_type != BPF_PROG_TYPE_EXT) return -EINVAL; @@ -2099,7 +2067,8 @@ static bool is_perfmon_prog_type(enum bpf_prog_type prog_type) static int bpf_prog_load(union bpf_attr *attr, union bpf_attr __user *uattr) { enum bpf_prog_type type = attr->prog_type; - struct bpf_prog *prog; + struct bpf_prog *prog, *dst_prog = NULL; + struct btf *attach_btf = NULL; int err; char license[128]; bool is_gpl; @@ -2141,47 +2110,73 @@ static int bpf_prog_load(union bpf_attr *attr, union bpf_attr __user *uattr) if (is_perfmon_prog_type(type) && !perfmon_capable()) return -EPERM; + /* attach_prog_fd/attach_btf_obj_fd can specify fd of either bpf_prog + * or btf, we need to check which one it is + */ + if (attr->attach_prog_fd) { + dst_prog = bpf_prog_get(attr->attach_prog_fd); + if (IS_ERR(dst_prog)) { + dst_prog = NULL; + attach_btf = btf_get_by_fd(attr->attach_btf_obj_fd); + if (IS_ERR(attach_btf)) + return -EINVAL; + if (!btf_is_kernel(attach_btf)) { + /* attaching through specifying bpf_prog's BTF + * objects directly might be supported eventually + */ + btf_put(attach_btf); + return -ENOTSUPP; + } + } + } else if (attr->attach_btf_id) { + /* fall back to vmlinux BTF, if BTF type ID is specified */ + attach_btf = bpf_get_btf_vmlinux(); + if (IS_ERR(attach_btf)) + return PTR_ERR(attach_btf); + if (!attach_btf) + return -EINVAL; + btf_get(attach_btf); + } + bpf_prog_load_fixup_attach_type(attr); if (bpf_prog_load_check_attach(type, attr->expected_attach_type, - attr->attach_btf_id, - attr->attach_prog_fd)) + attach_btf, attr->attach_btf_id, + dst_prog)) { + if (dst_prog) + bpf_prog_put(dst_prog); + if (attach_btf) + btf_put(attach_btf); return -EINVAL; + } /* plain bpf_prog allocation */ prog = bpf_prog_alloc(bpf_prog_size(attr->insn_cnt), GFP_USER); - if (!prog) + if (!prog) { + if (dst_prog) + bpf_prog_put(dst_prog); + if (attach_btf) + btf_put(attach_btf); return -ENOMEM; + } prog->expected_attach_type = attr->expected_attach_type; + prog->aux->attach_btf = attach_btf; prog->aux->attach_btf_id = attr->attach_btf_id; - if (attr->attach_prog_fd) { - struct bpf_prog *dst_prog; - - dst_prog = bpf_prog_get(attr->attach_prog_fd); - if (IS_ERR(dst_prog)) { - err = PTR_ERR(dst_prog); - goto free_prog_nouncharge; - } - prog->aux->dst_prog = dst_prog; - } - + prog->aux->dst_prog = dst_prog; prog->aux->offload_requested = !!attr->prog_ifindex; prog->aux->sleepable = attr->prog_flags & BPF_F_SLEEPABLE; err = security_bpf_prog_alloc(prog->aux); if (err) - goto free_prog_nouncharge; - - err = bpf_prog_charge_memlock(prog); - if (err) - goto free_prog_sec; + goto free_prog; + prog->aux->user = get_current_user(); prog->len = attr->insn_cnt; err = -EFAULT; if (copy_from_user(prog->insns, u64_to_user_ptr(attr->insns), 
bpf_prog_insn_size(prog)) != 0) - goto free_prog; + goto free_prog_sec; prog->orig_prog = NULL; prog->jited = 0; @@ -2192,19 +2187,19 @@ static int bpf_prog_load(union bpf_attr *attr, union bpf_attr __user *uattr) if (bpf_prog_is_dev_bound(prog->aux)) { err = bpf_prog_offload_init(prog, attr); if (err) - goto free_prog; + goto free_prog_sec; } /* find program type: socket_filter vs tracing_filter */ err = find_prog_type(type, prog); if (err < 0) - goto free_prog; + goto free_prog_sec; prog->aux->load_time = ktime_get_boottime_ns(); err = bpf_obj_name_cpy(prog->aux->name, attr->prog_name, sizeof(attr->prog_name)); if (err < 0) - goto free_prog; + goto free_prog_sec; /* run eBPF verifier */ err = bpf_check(&prog, attr, uattr); @@ -2249,11 +2244,12 @@ free_used_maps: */ __bpf_prog_put_noref(prog, prog->aux->func_cnt); return err; -free_prog: - bpf_prog_uncharge_memlock(prog); free_prog_sec: + free_uid(prog->aux->user); security_bpf_prog_free(prog->aux); -free_prog_nouncharge: +free_prog: + if (prog->aux->attach_btf) + btf_put(prog->aux->attach_btf); bpf_prog_free(prog); return err; } @@ -2611,7 +2607,7 @@ static int bpf_tracing_prog_attach(struct bpf_prog *prog, goto out_put_prog; } - key = bpf_trampoline_compute_key(tgt_prog, btf_id); + key = bpf_trampoline_compute_key(tgt_prog, NULL, btf_id); } link = kzalloc(sizeof(*link), GFP_USER); @@ -3588,7 +3584,7 @@ static int bpf_prog_get_info_by_fd(struct file *file, } if (prog->aux->btf) - info.btf_id = btf_id(prog->aux->btf); + info.btf_id = btf_obj_id(prog->aux->btf); ulen = info.nr_func_info; info.nr_func_info = prog->aux->func_info_cnt; @@ -3691,7 +3687,7 @@ static int bpf_map_get_info_by_fd(struct file *file, memcpy(info.name, map->name, sizeof(map->name)); if (map->btf) { - info.btf_id = btf_id(map->btf); + info.btf_id = btf_obj_id(map->btf); info.btf_key_type_id = map->btf_key_type_id; info.btf_value_type_id = map->btf_value_type_id; } diff --git a/kernel/bpf/sysfs_btf.c b/kernel/bpf/sysfs_btf.c index 11b3380887fa..ef6911aee3bb 100644 --- a/kernel/bpf/sysfs_btf.c +++ b/kernel/bpf/sysfs_btf.c @@ -26,7 +26,7 @@ static struct bin_attribute bin_attr_btf_vmlinux __ro_after_init = { .read = btf_vmlinux_read, }; -static struct kobject *btf_kobj; +struct kobject *btf_kobj; static int __init btf_vmlinux_init(void) { diff --git a/kernel/bpf/task_iter.c b/kernel/bpf/task_iter.c index 4ec63170c741..e73c07593024 100644 --- a/kernel/bpf/task_iter.c +++ b/kernel/bpf/task_iter.c @@ -135,8 +135,7 @@ struct bpf_iter_seq_task_file_info { }; static struct file * -task_file_seq_get_next(struct bpf_iter_seq_task_file_info *info, - struct task_struct **task) +task_file_seq_get_next(struct bpf_iter_seq_task_file_info *info) { struct pid_namespace *ns = info->common.ns; u32 curr_tid = info->tid; @@ -148,16 +147,18 @@ task_file_seq_get_next(struct bpf_iter_seq_task_file_info *info, * Otherwise, it does not hold any reference. 
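 * (Editor's note, not part of the patch: this refactor drops
 * task_file_seq_get_next()'s **task out-parameter and tracks the
 * current task in info->task instead, which lets task_file_seq_start()
 * and task_file_seq_next() shrink to thin wrappers around it.)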
*/ again: - if (*task) { - curr_task = *task; + if (info->task) { + curr_task = info->task; curr_fd = info->fd; } else { curr_task = task_seq_get_next(ns, &curr_tid, true); - if (!curr_task) + if (!curr_task) { + info->task = NULL; return NULL; + } - /* set *task and info->tid */ - *task = curr_task; + /* set info->task and info->tid */ + info->task = curr_task; if (curr_tid == info->tid) { curr_fd = info->fd; } else { @@ -184,7 +185,7 @@ again: /* the current task is done, go to the next task */ rcu_read_unlock(); put_task_struct(curr_task); - *task = NULL; + info->task = NULL; info->fd = 0; curr_tid = ++(info->tid); goto again; @@ -193,18 +194,12 @@ again: static void *task_file_seq_start(struct seq_file *seq, loff_t *pos) { struct bpf_iter_seq_task_file_info *info = seq->private; - struct task_struct *task = NULL; struct file *file; - file = task_file_seq_get_next(info, &task); - if (!file) { - info->task = NULL; - return NULL; - } - - if (*pos == 0) + info->task = NULL; + file = task_file_seq_get_next(info); + if (file && *pos == 0) ++*pos; - info->task = task; return file; } @@ -212,21 +207,11 @@ static void *task_file_seq_start(struct seq_file *seq, loff_t *pos) static void *task_file_seq_next(struct seq_file *seq, void *v, loff_t *pos) { struct bpf_iter_seq_task_file_info *info = seq->private; - struct task_struct *task = info->task; - struct file *file; ++*pos; ++info->fd; fput((struct file *)v); - file = task_file_seq_get_next(info, &task); - if (!file) { - info->task = NULL; - return NULL; - } - - info->task = task; - - return file; + return task_file_seq_get_next(info); } struct bpf_iter__task_file { @@ -313,6 +298,7 @@ static const struct bpf_iter_seq_info task_seq_info = { static struct bpf_iter_reg task_reg_info = { .target = "task", + .feature = BPF_ITER_RESCHED, .ctx_arg_info_size = 1, .ctx_arg_info = { { offsetof(struct bpf_iter__task, task), @@ -330,6 +316,7 @@ static const struct bpf_iter_seq_info task_file_seq_info = { static struct bpf_iter_reg task_file_reg_info = { .target = "task_file", + .feature = BPF_ITER_RESCHED, .ctx_arg_info_size = 2, .ctx_arg_info = { { offsetof(struct bpf_iter__task_file, task), diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 6200519582a6..17270b8404f1 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -238,7 +238,9 @@ struct bpf_call_arg_meta { u64 msize_max_value; int ref_obj_id; int func_id; + struct btf *btf; u32 btf_id; + struct btf *ret_btf; u32 ret_btf_id; }; @@ -556,10 +558,9 @@ static struct bpf_func_state *func(struct bpf_verifier_env *env, return cur->frame[reg->frameno]; } -const char *kernel_type_name(u32 id) +static const char *kernel_type_name(const struct btf* btf, u32 id) { - return btf_name_by_offset(btf_vmlinux, - btf_type_by_id(btf_vmlinux, id)->name_off); + return btf_name_by_offset(btf, btf_type_by_id(btf, id)->name_off); } static void print_verifier_state(struct bpf_verifier_env *env, @@ -589,7 +590,7 @@ static void print_verifier_state(struct bpf_verifier_env *env, if (t == PTR_TO_BTF_ID || t == PTR_TO_BTF_ID_OR_NULL || t == PTR_TO_PERCPU_BTF_ID) - verbose(env, "%s", kernel_type_name(reg->btf_id)); + verbose(env, "%s", kernel_type_name(reg->btf, reg->btf_id)); verbose(env, "(id=%d", reg->id); if (reg_type_may_be_refcounted_or_null(t)) verbose(env, ",ref_obj_id=%d", reg->ref_obj_id); @@ -1298,9 +1299,7 @@ static void __reg_combine_32_into_64(struct bpf_reg_state *reg) static bool __reg64_bound_s32(s64 a) { - if (a > S32_MIN && a < S32_MAX) - return true; - return false; + return a > 
S32_MIN && a < S32_MAX; } static bool __reg64_bound_u32(u64 a) @@ -1314,10 +1313,10 @@ static void __reg_combine_64_into_32(struct bpf_reg_state *reg) { __mark_reg32_unbounded(reg); - if (__reg64_bound_s32(reg->smin_value)) + if (__reg64_bound_s32(reg->smin_value) && __reg64_bound_s32(reg->smax_value)) { reg->s32_min_value = (s32)reg->smin_value; - if (__reg64_bound_s32(reg->smax_value)) reg->s32_max_value = (s32)reg->smax_value; + } if (__reg64_bound_u32(reg->umin_value)) reg->u32_min_value = (u32)reg->umin_value; if (__reg64_bound_u32(reg->umax_value)) @@ -1383,7 +1382,8 @@ static void mark_reg_not_init(struct bpf_verifier_env *env, static void mark_btf_ld_reg(struct bpf_verifier_env *env, struct bpf_reg_state *regs, u32 regno, - enum bpf_reg_type reg_type, u32 btf_id) + enum bpf_reg_type reg_type, + struct btf *btf, u32 btf_id) { if (reg_type == SCALAR_VALUE) { mark_reg_unknown(env, regs, regno); @@ -1391,6 +1391,7 @@ static void mark_btf_ld_reg(struct bpf_verifier_env *env, } mark_reg_known_zero(env, regs, regno); regs[regno].type = PTR_TO_BTF_ID; + regs[regno].btf = btf; regs[regno].btf_id = btf_id; } @@ -2739,7 +2740,9 @@ static int check_packet_access(struct bpf_verifier_env *env, u32 regno, int off, regno); return -EACCES; } - err = __check_mem_access(env, regno, off, size, reg->range, + + err = reg->range < 0 ? -EINVAL : + __check_mem_access(env, regno, off, size, reg->range, zero_size_allowed); if (err) { verbose(env, "R%d offset is outside of the packet\n", regno); @@ -2762,7 +2765,7 @@ static int check_packet_access(struct bpf_verifier_env *env, u32 regno, int off, /* check access to 'struct bpf_context' fields. Supports fixed offsets only */ static int check_ctx_access(struct bpf_verifier_env *env, int insn_idx, int off, int size, enum bpf_access_type t, enum bpf_reg_type *reg_type, - u32 *btf_id) + struct btf **btf, u32 *btf_id) { struct bpf_insn_access_aux info = { .reg_type = *reg_type, @@ -2780,10 +2783,12 @@ static int check_ctx_access(struct bpf_verifier_env *env, int insn_idx, int off, */ *reg_type = info.reg_type; - if (*reg_type == PTR_TO_BTF_ID || *reg_type == PTR_TO_BTF_ID_OR_NULL) + if (*reg_type == PTR_TO_BTF_ID || *reg_type == PTR_TO_BTF_ID_OR_NULL) { + *btf = info.btf; *btf_id = info.btf_id; - else + } else { env->insn_aux_data[insn_idx].ctx_field_size = info.ctx_field_size; + } /* remember the offset of last byte accessed in ctx */ if (env->prog->aux->max_ctx_offset < off + size) env->prog->aux->max_ctx_offset = off + size; @@ -3295,8 +3300,8 @@ static int check_ptr_to_btf_access(struct bpf_verifier_env *env, int value_regno) { struct bpf_reg_state *reg = regs + regno; - const struct btf_type *t = btf_type_by_id(btf_vmlinux, reg->btf_id); - const char *tname = btf_name_by_offset(btf_vmlinux, t->name_off); + const struct btf_type *t = btf_type_by_id(reg->btf, reg->btf_id); + const char *tname = btf_name_by_offset(reg->btf, t->name_off); u32 btf_id; int ret; @@ -3317,23 +3322,23 @@ static int check_ptr_to_btf_access(struct bpf_verifier_env *env, } if (env->ops->btf_struct_access) { - ret = env->ops->btf_struct_access(&env->log, t, off, size, - atype, &btf_id); + ret = env->ops->btf_struct_access(&env->log, reg->btf, t, + off, size, atype, &btf_id); } else { if (atype != BPF_READ) { verbose(env, "only read is supported\n"); return -EACCES; } - ret = btf_struct_access(&env->log, t, off, size, atype, - &btf_id); + ret = btf_struct_access(&env->log, reg->btf, t, off, size, + atype, &btf_id); } if (ret < 0) return ret; if (atype == BPF_READ && value_regno >= 0) - 
mark_btf_ld_reg(env, regs, value_regno, ret, btf_id); + mark_btf_ld_reg(env, regs, value_regno, ret, reg->btf, btf_id); return 0; } @@ -3383,12 +3388,12 @@ static int check_ptr_to_map_access(struct bpf_verifier_env *env, return -EACCES; } - ret = btf_struct_access(&env->log, t, off, size, atype, &btf_id); + ret = btf_struct_access(&env->log, btf_vmlinux, t, off, size, atype, &btf_id); if (ret < 0) return ret; if (value_regno >= 0) - mark_btf_ld_reg(env, regs, value_regno, ret, btf_id); + mark_btf_ld_reg(env, regs, value_regno, ret, btf_vmlinux, btf_id); return 0; } @@ -3464,6 +3469,7 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn mark_reg_unknown(env, regs, value_regno); } else if (reg->type == PTR_TO_CTX) { enum bpf_reg_type reg_type = SCALAR_VALUE; + struct btf *btf = NULL; u32 btf_id = 0; if (t == BPF_WRITE && value_regno >= 0 && @@ -3476,7 +3482,7 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn if (err < 0) return err; - err = check_ctx_access(env, insn_idx, off, size, t, ®_type, &btf_id); + err = check_ctx_access(env, insn_idx, off, size, t, ®_type, &btf, &btf_id); if (err) verbose_linfo(env, insn_idx, "; "); if (!err && t == BPF_READ && value_regno >= 0) { @@ -3498,8 +3504,10 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn */ regs[value_regno].subreg_def = DEF_NOT_SUBREG; if (reg_type == PTR_TO_BTF_ID || - reg_type == PTR_TO_BTF_ID_OR_NULL) + reg_type == PTR_TO_BTF_ID_OR_NULL) { + regs[value_regno].btf = btf; regs[value_regno].btf_id = btf_id; + } } regs[value_regno].type = reg_type; } @@ -3759,7 +3767,8 @@ static int check_stack_boundary(struct bpf_verifier_env *env, int regno, goto mark; if (state->stack[spi].slot_type[0] == STACK_SPILL && - state->stack[spi].spilled_ptr.type == SCALAR_VALUE) { + (state->stack[spi].spilled_ptr.type == SCALAR_VALUE || + env->allow_ptr_leaks)) { __mark_reg_unknown(env, &state->stack[spi].spilled_ptr); for (j = 0; j < BPF_REG_SIZE; j++) state->stack[spi].slot_type[j] = STACK_MISC; @@ -4116,11 +4125,11 @@ found: arg_btf_id = compatible->btf_id; } - if (!btf_struct_ids_match(&env->log, reg->off, reg->btf_id, - *arg_btf_id)) { + if (!btf_struct_ids_match(&env->log, reg->btf, reg->btf_id, reg->off, + btf_vmlinux, *arg_btf_id)) { verbose(env, "R%d is of type %s but %s is expected\n", - regno, kernel_type_name(reg->btf_id), - kernel_type_name(*arg_btf_id)); + regno, kernel_type_name(reg->btf, reg->btf_id), + kernel_type_name(btf_vmlinux, *arg_btf_id)); return -EACCES; } @@ -4242,6 +4251,7 @@ skip_type_check: verbose(env, "Helper has invalid btf_id in R%d\n", regno); return -EACCES; } + meta->ret_btf = reg->btf; meta->ret_btf_id = reg->btf_id; } else if (arg_type == ARG_PTR_TO_SPIN_LOCK) { if (meta->func_id == BPF_FUNC_spin_lock) { @@ -4469,6 +4479,11 @@ static int check_map_func_compatibility(struct bpf_verifier_env *env, func_id != BPF_FUNC_inode_storage_delete) goto error; break; + case BPF_MAP_TYPE_TASK_STORAGE: + if (func_id != BPF_FUNC_task_storage_get && + func_id != BPF_FUNC_task_storage_delete) + goto error; + break; default: break; } @@ -4547,6 +4562,11 @@ static int check_map_func_compatibility(struct bpf_verifier_env *env, if (map->map_type != BPF_MAP_TYPE_INODE_STORAGE) goto error; break; + case BPF_FUNC_task_storage_get: + case BPF_FUNC_task_storage_delete: + if (map->map_type != BPF_MAP_TYPE_TASK_STORAGE) + goto error; + break; default: break; } @@ -4687,6 +4707,32 @@ static void clear_all_pkt_pointers(struct bpf_verifier_env *env) 
__clear_all_pkt_pointers(env, vstate->frame[i]); } +enum { + AT_PKT_END = -1, + BEYOND_PKT_END = -2, +}; + +static void mark_pkt_end(struct bpf_verifier_state *vstate, int regn, bool range_open) +{ + struct bpf_func_state *state = vstate->frame[vstate->curframe]; + struct bpf_reg_state *reg = &state->regs[regn]; + + if (reg->type != PTR_TO_PACKET) + /* PTR_TO_PACKET_META is not supported yet */ + return; + + /* The 'reg' is pkt > pkt_end or pkt >= pkt_end. + * How far beyond pkt_end it goes is unknown. + * if (!range_open) it's the case of pkt >= pkt_end + * if (range_open) it's the case of pkt > pkt_end + * hence this pointer is at least 1 byte bigger than pkt_end + */ + if (range_open) + reg->range = BEYOND_PKT_END; + else + reg->range = AT_PKT_END; +} + static void release_reg_references(struct bpf_verifier_env *env, struct bpf_func_state *state, int ref_obj_id) @@ -4895,6 +4941,8 @@ static void do_refine_retval_range(struct bpf_reg_state *regs, int ret_type, ret_reg->smax_value = meta->msize_max_value; ret_reg->s32_max_value = meta->msize_max_value; + ret_reg->smin_value = -MAX_ERRNO; + ret_reg->s32_min_value = -MAX_ERRNO; __reg_deduce_bounds(ret_reg); __reg_bound_offset(ret_reg); __update_reg_bounds(ret_reg); @@ -5152,16 +5200,16 @@ static int check_helper_call(struct bpf_verifier_env *env, int func_id, int insn const struct btf_type *t; mark_reg_known_zero(env, regs, BPF_REG_0); - t = btf_type_skip_modifiers(btf_vmlinux, meta.ret_btf_id, NULL); + t = btf_type_skip_modifiers(meta.ret_btf, meta.ret_btf_id, NULL); if (!btf_type_is_struct(t)) { u32 tsize; const struct btf_type *ret; const char *tname; /* resolve the type size of ksym. */ - ret = btf_resolve_size(btf_vmlinux, t, &tsize); + ret = btf_resolve_size(meta.ret_btf, t, &tsize); if (IS_ERR(ret)) { - tname = btf_name_by_offset(btf_vmlinux, t->name_off); + tname = btf_name_by_offset(meta.ret_btf, t->name_off); verbose(env, "unable to resolve the size of type '%s': %ld\n", tname, PTR_ERR(ret)); return -EINVAL; @@ -5174,19 +5222,27 @@ static int check_helper_call(struct bpf_verifier_env *env, int func_id, int insn regs[BPF_REG_0].type = fn->ret_type == RET_PTR_TO_MEM_OR_BTF_ID ? PTR_TO_BTF_ID : PTR_TO_BTF_ID_OR_NULL; + regs[BPF_REG_0].btf = meta.ret_btf; regs[BPF_REG_0].btf_id = meta.ret_btf_id; } - } else if (fn->ret_type == RET_PTR_TO_BTF_ID_OR_NULL) { + } else if (fn->ret_type == RET_PTR_TO_BTF_ID_OR_NULL || + fn->ret_type == RET_PTR_TO_BTF_ID) { int ret_btf_id; mark_reg_known_zero(env, regs, BPF_REG_0); - regs[BPF_REG_0].type = PTR_TO_BTF_ID_OR_NULL; + regs[BPF_REG_0].type = fn->ret_type == RET_PTR_TO_BTF_ID ? 
+					PTR_TO_BTF_ID :
+					PTR_TO_BTF_ID_OR_NULL;
 		ret_btf_id = *fn->ret_btf_id;
 		if (ret_btf_id == 0) {
 			verbose(env, "invalid return type %d of func %s#%d\n",
 				fn->ret_type, func_id_name(func_id), func_id);
 			return -EINVAL;
 		}
+		/* current BPF helper definitions are only coming from
+		 * built-in code with type IDs from vmlinux BTF
+		 */
+		regs[BPF_REG_0].btf = btf_vmlinux;
 		regs[BPF_REG_0].btf_id = ret_btf_id;
 	} else {
 		verbose(env, "unknown return type %d of func %s#%d\n",
@@ -5586,7 +5642,7 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env,
 		if (reg_is_pkt_pointer(ptr_reg)) {
 			dst_reg->id = ++env->id_gen;
 			/* something was added to pkt_ptr, set range to zero */
-			dst_reg->raw = 0;
+			memset(&dst_reg->raw, 0, sizeof(dst_reg->raw));
 		}
 		break;
 	case BPF_SUB:
@@ -5651,7 +5707,7 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env,
 			dst_reg->id = ++env->id_gen;
 			/* something was added to pkt_ptr, set range to zero */
 			if (smin_val < 0)
-				dst_reg->raw = 0;
+				memset(&dst_reg->raw, 0, sizeof(dst_reg->raw));
 		}
 		break;
 	case BPF_AND:
@@ -6695,7 +6751,7 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn)
 
 static void __find_good_pkt_pointers(struct bpf_func_state *state,
 				     struct bpf_reg_state *dst_reg,
-				     enum bpf_reg_type type, u16 new_range)
+				     enum bpf_reg_type type, int new_range)
 {
 	struct bpf_reg_state *reg;
 	int i;
@@ -6720,8 +6776,7 @@ static void find_good_pkt_pointers(struct bpf_verifier_state *vstate,
 			       enum bpf_reg_type type,
 			       bool range_right_open)
 {
-	u16 new_range;
-	int i;
+	int new_range, i;
 
 	if (dst_reg->off < 0 ||
 	    (dst_reg->off == 0 && range_right_open))
@@ -6972,6 +7027,67 @@ static int is_branch_taken(struct bpf_reg_state *reg, u64 val, u8 opcode,
 	return is_branch64_taken(reg, val, opcode);
 }
 
+static int flip_opcode(u32 opcode)
+{
+	/* How can we transform "a <op> b" into "b <op> a"? */
+	static const u8 opcode_flip[16] = {
+		/* these stay the same */
+		[BPF_JEQ >> 4] = BPF_JEQ,
+		[BPF_JNE >> 4] = BPF_JNE,
+		[BPF_JSET >> 4] = BPF_JSET,
+		/* these swap "lesser" and "greater" (L and G in the opcodes) */
+		[BPF_JGE >> 4] = BPF_JLE,
+		[BPF_JGT >> 4] = BPF_JLT,
+		[BPF_JLE >> 4] = BPF_JGE,
+		[BPF_JLT >> 4] = BPF_JGT,
+		[BPF_JSGE >> 4] = BPF_JSLE,
+		[BPF_JSGT >> 4] = BPF_JSLT,
+		[BPF_JSLE >> 4] = BPF_JSGE,
+		[BPF_JSLT >> 4] = BPF_JSGT
+	};
+	return opcode_flip[opcode >> 4];
+}
+
+static int is_pkt_ptr_branch_taken(struct bpf_reg_state *dst_reg,
+				   struct bpf_reg_state *src_reg,
+				   u8 opcode)
+{
+	struct bpf_reg_state *pkt;
+
+	if (src_reg->type == PTR_TO_PACKET_END) {
+		pkt = dst_reg;
+	} else if (dst_reg->type == PTR_TO_PACKET_END) {
+		pkt = src_reg;
+		opcode = flip_opcode(opcode);
+	} else {
+		return -1;
+	}
+
+	if (pkt->range >= 0)
+		return -1;
+
+	switch (opcode) {
+	case BPF_JLE:
+		/* pkt <= pkt_end */
+		fallthrough;
+	case BPF_JGT:
+		/* pkt > pkt_end */
+		if (pkt->range == BEYOND_PKT_END)
+			/* pkt has at least one extra byte beyond pkt_end */
+			return opcode == BPF_JGT;
+		break;
+	case BPF_JLT:
+		/* pkt < pkt_end */
+		fallthrough;
+	case BPF_JGE:
+		/* pkt >= pkt_end */
+		if (pkt->range == BEYOND_PKT_END || pkt->range == AT_PKT_END)
+			return opcode == BPF_JGE;
+		break;
+	}
+	return -1;
+}
+
 /* Adjusts the register min/max values in the case that the dst_reg is the
  * variable register that we are working on, and src_reg is a constant or we're
  * simply doing a BPF_K check.
@@ -7135,23 +7251,7 @@ static void reg_set_min_max_inv(struct bpf_reg_state *true_reg,
 				u64 val, u32 val32,
 				u8 opcode, bool is_jmp32)
 {
-	/* How can we transform "a <op> b" into "b <op> a"?
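 * (Editor's note, not part of the patch: the table deleted here is the
 * one hoisted into the new flip_opcode() helper added earlier in this
 * diff, so that reg_set_min_max_inv() and is_pkt_ptr_branch_taken()
 * can share it; for example, flip_opcode(BPF_JGT) yields BPF_JLT,
 * while BPF_JEQ, BPF_JNE and BPF_JSET map to themselves.)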
*/ - static const u8 opcode_flip[16] = { - /* these stay the same */ - [BPF_JEQ >> 4] = BPF_JEQ, - [BPF_JNE >> 4] = BPF_JNE, - [BPF_JSET >> 4] = BPF_JSET, - /* these swap "lesser" and "greater" (L and G in the opcodes) */ - [BPF_JGE >> 4] = BPF_JLE, - [BPF_JGT >> 4] = BPF_JLT, - [BPF_JLE >> 4] = BPF_JGE, - [BPF_JLT >> 4] = BPF_JGT, - [BPF_JSGE >> 4] = BPF_JSLE, - [BPF_JSGT >> 4] = BPF_JSLT, - [BPF_JSLE >> 4] = BPF_JSGE, - [BPF_JSLT >> 4] = BPF_JSGT - }; - opcode = opcode_flip[opcode >> 4]; + opcode = flip_opcode(opcode); /* This uses zero as "not present in table"; luckily the zero opcode, * BPF_JA, can't get here. */ @@ -7333,6 +7433,7 @@ static bool try_match_pkt_pointers(const struct bpf_insn *insn, /* pkt_data' > pkt_end, pkt_meta' > pkt_data */ find_good_pkt_pointers(this_branch, dst_reg, dst_reg->type, false); + mark_pkt_end(other_branch, insn->dst_reg, true); } else if ((dst_reg->type == PTR_TO_PACKET_END && src_reg->type == PTR_TO_PACKET) || (reg_is_init_pkt_pointer(dst_reg, PTR_TO_PACKET) && @@ -7340,6 +7441,7 @@ static bool try_match_pkt_pointers(const struct bpf_insn *insn, /* pkt_end > pkt_data', pkt_data > pkt_meta' */ find_good_pkt_pointers(other_branch, src_reg, src_reg->type, true); + mark_pkt_end(this_branch, insn->src_reg, false); } else { return false; } @@ -7352,6 +7454,7 @@ static bool try_match_pkt_pointers(const struct bpf_insn *insn, /* pkt_data' < pkt_end, pkt_meta' < pkt_data */ find_good_pkt_pointers(other_branch, dst_reg, dst_reg->type, true); + mark_pkt_end(this_branch, insn->dst_reg, false); } else if ((dst_reg->type == PTR_TO_PACKET_END && src_reg->type == PTR_TO_PACKET) || (reg_is_init_pkt_pointer(dst_reg, PTR_TO_PACKET) && @@ -7359,6 +7462,7 @@ static bool try_match_pkt_pointers(const struct bpf_insn *insn, /* pkt_end < pkt_data', pkt_data > pkt_meta' */ find_good_pkt_pointers(this_branch, src_reg, src_reg->type, false); + mark_pkt_end(other_branch, insn->src_reg, true); } else { return false; } @@ -7371,6 +7475,7 @@ static bool try_match_pkt_pointers(const struct bpf_insn *insn, /* pkt_data' >= pkt_end, pkt_meta' >= pkt_data */ find_good_pkt_pointers(this_branch, dst_reg, dst_reg->type, true); + mark_pkt_end(other_branch, insn->dst_reg, false); } else if ((dst_reg->type == PTR_TO_PACKET_END && src_reg->type == PTR_TO_PACKET) || (reg_is_init_pkt_pointer(dst_reg, PTR_TO_PACKET) && @@ -7378,6 +7483,7 @@ static bool try_match_pkt_pointers(const struct bpf_insn *insn, /* pkt_end >= pkt_data', pkt_data >= pkt_meta' */ find_good_pkt_pointers(other_branch, src_reg, src_reg->type, false); + mark_pkt_end(this_branch, insn->src_reg, true); } else { return false; } @@ -7390,6 +7496,7 @@ static bool try_match_pkt_pointers(const struct bpf_insn *insn, /* pkt_data' <= pkt_end, pkt_meta' <= pkt_data */ find_good_pkt_pointers(other_branch, dst_reg, dst_reg->type, false); + mark_pkt_end(this_branch, insn->dst_reg, true); } else if ((dst_reg->type == PTR_TO_PACKET_END && src_reg->type == PTR_TO_PACKET) || (reg_is_init_pkt_pointer(dst_reg, PTR_TO_PACKET) && @@ -7397,6 +7504,7 @@ static bool try_match_pkt_pointers(const struct bpf_insn *insn, /* pkt_end <= pkt_data', pkt_data <= pkt_meta' */ find_good_pkt_pointers(this_branch, src_reg, src_reg->type, true); + mark_pkt_end(other_branch, insn->src_reg, false); } else { return false; } @@ -7496,6 +7604,10 @@ static int check_cond_jmp_op(struct bpf_verifier_env *env, src_reg->var_off.value, opcode, is_jmp32); + } else if (reg_is_pkt_pointer_any(dst_reg) && + reg_is_pkt_pointer_any(src_reg) && + !is_jmp32) { + pred = 
is_pkt_ptr_branch_taken(dst_reg, src_reg, opcode); } if (pred >= 0) { @@ -7504,7 +7616,8 @@ static int check_cond_jmp_op(struct bpf_verifier_env *env, */ if (!__is_pointer_value(false, dst_reg)) err = mark_chain_precision(env, insn->dst_reg); - if (BPF_SRC(insn->code) == BPF_X && !err) + if (BPF_SRC(insn->code) == BPF_X && !err && + !__is_pointer_value(false, src_reg)) err = mark_chain_precision(env, insn->src_reg); if (err) return err; @@ -7646,6 +7759,7 @@ static int check_ld_imm(struct bpf_verifier_env *env, struct bpf_insn *insn) break; case PTR_TO_BTF_ID: case PTR_TO_PERCPU_BTF_ID: + dst_reg->btf = aux->btf_var.btf; dst_reg->btf_id = aux->btf_var.btf_id; break; default: @@ -7786,9 +7900,11 @@ static int check_return_code(struct bpf_verifier_env *env) struct tnum range = tnum_range(0, 1); enum bpf_prog_type prog_type = resolve_prog_type(env->prog); int err; + const bool is_subprog = env->cur_state->frame[0]->subprogno; /* LSM and struct_ops func-ptr's return type could be "void" */ - if ((prog_type == BPF_PROG_TYPE_STRUCT_OPS || + if (!is_subprog && + (prog_type == BPF_PROG_TYPE_STRUCT_OPS || prog_type == BPF_PROG_TYPE_LSM) && !prog->aux->attach_func_proto->type) return 0; @@ -7808,6 +7924,16 @@ static int check_return_code(struct bpf_verifier_env *env) return -EACCES; } + reg = cur_regs(env) + BPF_REG_0; + if (is_subprog) { + if (reg->type != SCALAR_VALUE) { + verbose(env, "At subprogram exit the register R0 is not a scalar value (%s)\n", + reg_type_str[reg->type]); + return -EINVAL; + } + return 0; + } + switch (prog_type) { case BPF_PROG_TYPE_CGROUP_SOCK_ADDR: if (env->prog->expected_attach_type == BPF_CGROUP_UDP4_RECVMSG || @@ -7861,7 +7987,6 @@ static int check_return_code(struct bpf_verifier_env *env) return 0; } - reg = cur_regs(env) + BPF_REG_0; if (reg->type != SCALAR_VALUE) { verbose(env, "At program exit the register R0 is not a known value (%s)\n", reg_type_str[reg->type]); @@ -7949,6 +8074,11 @@ static void init_explored_state(struct bpf_verifier_env *env, int idx) env->insn_aux_data[idx].prune_point = true; } +enum { + DONE_EXPLORING = 0, + KEEP_EXPLORING = 1, +}; + /* t, w, e - match pseudo-code above: * t - index of current instruction * w - next instruction @@ -7961,10 +8091,10 @@ static int push_insn(int t, int w, int e, struct bpf_verifier_env *env, int *insn_state = env->cfg.insn_state; if (e == FALLTHROUGH && insn_state[t] >= (DISCOVERED | FALLTHROUGH)) - return 0; + return DONE_EXPLORING; if (e == BRANCH && insn_state[t] >= (DISCOVERED | BRANCH)) - return 0; + return DONE_EXPLORING; if (w < 0 || w >= env->prog->len) { verbose_linfo(env, t, "%d: ", t); @@ -7983,10 +8113,10 @@ static int push_insn(int t, int w, int e, struct bpf_verifier_env *env, if (env->cfg.cur_stack >= env->prog->len) return -E2BIG; insn_stack[env->cfg.cur_stack++] = w; - return 1; + return KEEP_EXPLORING; } else if ((insn_state[w] & 0xF0) == DISCOVERED) { if (loop_ok && env->bpf_capable) - return 0; + return DONE_EXPLORING; verbose_linfo(env, t, "%d: ", t); verbose_linfo(env, w, "%d: ", w); verbose(env, "back-edge from insn %d to %d\n", t, w); @@ -7998,7 +8128,74 @@ static int push_insn(int t, int w, int e, struct bpf_verifier_env *env, verbose(env, "insn state internal bug\n"); return -EFAULT; } - return 0; + return DONE_EXPLORING; +} + +/* Visits the instruction at index t and returns one of the following: + * < 0 - an error occurred + * DONE_EXPLORING - the instruction was fully explored + * KEEP_EXPLORING - there is still work to be done before it is fully explored + */ +static int 
visit_insn(int t, int insn_cnt, struct bpf_verifier_env *env) +{ + struct bpf_insn *insns = env->prog->insnsi; + int ret; + + /* All non-branch instructions have a single fall-through edge. */ + if (BPF_CLASS(insns[t].code) != BPF_JMP && + BPF_CLASS(insns[t].code) != BPF_JMP32) + return push_insn(t, t + 1, FALLTHROUGH, env, false); + + switch (BPF_OP(insns[t].code)) { + case BPF_EXIT: + return DONE_EXPLORING; + + case BPF_CALL: + ret = push_insn(t, t + 1, FALLTHROUGH, env, false); + if (ret) + return ret; + + if (t + 1 < insn_cnt) + init_explored_state(env, t + 1); + if (insns[t].src_reg == BPF_PSEUDO_CALL) { + init_explored_state(env, t); + ret = push_insn(t, t + insns[t].imm + 1, BRANCH, + env, false); + } + return ret; + + case BPF_JA: + if (BPF_SRC(insns[t].code) != BPF_K) + return -EINVAL; + + /* unconditional jump with single edge */ + ret = push_insn(t, t + insns[t].off + 1, FALLTHROUGH, env, + true); + if (ret) + return ret; + + /* unconditional jmp is not a good pruning point, + * but it's marked, since backtracking needs + * to record jmp history in is_state_visited(). + */ + init_explored_state(env, t + insns[t].off + 1); + /* tell verifier to check for equivalent states + * after every call and jump + */ + if (t + 1 < insn_cnt) + init_explored_state(env, t + 1); + + return ret; + + default: + /* conditional jump with two edges */ + init_explored_state(env, t); + ret = push_insn(t, t + 1, FALLTHROUGH, env, true); + if (ret) + return ret; + + return push_insn(t, t + insns[t].off + 1, BRANCH, env, true); + } } /* non-recursive depth-first-search to detect loops in BPF program @@ -8006,11 +8203,10 @@ static int push_insn(int t, int w, int e, struct bpf_verifier_env *env, */ static int check_cfg(struct bpf_verifier_env *env) { - struct bpf_insn *insns = env->prog->insnsi; int insn_cnt = env->prog->len; int *insn_stack, *insn_state; int ret = 0; - int i, t; + int i; insn_state = env->cfg.insn_state = kvcalloc(insn_cnt, sizeof(int), GFP_KERNEL); if (!insn_state) @@ -8026,92 +8222,32 @@ static int check_cfg(struct bpf_verifier_env *env) insn_stack[0] = 0; /* 0 is the first instruction */ env->cfg.cur_stack = 1; -peek_stack: - if (env->cfg.cur_stack == 0) - goto check_state; - t = insn_stack[env->cfg.cur_stack - 1]; - - if (BPF_CLASS(insns[t].code) == BPF_JMP || - BPF_CLASS(insns[t].code) == BPF_JMP32) { - u8 opcode = BPF_OP(insns[t].code); - - if (opcode == BPF_EXIT) { - goto mark_explored; - } else if (opcode == BPF_CALL) { - ret = push_insn(t, t + 1, FALLTHROUGH, env, false); - if (ret == 1) - goto peek_stack; - else if (ret < 0) - goto err_free; - if (t + 1 < insn_cnt) - init_explored_state(env, t + 1); - if (insns[t].src_reg == BPF_PSEUDO_CALL) { - init_explored_state(env, t); - ret = push_insn(t, t + insns[t].imm + 1, BRANCH, - env, false); - if (ret == 1) - goto peek_stack; - else if (ret < 0) - goto err_free; - } - } else if (opcode == BPF_JA) { - if (BPF_SRC(insns[t].code) != BPF_K) { - ret = -EINVAL; - goto err_free; - } - /* unconditional jump with single edge */ - ret = push_insn(t, t + insns[t].off + 1, - FALLTHROUGH, env, true); - if (ret == 1) - goto peek_stack; - else if (ret < 0) - goto err_free; - /* unconditional jmp is not a good pruning point, - * but it's marked, since backtracking needs - * to record jmp history in is_state_visited(). 
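visit_insn() above factors a single step of the traversal out of check_cfg(); the goto-based loop it replaces is deleted just below in favor of a plain while loop over an explicit stack. The overall shape is a textbook iterative DFS; a self-contained userspace sketch of the same control structure (illustrative only, with a toy single-successor graph standing in for the instruction array):

#include <stdio.h>

enum { DONE_EXPLORING = 0, KEEP_EXPLORING = 1 };
enum { NEW = 0, DISCOVERED = 1, EXPLORED = 2 };

#define N 4
static const int succ[N] = { 1, 2, 3, -1 };     /* single successor, -1 = exit */
static int state[N], stack[N], top;

/* One step of the walk: push t's successor if it is still unexplored. */
static int visit(int t)
{
        int w = succ[t];

        if (w < 0 || state[w] != NEW)
                return DONE_EXPLORING;
        state[w] = DISCOVERED;
        stack[top++] = w;
        return KEEP_EXPLORING;
}

int main(void)
{
        state[0] = DISCOVERED;
        stack[top++] = 0;

        while (top > 0) {
                int t = stack[top - 1];

                if (visit(t) == DONE_EXPLORING) {
                        state[t] = EXPLORED;    /* fully explored: pop */
                        top--;
                }
        }
        for (int t = 0; t < N; t++)
                printf("insn %d: %s\n", t,
                       state[t] == EXPLORED ? "explored" : "unreachable");
        return 0;
}

Naming the 0/1 returns DONE_EXPLORING/KEEP_EXPLORING is what lets the new loop treat "pop" and "pushed more work" uniformly instead of the old per-opcode goto pairs.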
- */ - init_explored_state(env, t + insns[t].off + 1); - /* tell verifier to check for equivalent states - * after every call and jump - */ - if (t + 1 < insn_cnt) - init_explored_state(env, t + 1); - } else { - /* conditional jump with two edges */ - init_explored_state(env, t); - ret = push_insn(t, t + 1, FALLTHROUGH, env, true); - if (ret == 1) - goto peek_stack; - else if (ret < 0) - goto err_free; + while (env->cfg.cur_stack > 0) { + int t = insn_stack[env->cfg.cur_stack - 1]; - ret = push_insn(t, t + insns[t].off + 1, BRANCH, env, true); - if (ret == 1) - goto peek_stack; - else if (ret < 0) - goto err_free; - } - } else { - /* all other non-branch instructions with single - * fall-through edge - */ - ret = push_insn(t, t + 1, FALLTHROUGH, env, false); - if (ret == 1) - goto peek_stack; - else if (ret < 0) + ret = visit_insn(t, insn_cnt, env); + switch (ret) { + case DONE_EXPLORING: + insn_state[t] = EXPLORED; + env->cfg.cur_stack--; + break; + case KEEP_EXPLORING: + break; + default: + if (ret > 0) { + verbose(env, "visit_insn internal bug\n"); + ret = -EFAULT; + } goto err_free; + } } -mark_explored: - insn_state[t] = EXPLORED; - if (env->cfg.cur_stack-- <= 0) { + if (env->cfg.cur_stack < 0) { verbose(env, "pop stack internal bug\n"); ret = -EFAULT; goto err_free; } - goto peek_stack; -check_state: for (i = 0; i < insn_cnt; i++) { if (insn_state[i] != EXPLORED) { verbose(env, "unreachable insn %d\n", i); @@ -9572,12 +9708,13 @@ static int check_pseudo_btf_id(struct bpf_verifier_env *env, struct bpf_insn *insn, struct bpf_insn_aux_data *aux) { - u32 datasec_id, type, id = insn->imm; const struct btf_var_secinfo *vsi; const struct btf_type *datasec; const struct btf_type *t; const char *sym_name; bool percpu = false; + u32 type, id = insn->imm; + s32 datasec_id; u64 addr; int i; @@ -9630,6 +9767,7 @@ static int check_pseudo_btf_id(struct bpf_verifier_env *env, t = btf_type_skip_modifiers(btf_vmlinux, type, NULL); if (percpu) { aux->btf_var.reg_type = PTR_TO_PERCPU_BTF_ID; + aux->btf_var.btf = btf_vmlinux; aux->btf_var.btf_id = type; } else if (!btf_type_is_struct(t)) { const struct btf_type *ret; @@ -9648,6 +9786,7 @@ static int check_pseudo_btf_id(struct bpf_verifier_env *env, aux->btf_var.mem_size = tsize; } else { aux->btf_var.reg_type = PTR_TO_BTF_ID; + aux->btf_var.btf = btf_vmlinux; aux->btf_var.btf_id = type; } return 0; @@ -9719,11 +9858,21 @@ static int check_map_prog_compatibility(struct bpf_verifier_env *env, verbose(env, "trace type programs with run-time allocated hash maps are unsafe. 
Switch to preallocated hash maps.\n"); } - if ((is_tracing_prog_type(prog_type) || - prog_type == BPF_PROG_TYPE_SOCKET_FILTER) && - map_value_has_spin_lock(map)) { - verbose(env, "tracing progs cannot use bpf_spin_lock yet\n"); - return -EINVAL; + if (map_value_has_spin_lock(map)) { + if (prog_type == BPF_PROG_TYPE_SOCKET_FILTER) { + verbose(env, "socket filter progs cannot use bpf_spin_lock yet\n"); + return -EINVAL; + } + + if (is_tracing_prog_type(prog_type)) { + verbose(env, "tracing progs cannot use bpf_spin_lock yet\n"); + return -EINVAL; + } + + if (prog->aux->sleepable) { + verbose(env, "sleepable progs cannot use bpf_spin_lock yet\n"); + return -EINVAL; + } } if ((bpf_prog_is_dev_bound(prog->aux) || bpf_map_is_dev_bound(map)) && @@ -11454,20 +11603,6 @@ static int check_attach_modify_return(unsigned long addr, const char *func_name) return -EINVAL; } -/* non exhaustive list of sleepable bpf_lsm_*() functions */ -BTF_SET_START(btf_sleepable_lsm_hooks) -#ifdef CONFIG_BPF_LSM -BTF_ID(func, bpf_lsm_bprm_committed_creds) -#else -BTF_ID_UNUSED -#endif -BTF_SET_END(btf_sleepable_lsm_hooks) - -static int check_sleepable_lsm_hook(u32 btf_id) -{ - return btf_id_set_contains(&btf_sleepable_lsm_hooks, btf_id); -} - /* list of non-sleepable functions that are otherwise on * ALLOW_ERROR_INJECTION list */ @@ -11504,7 +11639,7 @@ int bpf_check_attach_target(struct bpf_verifier_log *log, bpf_log(log, "Tracing programs must provide btf_id\n"); return -EINVAL; } - btf = tgt_prog ? tgt_prog->aux->btf : btf_vmlinux; + btf = tgt_prog ? tgt_prog->aux->btf : prog->aux->attach_btf; if (!btf) { bpf_log(log, "FENTRY/FEXIT program can only be attached to another program annotated with BTF\n"); @@ -11689,7 +11824,7 @@ int bpf_check_attach_target(struct bpf_verifier_log *log, /* LSM progs check that they are attached to bpf_lsm_*() funcs. * Only some of them are sleepable. */ - if (check_sleepable_lsm_hook(btf_id)) + if (bpf_lsm_is_sleepable_hook(btf_id)) ret = 0; break; default: @@ -11780,7 +11915,7 @@ static int check_attach_btf_id(struct bpf_verifier_env *env) return ret; } - key = bpf_trampoline_compute_key(tgt_prog, btf_id); + key = bpf_trampoline_compute_key(tgt_prog, prog->aux->attach_btf, btf_id); tr = bpf_trampoline_get(key, &tgt_info); if (!tr) return -ENOMEM; diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index e41c21819ba0..fefa21981027 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -199,7 +199,7 @@ static u16 have_canfork_callback __read_mostly; /* cgroup namespace for init task */ struct cgroup_namespace init_cgroup_ns = { - .count = REFCOUNT_INIT(2), + .ns.count = REFCOUNT_INIT(2), .user_ns = &init_user_ns, .ns.ops = &cgroupns_operations, .ns.inum = PROC_CGROUP_INIT_INO, @@ -281,9 +281,6 @@ bool cgroup_ssid_enabled(int ssid) * - cpuset: a task can be moved into an empty cpuset, and again it takes * masks of ancestors. * - * - memcg: use_hierarchy is on by default and the cgroup file for the flag - * is not created. - * * - blkcg: blk-throttle becomes properly hierarchical. * * - debug: disallowed on the default hierarchy. @@ -5152,15 +5149,6 @@ static struct cgroup_subsys_state *css_create(struct cgroup *cgrp, if (err) goto err_list_del; - if (ss->broken_hierarchy && !ss->warned_broken_hierarchy && - cgroup_parent(parent)) { - pr_warn("%s (%d) created nested cgroup for controller \"%s\" which has incomplete hierarchy support. 
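For reference, the bpf_spin_lock usage that the map_value_has_spin_lock() checks above now reject per program type looks roughly like this (hedged sketch of the standard UAPI pattern; the map and field names are made up):

#include <linux/bpf.h>
#include <bpf/bpf_helpers.h>

struct val {
        struct bpf_spin_lock lock;
        long counter;
};

struct {
        __uint(type, BPF_MAP_TYPE_HASH);
        __uint(max_entries, 64);
        __type(key, int);
        __type(value, struct val);
} locked_map SEC(".maps");

/* Fine from e.g. a networking program; the hunk above now refuses the
 * same lookup+lock from socket filters, tracing programs and sleepable
 * programs, each with its own error message. */
static __always_inline void bump(int key)
{
        struct val *v = bpf_map_lookup_elem(&locked_map, &key);

        if (v) {
                bpf_spin_lock(&v->lock);
                v->counter++;
                bpf_spin_unlock(&v->lock);
        }
}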
Nested cgroups may change behavior in the future.\n", - current->comm, current->pid, ss->name); - if (!strcmp(ss->name, "memory")) - pr_warn("\"memory\" requires setting use_hierarchy to 1 on the root\n"); - ss->warned_broken_hierarchy = true; - } - return css; err_list_del: diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c index 57b5b5d0a5fd..53c70c470a38 100644 --- a/kernel/cgroup/cpuset.c +++ b/kernel/cgroup/cpuset.c @@ -983,25 +983,48 @@ partition_and_rebuild_sched_domains(int ndoms_new, cpumask_var_t doms_new[], */ static void rebuild_sched_domains_locked(void) { + struct cgroup_subsys_state *pos_css; struct sched_domain_attr *attr; cpumask_var_t *doms; + struct cpuset *cs; int ndoms; lockdep_assert_cpus_held(); percpu_rwsem_assert_held(&cpuset_rwsem); /* - * We have raced with CPU hotplug. Don't do anything to avoid + * If we have raced with CPU hotplug, return early to avoid * passing doms with offlined cpu to partition_sched_domains(). - * Anyways, hotplug work item will rebuild sched domains. + * Anyways, cpuset_hotplug_workfn() will rebuild sched domains. + * + * With no CPUs in any subpartitions, top_cpuset's effective CPUs + * should be the same as the active CPUs, so checking only top_cpuset + * is enough to detect racing CPU offlines. */ if (!top_cpuset.nr_subparts_cpus && !cpumask_equal(top_cpuset.effective_cpus, cpu_active_mask)) return; - if (top_cpuset.nr_subparts_cpus && - !cpumask_subset(top_cpuset.effective_cpus, cpu_active_mask)) - return; + /* + * With subpartition CPUs, however, the effective CPUs of a partition + * root should be only a subset of the active CPUs. Since a CPU in any + * partition root could be offlined, all must be checked. + */ + if (top_cpuset.nr_subparts_cpus) { + rcu_read_lock(); + cpuset_for_each_descendant_pre(cs, pos_css, &top_cpuset) { + if (!is_partition_root(cs)) { + pos_css = css_rightmost_descendant(pos_css); + continue; + } + if (!cpumask_subset(cs->effective_cpus, + cpu_active_mask)) { + rcu_read_unlock(); + return; + } + } + rcu_read_unlock(); + } /* Generate domain masks and attrs */ ndoms = generate_sched_domains(&doms, &attr); diff --git a/kernel/cgroup/namespace.c b/kernel/cgroup/namespace.c index 812a61afd538..f5e8828c109c 100644 --- a/kernel/cgroup/namespace.c +++ b/kernel/cgroup/namespace.c @@ -32,7 +32,7 @@ static struct cgroup_namespace *alloc_cgroup_ns(void) kfree(new_ns); return ERR_PTR(ret); } - refcount_set(&new_ns->count, 1); + refcount_set(&new_ns->ns.count, 1); new_ns->ns.ops = &cgroupns_operations; return new_ns; } diff --git a/kernel/configs/android-recommended.config b/kernel/configs/android-recommended.config index 81e9af7dcec2..53d688bdd894 100644 --- a/kernel/configs/android-recommended.config +++ b/kernel/configs/android-recommended.config @@ -111,7 +111,6 @@ CONFIG_STRICT_KERNEL_RWX=y CONFIG_SUSPEND_TIME=y CONFIG_TABLET_USB_ACECAD=y CONFIG_TABLET_USB_AIPTEK=y -CONFIG_TABLET_USB_GTCO=y CONFIG_TABLET_USB_HANWANG=y CONFIG_TABLET_USB_KBTAB=y CONFIG_TASKSTATS=y diff --git a/kernel/cpu.c b/kernel/cpu.c index 6ff2578ecf17..4e11e91010e1 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -815,6 +815,10 @@ void __init cpuhp_threads_init(void) } #ifdef CONFIG_HOTPLUG_CPU +#ifndef arch_clear_mm_cpumask_cpu +#define arch_clear_mm_cpumask_cpu(cpu, mm) cpumask_clear_cpu(cpu, mm_cpumask(mm)) +#endif + /** * clear_tasks_mm_cpumask - Safely clear tasks' mm_cpumask for a CPU * @cpu: a CPU id @@ -850,7 +854,7 @@ void clear_tasks_mm_cpumask(int cpu) t = find_lock_task_mm(p); if (!t) continue; - cpumask_clear_cpu(cpu, 
mm_cpumask(t->mm)); + arch_clear_mm_cpumask_cpu(cpu, t->mm); task_unlock(t); } rcu_read_unlock(); @@ -1602,7 +1606,7 @@ static struct cpuhp_step cpuhp_hp_states[] = { .name = "ap:online", }, /* - * Handled on controll processor until the plugged processor manages + * Handled on control processor until the plugged processor manages * this itself. */ [CPUHP_TEARDOWN_CPU] = { @@ -1611,6 +1615,13 @@ static struct cpuhp_step cpuhp_hp_states[] = { .teardown.single = takedown_cpu, .cant_stop = true, }, + + [CPUHP_AP_SCHED_WAIT_EMPTY] = { + .name = "sched:waitempty", + .startup.single = NULL, + .teardown.single = sched_cpu_wait_empty, + }, + /* Handle smpboot threads park/unpark */ [CPUHP_AP_SMPBOOT_THREADS] = { .name = "smpboot/threads:online", diff --git a/kernel/crash_core.c b/kernel/crash_core.c index 106e4500fd53..4fcfe0b70c4e 100644 --- a/kernel/crash_core.c +++ b/kernel/crash_core.c @@ -11,7 +11,7 @@ #include <asm/page.h> #include <asm/sections.h> -#include <crypto/sha.h> +#include <crypto/sha1.h> /* vmcoreinfo stuff */ unsigned char *vmcoreinfo_data; diff --git a/kernel/debug/debug_core.c b/kernel/debug/debug_core.c index 1e75a8923a8d..af6e8b4fb359 100644 --- a/kernel/debug/debug_core.c +++ b/kernel/debug/debug_core.c @@ -225,8 +225,6 @@ NOKPROBE_SYMBOL(kgdb_skipexception); * Default (weak) implementation for kgdb_roundup_cpus */ -static DEFINE_PER_CPU(call_single_data_t, kgdb_roundup_csd); - void __weak kgdb_call_nmi_hook(void *ignored) { /* @@ -241,6 +239,9 @@ void __weak kgdb_call_nmi_hook(void *ignored) } NOKPROBE_SYMBOL(kgdb_call_nmi_hook); +static DEFINE_PER_CPU(call_single_data_t, kgdb_roundup_csd) = + CSD_INIT(kgdb_call_nmi_hook, NULL); + void __weak kgdb_roundup_cpus(void) { call_single_data_t *csd; @@ -267,7 +268,6 @@ void __weak kgdb_roundup_cpus(void) continue; kgdb_info[cpu].rounding_up = true; - csd->func = kgdb_call_nmi_hook; ret = smp_call_function_single_async(cpu, csd); if (ret) kgdb_info[cpu].rounding_up = false; diff --git a/kernel/dma/swiotlb.c b/kernel/dma/swiotlb.c index b4eea0abc3f0..781b9dca197c 100644 --- a/kernel/dma/swiotlb.c +++ b/kernel/dma/swiotlb.c @@ -229,6 +229,7 @@ int __init swiotlb_init_with_tbl(char *tlb, unsigned long nslabs, int verbose) io_tlb_orig_addr[i] = INVALID_PHYS_ADDR; } io_tlb_index = 0; + no_iotlb_memory = false; if (verbose) swiotlb_print_info(); @@ -260,9 +261,11 @@ swiotlb_init(int verbose) if (vstart && !swiotlb_init_with_tbl(vstart, io_tlb_nslabs, verbose)) return; - if (io_tlb_start) + if (io_tlb_start) { memblock_free_early(io_tlb_start, PAGE_ALIGN(io_tlb_nslabs << IO_TLB_SHIFT)); + io_tlb_start = 0; + } pr_warn("Cannot allocate buffer"); no_iotlb_memory = true; } @@ -360,6 +363,7 @@ swiotlb_late_init_with_tbl(char *tlb, unsigned long nslabs) io_tlb_orig_addr[i] = INVALID_PHYS_ADDR; } io_tlb_index = 0; + no_iotlb_memory = false; swiotlb_print_info(); @@ -441,14 +445,11 @@ static void swiotlb_bounce(phys_addr_t orig_addr, phys_addr_t tlb_addr, } } -phys_addr_t swiotlb_tbl_map_single(struct device *hwdev, - dma_addr_t tbl_dma_addr, - phys_addr_t orig_addr, - size_t mapping_size, - size_t alloc_size, - enum dma_data_direction dir, - unsigned long attrs) +phys_addr_t swiotlb_tbl_map_single(struct device *hwdev, phys_addr_t orig_addr, + size_t mapping_size, size_t alloc_size, + enum dma_data_direction dir, unsigned long attrs) { + dma_addr_t tbl_dma_addr = phys_to_dma_unencrypted(hwdev, io_tlb_start); unsigned long flags; phys_addr_t tlb_addr; unsigned int nslots, stride, index, wrap; @@ -667,9 +668,8 @@ dma_addr_t swiotlb_map(struct 
device *dev, phys_addr_t paddr, size_t size, trace_swiotlb_bounced(dev, phys_to_dma(dev, paddr), size, swiotlb_force); - swiotlb_addr = swiotlb_tbl_map_single(dev, - phys_to_dma_unencrypted(dev, io_tlb_start), - paddr, size, size, dir, attrs); + swiotlb_addr = swiotlb_tbl_map_single(dev, paddr, size, size, dir, + attrs); if (swiotlb_addr == (phys_addr_t)DMA_MAPPING_ERROR) return DMA_MAPPING_ERROR; diff --git a/kernel/elfcore.c b/kernel/elfcore.c deleted file mode 100644 index 57fb4dcff434..000000000000 --- a/kernel/elfcore.c +++ /dev/null @@ -1,26 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -#include <linux/elf.h> -#include <linux/fs.h> -#include <linux/mm.h> -#include <linux/binfmts.h> -#include <linux/elfcore.h> - -Elf_Half __weak elf_core_extra_phdrs(void) -{ - return 0; -} - -int __weak elf_core_write_extra_phdrs(struct coredump_params *cprm, loff_t offset) -{ - return 1; -} - -int __weak elf_core_write_extra_data(struct coredump_params *cprm) -{ - return 1; -} - -size_t __weak elf_core_extra_data_size(void) -{ - return 0; -} diff --git a/kernel/entry/Makefile b/kernel/entry/Makefile index 34c8a3f1c735..095c775e001e 100644 --- a/kernel/entry/Makefile +++ b/kernel/entry/Makefile @@ -9,5 +9,5 @@ KCOV_INSTRUMENT := n CFLAGS_REMOVE_common.o = -fstack-protector -fstack-protector-strong CFLAGS_common.o += -fno-stack-protector -obj-$(CONFIG_GENERIC_ENTRY) += common.o +obj-$(CONFIG_GENERIC_ENTRY) += common.o syscall_user_dispatch.o obj-$(CONFIG_KVM_XFER_TO_GUEST_WORK) += kvm.o diff --git a/kernel/entry/common.c b/kernel/entry/common.c index 2b8366693d5c..378341642f94 100644 --- a/kernel/entry/common.c +++ b/kernel/entry/common.c @@ -2,23 +2,17 @@ #include <linux/context_tracking.h> #include <linux/entry-common.h> +#include <linux/highmem.h> #include <linux/livepatch.h> #include <linux/audit.h> + +#include "common.h" + #define CREATE_TRACE_POINTS #include <trace/events/syscalls.h> -/** - * enter_from_user_mode - Establish state when coming from user mode - * - * Syscall/interrupt entry disables interrupts, but user mode is traced as - * interrupts enabled. Also with NO_HZ_FULL RCU might be idle. - * - * 1) Tell lockdep that interrupts are disabled - * 2) Invoke context tracking if enabled to reactivate RCU - * 3) Trace interrupts off state - */ -static __always_inline void enter_from_user_mode(struct pt_regs *regs) +/* See comment for enter_from_user_mode() in entry-common.h */ +static __always_inline void __enter_from_user_mode(struct pt_regs *regs) { arch_check_user_regs(regs); lockdep_hardirqs_off(CALLER_ADDR0); @@ -31,6 +25,11 @@ static __always_inline void enter_from_user_mode(struct pt_regs *regs) instrumentation_end(); } +void noinstr enter_from_user_mode(struct pt_regs *regs) +{ + __enter_from_user_mode(regs); +} + static inline void syscall_enter_audit(struct pt_regs *regs, long syscall) { if (unlikely(audit_context())) { @@ -42,19 +41,29 @@ static inline void syscall_enter_audit(struct pt_regs *regs, long syscall) } static long syscall_trace_enter(struct pt_regs *regs, long syscall, - unsigned long ti_work) + unsigned long work) { long ret = 0; + /* + * Handle Syscall User Dispatch. This must come first, since + * the ABI here can be something that doesn't make sense for + * other syscall_work features.
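The work word tested here is new with CONFIG_GENERIC_ENTRY: syscall work items get dedicated SYSCALL_WORK_* bits in thread_info::syscall_work instead of sharing the TIF_* flag space, which is what the audit and seccomp hunks elsewhere in this series (set_task_syscall_work()/clear_task_syscall_work()) convert to. A sketch of the shape of the header side, abbreviated and hedged since only kernel/ is shown in this excerpt:

enum syscall_work_bit {
        SYSCALL_WORK_BIT_SECCOMP,
        SYSCALL_WORK_BIT_SYSCALL_TRACEPOINT,
        SYSCALL_WORK_BIT_SYSCALL_TRACE,
        SYSCALL_WORK_BIT_SYSCALL_EMU,
        SYSCALL_WORK_BIT_SYSCALL_AUDIT,
        SYSCALL_WORK_BIT_SYSCALL_USER_DISPATCH,
};

#define SYSCALL_WORK_SECCOMP    BIT(SYSCALL_WORK_BIT_SECCOMP)
/* ...one mask per bit, plus the SYSCALL_WORK_ENTER/SYSCALL_WORK_EXIT unions... */

#define set_syscall_work(fl) \
        set_bit(SYSCALL_WORK_BIT_##fl, &current_thread_info()->syscall_work)
#define clear_syscall_work(fl) \
        clear_bit(SYSCALL_WORK_BIT_##fl, &current_thread_info()->syscall_work)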
+ */ + if (work & SYSCALL_WORK_SYSCALL_USER_DISPATCH) { + if (syscall_user_dispatch(regs)) + return -1L; + } + /* Handle ptrace */ - if (ti_work & (_TIF_SYSCALL_TRACE | _TIF_SYSCALL_EMU)) { + if (work & (SYSCALL_WORK_SYSCALL_TRACE | SYSCALL_WORK_SYSCALL_EMU)) { ret = arch_syscall_enter_tracehook(regs); - if (ret || (ti_work & _TIF_SYSCALL_EMU)) + if (ret || (work & SYSCALL_WORK_SYSCALL_EMU)) return -1L; } /* Do seccomp after ptrace, to catch any tracer changes. */ - if (ti_work & _TIF_SECCOMP) { + if (work & SYSCALL_WORK_SECCOMP) { ret = __secure_computing(NULL); if (ret == -1L) return ret; @@ -63,7 +72,7 @@ static long syscall_trace_enter(struct pt_regs *regs, long syscall, /* Either of the above might have changed the syscall number */ syscall = syscall_get_nr(current, regs); - if (unlikely(ti_work & _TIF_SYSCALL_TRACEPOINT)) + if (unlikely(work & SYSCALL_WORK_SYSCALL_TRACEPOINT)) trace_sys_enter(regs, syscall); syscall_enter_audit(regs, syscall); @@ -74,11 +83,10 @@ static long syscall_trace_enter(struct pt_regs *regs, long syscall, static __always_inline long __syscall_enter_from_user_work(struct pt_regs *regs, long syscall) { - unsigned long ti_work; + unsigned long work = READ_ONCE(current_thread_info()->syscall_work); - ti_work = READ_ONCE(current_thread_info()->flags); - if (ti_work & SYSCALL_ENTER_WORK) - syscall = syscall_trace_enter(regs, syscall, ti_work); + if (work & SYSCALL_WORK_ENTER) + syscall = syscall_trace_enter(regs, syscall, work); return syscall; } @@ -92,7 +100,7 @@ noinstr long syscall_enter_from_user_mode(struct pt_regs *regs, long syscall) { long ret; - enter_from_user_mode(regs); + __enter_from_user_mode(regs); instrumentation_begin(); local_irq_enable(); @@ -104,25 +112,14 @@ noinstr long syscall_enter_from_user_mode(struct pt_regs *regs, long syscall) noinstr void syscall_enter_from_user_mode_prepare(struct pt_regs *regs) { - enter_from_user_mode(regs); + __enter_from_user_mode(regs); instrumentation_begin(); local_irq_enable(); instrumentation_end(); } -/** - * exit_to_user_mode - Fixup state when exiting to user mode - * - * Syscall/interupt exit enables interrupts, but the kernel state is - * interrupts disabled when this is invoked. Also tell RCU about it. - * - * 1) Trace interrupts on state - * 2) Invoke context tracking if enabled to adjust RCU state - * 3) Invoke architecture specific last minute exit code, e.g. speculation - * mitigations, etc. 
- * 4) Tell lockdep that interrupts are enabled - */ -static __always_inline void exit_to_user_mode(void) +/* See comment for exit_to_user_mode() in entry-common.h */ +static __always_inline void __exit_to_user_mode(void) { instrumentation_begin(); trace_hardirqs_on_prepare(); @@ -134,8 +131,21 @@ static __always_inline void exit_to_user_mode(void) lockdep_hardirqs_on(CALLER_ADDR0); } +void noinstr exit_to_user_mode(void) +{ + __exit_to_user_mode(); +} + /* Workaround to allow gradual conversion of architecture code */ -void __weak arch_do_signal(struct pt_regs *regs) { } +void __weak arch_do_signal_or_restart(struct pt_regs *regs, bool has_signal) { } + +static void handle_signal_work(struct pt_regs *regs, unsigned long ti_work) +{ + if (ti_work & _TIF_NOTIFY_SIGNAL) + tracehook_notify_signal(); + + arch_do_signal_or_restart(regs, ti_work & _TIF_SIGPENDING); +} static unsigned long exit_to_user_mode_loop(struct pt_regs *regs, unsigned long ti_work) @@ -157,8 +167,8 @@ static unsigned long exit_to_user_mode_loop(struct pt_regs *regs, if (ti_work & _TIF_PATCH_PENDING) klp_update_patch_state(current); - if (ti_work & _TIF_SIGPENDING) - arch_do_signal(regs); + if (ti_work & (_TIF_SIGPENDING | _TIF_NOTIFY_SIGNAL)) + handle_signal_work(regs, ti_work); if (ti_work & _TIF_NOTIFY_RESUME) { tracehook_notify_resume(regs); @@ -194,40 +204,56 @@ static void exit_to_user_mode_prepare(struct pt_regs *regs) /* Ensure that the address limit is intact and no locks are held */ addr_limit_user_check(); + kmap_assert_nomap(); lockdep_assert_irqs_disabled(); lockdep_sys_exit(); } #ifndef _TIF_SINGLESTEP -static inline bool report_single_step(unsigned long ti_work) +static inline bool report_single_step(unsigned long work) { return false; } #else /* - * If TIF_SYSCALL_EMU is set, then the only reason to report is when + * If SYSCALL_EMU is set, then the only reason to report is when * TIF_SINGLESTEP is set (i.e. PTRACE_SYSEMU_SINGLESTEP). This syscall * instruction has been already reported in syscall_enter_from_user_mode(). */ -#define SYSEMU_STEP (_TIF_SINGLESTEP | _TIF_SYSCALL_EMU) - -static inline bool report_single_step(unsigned long ti_work) +static inline bool report_single_step(unsigned long work) { - return (ti_work & SYSEMU_STEP) == _TIF_SINGLESTEP; + if (!(work & SYSCALL_WORK_SYSCALL_EMU)) + return false; + + return !!(current_thread_info()->flags & _TIF_SINGLESTEP); } #endif -static void syscall_exit_work(struct pt_regs *regs, unsigned long ti_work) + +static void syscall_exit_work(struct pt_regs *regs, unsigned long work) { bool step; + /* + * If the syscall was rolled back due to syscall user dispatching, + * then the tracers below are not invoked for the same reason as + * the entry side was not invoked in syscall_trace_enter(): The ABI + * of these syscalls is unknown. 
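report_single_step() above encodes the PTRACE_SYSEMU contract: with SYSCALL_WORK_SYSCALL_EMU set the syscall was suppressed and already reported at entry, so the exit side only reports when TIF_SINGLESTEP is also set (PTRACE_SYSEMU_SINGLESTEP). For context, a minimal user-space tracer exercising that request (hedged sketch; error handling omitted):

#include <signal.h>
#include <sys/ptrace.h>
#include <sys/types.h>
#include <sys/wait.h>
#include <unistd.h>

int main(void)
{
        pid_t pid = fork();

        if (pid == 0) {
                ptrace(PTRACE_TRACEME, 0, 0, 0);
                raise(SIGSTOP);         /* hand control to the tracer */
                getpid();               /* will be emulated away, not run */
                _exit(0);
        }

        waitpid(pid, NULL, 0);                  /* child's SIGSTOP */
        ptrace(PTRACE_SYSEMU, pid, 0, 0);       /* run to next syscall entry */
        waitpid(pid, NULL, 0);                  /* entry stop; no exit stop */
        ptrace(PTRACE_CONT, pid, 0, 0);         /* skip the syscall, resume */
        waitpid(pid, NULL, 0);
        return 0;
}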
+ */ + if (work & SYSCALL_WORK_SYSCALL_USER_DISPATCH) { + if (unlikely(current->syscall_dispatch.on_dispatch)) { + current->syscall_dispatch.on_dispatch = false; + return; + } + } + audit_syscall_exit(regs); - if (ti_work & _TIF_SYSCALL_TRACEPOINT) + if (work & SYSCALL_WORK_SYSCALL_TRACEPOINT) trace_sys_exit(regs, syscall_get_return_value(current, regs)); - step = report_single_step(ti_work); - if (step || ti_work & _TIF_SYSCALL_TRACE) + step = report_single_step(work); + if (step || work & SYSCALL_WORK_SYSCALL_TRACE) arch_syscall_exit_tracehook(regs, step); } @@ -237,7 +263,7 @@ static void syscall_exit_work(struct pt_regs *regs, unsigned long ti_work) */ static void syscall_exit_to_user_mode_prepare(struct pt_regs *regs) { - u32 cached_flags = READ_ONCE(current_thread_info()->flags); + unsigned long work = READ_ONCE(current_thread_info()->syscall_work); unsigned long nr = syscall_get_nr(current, regs); CT_WARN_ON(ct_state() != CONTEXT_KERNEL); @@ -254,23 +280,33 @@ static void syscall_exit_to_user_mode_prepare(struct pt_regs *regs) * enabled, we want to run them exactly once per syscall exit with * interrupts enabled. */ - if (unlikely(cached_flags & SYSCALL_EXIT_WORK)) - syscall_exit_work(regs, cached_flags); + if (unlikely(work & SYSCALL_WORK_EXIT)) + syscall_exit_work(regs, work); } -__visible noinstr void syscall_exit_to_user_mode(struct pt_regs *regs) +static __always_inline void __syscall_exit_to_user_mode_work(struct pt_regs *regs) { - instrumentation_begin(); syscall_exit_to_user_mode_prepare(regs); local_irq_disable_exit_to_user(); exit_to_user_mode_prepare(regs); +} + +void syscall_exit_to_user_mode_work(struct pt_regs *regs) +{ + __syscall_exit_to_user_mode_work(regs); +} + +__visible noinstr void syscall_exit_to_user_mode(struct pt_regs *regs) +{ + instrumentation_begin(); + __syscall_exit_to_user_mode_work(regs); instrumentation_end(); - exit_to_user_mode(); + __exit_to_user_mode(); } noinstr void irqentry_enter_from_user_mode(struct pt_regs *regs) { - enter_from_user_mode(regs); + __enter_from_user_mode(regs); } noinstr void irqentry_exit_to_user_mode(struct pt_regs *regs) @@ -278,7 +314,7 @@ noinstr void irqentry_exit_to_user_mode(struct pt_regs *regs) instrumentation_begin(); exit_to_user_mode_prepare(regs); instrumentation_end(); - exit_to_user_mode(); + __exit_to_user_mode(); } noinstr irqentry_state_t irqentry_enter(struct pt_regs *regs) @@ -296,7 +332,7 @@ noinstr irqentry_state_t irqentry_enter(struct pt_regs *regs) * If this entry hit the idle task invoke rcu_irq_enter() whether * RCU is watching or not. * - * Interupts can nest when the first interrupt invokes softirq + * Interrupts can nest when the first interrupt invokes softirq * processing on return which enables interrupts. * * Scheduler ticks in the idle task can mark quiescent state and @@ -307,7 +343,7 @@ noinstr irqentry_state_t irqentry_enter(struct pt_regs *regs) * interrupt to invoke rcu_irq_enter(). If that nested interrupt is * the tick then rcu_flavor_sched_clock_irq() would wrongfully * assume that it is the first interupt and eventually claim - * quiescient state and end grace periods prematurely. + * quiescent state and end grace periods prematurely. * * Unconditionally invoke rcu_irq_enter() so RCU state stays * consistent. @@ -319,7 +355,7 @@ noinstr irqentry_state_t irqentry_enter(struct pt_regs *regs) /* * If RCU is not watching then the same careful * sequence vs. lockdep and tracing is required - * as in irq_enter_from_user_mode(). + * as in irqentry_enter_from_user_mode(). 
*/ lockdep_hardirqs_off(CALLER_ADDR0); rcu_irq_enter(); @@ -337,10 +373,10 @@ noinstr irqentry_state_t irqentry_enter(struct pt_regs *regs) * already contains a warning when RCU is not watching, so no point * in having another one here. */ + lockdep_hardirqs_off(CALLER_ADDR0); instrumentation_begin(); rcu_irq_enter_check_tick(); - /* Use the combo lockdep/tracing function */ - trace_hardirqs_off(); + trace_hardirqs_off_finish(); instrumentation_end(); return ret; @@ -397,3 +433,39 @@ noinstr void irqentry_exit(struct pt_regs *regs, irqentry_state_t state) rcu_irq_exit(); } } + +irqentry_state_t noinstr irqentry_nmi_enter(struct pt_regs *regs) +{ + irqentry_state_t irq_state; + + irq_state.lockdep = lockdep_hardirqs_enabled(); + + __nmi_enter(); + lockdep_hardirqs_off(CALLER_ADDR0); + lockdep_hardirq_enter(); + rcu_nmi_enter(); + + instrumentation_begin(); + trace_hardirqs_off_finish(); + ftrace_nmi_enter(); + instrumentation_end(); + + return irq_state; +} + +void noinstr irqentry_nmi_exit(struct pt_regs *regs, irqentry_state_t irq_state) +{ + instrumentation_begin(); + ftrace_nmi_exit(); + if (irq_state.lockdep) { + trace_hardirqs_on_prepare(); + lockdep_hardirqs_on_prepare(CALLER_ADDR0); + } + instrumentation_end(); + + rcu_nmi_exit(); + lockdep_hardirq_exit(); + if (irq_state.lockdep) + lockdep_hardirqs_on(CALLER_ADDR0); + __nmi_exit(); +} diff --git a/kernel/entry/common.h b/kernel/entry/common.h new file mode 100644 index 000000000000..f6e6d02f07fe --- /dev/null +++ b/kernel/entry/common.h @@ -0,0 +1,7 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _COMMON_H +#define _COMMON_H + +bool syscall_user_dispatch(struct pt_regs *regs); + +#endif diff --git a/kernel/entry/kvm.c b/kernel/entry/kvm.c index b6678a5e3cf6..49972ee99aff 100644 --- a/kernel/entry/kvm.c +++ b/kernel/entry/kvm.c @@ -8,6 +8,9 @@ static int xfer_to_guest_mode_work(struct kvm_vcpu *vcpu, unsigned long ti_work) do { int ret; + if (ti_work & _TIF_NOTIFY_SIGNAL) + tracehook_notify_signal(); + if (ti_work & _TIF_SIGPENDING) { kvm_handle_signal_exit(vcpu); return -EINTR; diff --git a/kernel/entry/syscall_user_dispatch.c b/kernel/entry/syscall_user_dispatch.c new file mode 100644 index 000000000000..b0338a5625d9 --- /dev/null +++ b/kernel/entry/syscall_user_dispatch.c @@ -0,0 +1,104 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (C) 2020 Collabora Ltd. + */ +#include <linux/sched.h> +#include <linux/prctl.h> +#include <linux/syscall_user_dispatch.h> +#include <linux/uaccess.h> +#include <linux/signal.h> +#include <linux/elf.h> + +#include <linux/sched/signal.h> +#include <linux/sched/task_stack.h> + +#include <asm/syscall.h> + +#include "common.h" + +static void trigger_sigsys(struct pt_regs *regs) +{ + struct kernel_siginfo info; + + clear_siginfo(&info); + info.si_signo = SIGSYS; + info.si_code = SYS_USER_DISPATCH; + info.si_call_addr = (void __user *)KSTK_EIP(current); + info.si_errno = 0; + info.si_arch = syscall_get_arch(current); + info.si_syscall = syscall_get_nr(current, regs); + + force_sig_info(&info); +} + +bool syscall_user_dispatch(struct pt_regs *regs) +{ + struct syscall_user_dispatch *sd = ¤t->syscall_dispatch; + char state; + + if (likely(instruction_pointer(regs) - sd->offset < sd->len)) + return false; + + if (unlikely(arch_syscall_is_vdso_sigreturn(regs))) + return false; + + if (likely(sd->selector)) { + /* + * access_ok() is performed once, at prctl time, when + * the selector is loaded by userspace. 
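Everything in this file is driven from user space via prctl(); the PR_SET_SYSCALL_USER_DISPATCH plumbing itself lands in kernel/sys.c outside this excerpt. A hedged usage sketch (the constants mirror the new UAPI; dispatcher_start/dispatcher_end are made-up symbols bounding the one region still allowed to issue raw syscalls):

#include <sys/prctl.h>

#ifndef PR_SET_SYSCALL_USER_DISPATCH
#define PR_SET_SYSCALL_USER_DISPATCH    59
#define PR_SYS_DISPATCH_OFF             0
#define PR_SYS_DISPATCH_ON              1
#endif

/* syscall_user_dispatch() above __get_user()s this byte on every
 * syscall, so flipping the mode is a single store from user space. */
static volatile char selector = PR_SYS_DISPATCH_OFF;

extern char dispatcher_start[], dispatcher_end[];       /* hypothetical */

static int enable_dispatch(void)
{
        if (prctl(PR_SET_SYSCALL_USER_DISPATCH, PR_SYS_DISPATCH_ON,
                  (unsigned long)dispatcher_start,
                  (unsigned long)(dispatcher_end - dispatcher_start),
                  &selector))
                return -1;

        selector = PR_SYS_DISPATCH_ON;  /* SIGSYS for syscalls outside the region */
        return 0;
}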
+ */ + if (unlikely(__get_user(state, sd->selector))) + do_exit(SIGSEGV); + + if (likely(state == PR_SYS_DISPATCH_OFF)) + return false; + + if (state != PR_SYS_DISPATCH_ON) + do_exit(SIGSYS); + } + + sd->on_dispatch = true; + syscall_rollback(current, regs); + trigger_sigsys(regs); + + return true; +} + +int set_syscall_user_dispatch(unsigned long mode, unsigned long offset, + unsigned long len, char __user *selector) +{ + switch (mode) { + case PR_SYS_DISPATCH_OFF: + if (offset || len || selector) + return -EINVAL; + break; + case PR_SYS_DISPATCH_ON: + /* + * Validate the direct dispatcher region just for basic + * sanity against overflow and a 0-sized dispatcher + * region. If the user is able to submit a syscall from + * an address, that address is obviously valid. + */ + if (offset && offset + len <= offset) + return -EINVAL; + + if (selector && !access_ok(selector, sizeof(*selector))) + return -EFAULT; + + break; + default: + return -EINVAL; + } + + current->syscall_dispatch.selector = selector; + current->syscall_dispatch.offset = offset; + current->syscall_dispatch.len = len; + current->syscall_dispatch.on_dispatch = false; + + if (mode == PR_SYS_DISPATCH_ON) + set_syscall_work(SYSCALL_USER_DISPATCH); + else + clear_syscall_work(SYSCALL_USER_DISPATCH); + + return 0; +} diff --git a/kernel/events/core.c b/kernel/events/core.c index da467e1dd49a..19ae6c931c52 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -51,6 +51,8 @@ #include <linux/proc_ns.h> #include <linux/mount.h> #include <linux/min_heap.h> +#include <linux/highmem.h> +#include <linux/pgtable.h> #include "internal.h" @@ -1895,6 +1897,12 @@ static void __perf_event_header_size(struct perf_event *event, u64 sample_type) if (sample_type & PERF_SAMPLE_CGROUP) size += sizeof(data->cgroup); + if (sample_type & PERF_SAMPLE_DATA_PAGE_SIZE) + size += sizeof(data->data_page_size); + + if (sample_type & PERF_SAMPLE_CODE_PAGE_SIZE) + size += sizeof(data->code_page_size); + event->header_size = size; } @@ -2312,9 +2320,6 @@ group_sched_out(struct perf_event *group_event, event_sched_out(event, cpuctx, ctx); perf_pmu_enable(ctx->pmu); - - if (group_event->attr.exclusive) - cpuctx->exclusive = 0; } #define DETACH_GROUP 0x01UL @@ -2583,11 +2588,8 @@ group_sched_in(struct perf_event *group_event, pmu->start_txn(pmu, PERF_PMU_TXN_ADD); - if (event_sched_in(group_event, cpuctx, ctx)) { - pmu->cancel_txn(pmu); - perf_mux_hrtimer_restart(cpuctx); - return -EAGAIN; - } + if (event_sched_in(group_event, cpuctx, ctx)) + goto error; /* * Schedule in siblings as one group (if any): @@ -2616,10 +2618,8 @@ group_error: } event_sched_out(group_event, cpuctx, ctx); +error: pmu->cancel_txn(pmu); - - perf_mux_hrtimer_restart(cpuctx); - return -EAGAIN; } @@ -2645,7 +2645,7 @@ static int group_can_go_on(struct perf_event *event, * If this group is exclusive and there are already * events on the CPU, it can't go on. 
*/ - if (event->attr.exclusive && cpuctx->active_oncpu) + if (event->attr.exclusive && !list_empty(get_event_list(event))) return 0; /* * Otherwise, try to add it if all previous groups were able @@ -3679,6 +3679,7 @@ static int merge_sched_in(struct perf_event *event, void *data) *can_add_hw = 0; ctx->rotate_necessary = 1; + perf_mux_hrtimer_restart(cpuctx); } return 0; @@ -6374,14 +6375,13 @@ perf_output_sample_regs(struct perf_output_handle *handle, } static void perf_sample_regs_user(struct perf_regs *regs_user, - struct pt_regs *regs, - struct pt_regs *regs_user_copy) + struct pt_regs *regs) { if (user_mode(regs)) { regs_user->abi = perf_reg_abi(current); regs_user->regs = regs; } else if (!(current->flags & PF_KTHREAD)) { - perf_get_regs_user(regs_user, regs, regs_user_copy); + perf_get_regs_user(regs_user, regs); } else { regs_user->abi = PERF_SAMPLE_REGS_ABI_NONE; regs_user->regs = NULL; @@ -6939,6 +6939,12 @@ void perf_output_sample(struct perf_output_handle *handle, if (sample_type & PERF_SAMPLE_CGROUP) perf_output_put(handle, data->cgroup); + if (sample_type & PERF_SAMPLE_DATA_PAGE_SIZE) + perf_output_put(handle, data->data_page_size); + + if (sample_type & PERF_SAMPLE_CODE_PAGE_SIZE) + perf_output_put(handle, data->code_page_size); + if (sample_type & PERF_SAMPLE_AUX) { perf_output_put(handle, data->aux_size); @@ -6996,6 +7002,93 @@ static u64 perf_virt_to_phys(u64 virt) return phys_addr; } +/* + * Return the pagetable size of a given virtual address. + */ +static u64 perf_get_pgtable_size(struct mm_struct *mm, unsigned long addr) +{ + u64 size = 0; + +#ifdef CONFIG_HAVE_FAST_GUP + pgd_t *pgdp, pgd; + p4d_t *p4dp, p4d; + pud_t *pudp, pud; + pmd_t *pmdp, pmd; + pte_t *ptep, pte; + + pgdp = pgd_offset(mm, addr); + pgd = READ_ONCE(*pgdp); + if (pgd_none(pgd)) + return 0; + + if (pgd_leaf(pgd)) + return pgd_leaf_size(pgd); + + p4dp = p4d_offset_lockless(pgdp, pgd, addr); + p4d = READ_ONCE(*p4dp); + if (!p4d_present(p4d)) + return 0; + + if (p4d_leaf(p4d)) + return p4d_leaf_size(p4d); + + pudp = pud_offset_lockless(p4dp, p4d, addr); + pud = READ_ONCE(*pudp); + if (!pud_present(pud)) + return 0; + + if (pud_leaf(pud)) + return pud_leaf_size(pud); + + pmdp = pmd_offset_lockless(pudp, pud, addr); + pmd = READ_ONCE(*pmdp); + if (!pmd_present(pmd)) + return 0; + + if (pmd_leaf(pmd)) + return pmd_leaf_size(pmd); + + ptep = pte_offset_map(&pmd, addr); + pte = ptep_get_lockless(ptep); + if (pte_present(pte)) + size = pte_leaf_size(pte); + pte_unmap(ptep); +#endif /* CONFIG_HAVE_FAST_GUP */ + + return size; +} + +static u64 perf_get_page_size(unsigned long addr) +{ + struct mm_struct *mm; + unsigned long flags; + u64 size; + + if (!addr) + return 0; + + /* + * Software page-table walkers must disable IRQs, + * which prevents any tear down of the page tables. + */ + local_irq_save(flags); + + mm = current->mm; + if (!mm) { + /* + * For kernel threads and the like, use init_mm so that + * we can find kernel memory. 
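On the user-space side the two new sample types are requested like any others; a hedged sketch (raw syscall since perf_event_open() has no glibc wrapper; PERF_SAMPLE_ADDR is included, matching the dependency perf_prepare_sample() calls out just below):

#include <linux/perf_event.h>
#include <sys/syscall.h>
#include <unistd.h>

static int open_page_size_sampling(void)
{
        struct perf_event_attr attr = {
                .type           = PERF_TYPE_HARDWARE,
                .size           = sizeof(attr),
                .config         = PERF_COUNT_HW_CPU_CYCLES,
                .sample_period  = 100000,
                .sample_type    = PERF_SAMPLE_IP | PERF_SAMPLE_ADDR |
                                  PERF_SAMPLE_DATA_PAGE_SIZE |
                                  PERF_SAMPLE_CODE_PAGE_SIZE,
                .disabled       = 1,
        };

        /* this task, any CPU */
        return syscall(__NR_perf_event_open, &attr, 0, -1, -1, 0);
}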
+ */ + mm = &init_mm; + } + + size = perf_get_pgtable_size(mm, addr); + + local_irq_restore(flags); + + return size; +} + static struct perf_callchain_entry __empty_callchain = { .nr = 0, }; struct perf_callchain_entry * @@ -7031,7 +7124,7 @@ void perf_prepare_sample(struct perf_event_header *header, __perf_event_header__init_id(header, data, event); - if (sample_type & PERF_SAMPLE_IP) + if (sample_type & (PERF_SAMPLE_IP | PERF_SAMPLE_CODE_PAGE_SIZE)) data->ip = perf_instruction_pointer(regs); if (sample_type & PERF_SAMPLE_CALLCHAIN) { @@ -7083,8 +7176,7 @@ void perf_prepare_sample(struct perf_event_header *header, } if (sample_type & (PERF_SAMPLE_REGS_USER | PERF_SAMPLE_STACK_USER)) - perf_sample_regs_user(&data->regs_user, regs, - &data->regs_user_copy); + perf_sample_regs_user(&data->regs_user, regs); if (sample_type & PERF_SAMPLE_REGS_USER) { /* regs dump ABI info */ @@ -7151,6 +7243,17 @@ void perf_prepare_sample(struct perf_event_header *header, } #endif + /* + * PERF_SAMPLE_DATA_PAGE_SIZE requires PERF_SAMPLE_ADDR. If the user + * doesn't require PERF_SAMPLE_ADDR, the kernel implicitly retrieves + * data->addr, but the value will not be dumped to userspace. + */ + if (sample_type & PERF_SAMPLE_DATA_PAGE_SIZE) + data->data_page_size = perf_get_page_size(data->addr); + + if (sample_type & PERF_SAMPLE_CODE_PAGE_SIZE) + data->code_page_size = perf_get_page_size(data->ip); + if (sample_type & PERF_SAMPLE_AUX) { u64 size; @@ -7186,6 +7289,7 @@ __perf_event_output(struct perf_event *event, struct perf_sample_data *data, struct pt_regs *regs, int (*output_begin)(struct perf_output_handle *, + struct perf_sample_data *, struct perf_event *, unsigned int)) { @@ -7198,7 +7302,7 @@ __perf_event_output(struct perf_event *event, perf_prepare_sample(&header, data, event, regs); - err = output_begin(&handle, event, header.size); + err = output_begin(&handle, data, event, header.size); if (err) goto exit; @@ -7264,7 +7368,7 @@ perf_event_read_event(struct perf_event *event, int ret; perf_event_header__init_id(&read_event.header, &sample, event); - ret = perf_output_begin(&handle, event, read_event.header.size); + ret = perf_output_begin(&handle, &sample, event, read_event.header.size); if (ret) return; @@ -7533,7 +7637,7 @@ static void perf_event_task_output(struct perf_event *event, perf_event_header__init_id(&task_event->event_id.header, &sample, event); - ret = perf_output_begin(&handle, event, + ret = perf_output_begin(&handle, &sample, event, task_event->event_id.header.size); if (ret) goto out; @@ -7636,7 +7740,7 @@ static void perf_event_comm_output(struct perf_event *event, return; perf_event_header__init_id(&comm_event->event_id.header, &sample, event); - ret = perf_output_begin(&handle, event, + ret = perf_output_begin(&handle, &sample, event, comm_event->event_id.header.size); if (ret) @@ -7736,7 +7840,7 @@ static void perf_event_namespaces_output(struct perf_event *event, perf_event_header__init_id(&namespaces_event->event_id.header, &sample, event); - ret = perf_output_begin(&handle, event, + ret = perf_output_begin(&handle, &sample, event, namespaces_event->event_id.header.size); if (ret) goto out; @@ -7863,7 +7967,7 @@ static void perf_event_cgroup_output(struct perf_event *event, void *data) perf_event_header__init_id(&cgroup_event->event_id.header, &sample, event); - ret = perf_output_begin(&handle, event, + ret = perf_output_begin(&handle, &sample, event, cgroup_event->event_id.header.size); if (ret) goto out; @@ -7989,7 +8093,7 @@ static void perf_event_mmap_output(struct perf_event
*event, } perf_event_header__init_id(&mmap_event->event_id.header, &sample, event); - ret = perf_output_begin(&handle, event, + ret = perf_output_begin(&handle, &sample, event, mmap_event->event_id.header.size); if (ret) goto out; @@ -8299,7 +8403,7 @@ void perf_event_aux_event(struct perf_event *event, unsigned long head, int ret; perf_event_header__init_id(&rec.header, &sample, event); - ret = perf_output_begin(&handle, event, rec.header.size); + ret = perf_output_begin(&handle, &sample, event, rec.header.size); if (ret) return; @@ -8333,7 +8437,7 @@ void perf_log_lost_samples(struct perf_event *event, u64 lost) perf_event_header__init_id(&lost_samples_event.header, &sample, event); - ret = perf_output_begin(&handle, event, + ret = perf_output_begin(&handle, &sample, event, lost_samples_event.header.size); if (ret) return; @@ -8388,7 +8492,7 @@ static void perf_event_switch_output(struct perf_event *event, void *data) perf_event_header__init_id(&se->event_id.header, &sample, event); - ret = perf_output_begin(&handle, event, se->event_id.header.size); + ret = perf_output_begin(&handle, &sample, event, se->event_id.header.size); if (ret) return; @@ -8463,7 +8567,7 @@ static void perf_log_throttle(struct perf_event *event, int enable) perf_event_header__init_id(&throttle_event.header, &sample, event); - ret = perf_output_begin(&handle, event, + ret = perf_output_begin(&handle, &sample, event, throttle_event.header.size); if (ret) return; @@ -8506,7 +8610,7 @@ static void perf_event_ksymbol_output(struct perf_event *event, void *data) perf_event_header__init_id(&ksymbol_event->event_id.header, &sample, event); - ret = perf_output_begin(&handle, event, + ret = perf_output_begin(&handle, &sample, event, ksymbol_event->event_id.header.size); if (ret) return; @@ -8596,7 +8700,7 @@ static void perf_event_bpf_output(struct perf_event *event, void *data) perf_event_header__init_id(&bpf_event->event_id.header, &sample, event); - ret = perf_output_begin(&handle, event, + ret = perf_output_begin(&handle, data, event, bpf_event->event_id.header.size); if (ret) return; @@ -8705,7 +8809,8 @@ static void perf_event_text_poke_output(struct perf_event *event, void *data) perf_event_header__init_id(&text_poke_event->event_id.header, &sample, event); - ret = perf_output_begin(&handle, event, text_poke_event->event_id.header.size); + ret = perf_output_begin(&handle, &sample, event, + text_poke_event->event_id.header.size); if (ret) return; @@ -8786,7 +8891,7 @@ static void perf_log_itrace_start(struct perf_event *event) rec.tid = perf_event_tid(event, current); perf_event_header__init_id(&rec.header, &sample, event); - ret = perf_output_begin(&handle, event, rec.header.size); + ret = perf_output_begin(&handle, &sample, event, rec.header.size); if (ret) return; @@ -10085,6 +10190,7 @@ perf_event_parse_addr_filter(struct perf_event *event, char *fstr, if (token == IF_SRC_FILE || token == IF_SRC_FILEADDR) { int fpos = token == IF_SRC_FILE ? 
2 : 1; + kfree(filename); filename = match_strdup(&args[fpos]); if (!filename) { ret = -ENOMEM; @@ -10131,16 +10237,13 @@ perf_event_parse_addr_filter(struct perf_event *event, char *fstr, */ ret = -EOPNOTSUPP; if (!event->ctx->task) - goto fail_free_name; + goto fail; /* look up the path and grab its inode */ ret = kern_path(filename, LOOKUP_FOLLOW, &filter->path); if (ret) - goto fail_free_name; - - kfree(filename); - filename = NULL; + goto fail; ret = -EINVAL; if (!filter->path.dentry || @@ -10160,13 +10263,13 @@ perf_event_parse_addr_filter(struct perf_event *event, char *fstr, if (state != IF_STATE_ACTION) goto fail; + kfree(filename); kfree(orig); return 0; -fail_free_name: - kfree(filename); fail: + kfree(filename); free_filters_list(filters); kfree(orig); @@ -11729,24 +11832,6 @@ SYSCALL_DEFINE5(perf_event_open, goto err_task; } - if (task) { - err = mutex_lock_interruptible(&task->signal->exec_update_mutex); - if (err) - goto err_task; - - /* - * Preserve ptrace permission check for backwards compatibility. - * - * We must hold exec_update_mutex across this and any potential - * perf_install_in_context() call for this new event to - * serialize against exec() altering our credentials (and the - * perf_event_exit_task() that could imply). - */ - err = -EACCES; - if (!perfmon_capable() && !ptrace_may_access(task, PTRACE_MODE_READ_REALCREDS)) - goto err_cred; - } - if (flags & PERF_FLAG_PID_CGROUP) cgroup_fd = pid; @@ -11754,7 +11839,7 @@ SYSCALL_DEFINE5(perf_event_open, NULL, NULL, cgroup_fd); if (IS_ERR(event)) { err = PTR_ERR(event); - goto err_cred; + goto err_task; } if (is_sampling_event(event)) { @@ -11873,6 +11958,24 @@ SYSCALL_DEFINE5(perf_event_open, goto err_context; } + if (task) { + err = mutex_lock_interruptible(&task->signal->exec_update_mutex); + if (err) + goto err_file; + + /* + * Preserve ptrace permission check for backwards compatibility. + * + * We must hold exec_update_mutex across this and any potential + * perf_install_in_context() call for this new event to + * serialize against exec() altering our credentials (and the + * perf_event_exit_task() that could imply). 
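The filter strings this parser consumes are installed through the PERF_EVENT_IOC_SET_FILTER ioctl. A hedged sketch of the call the fixed kfree() path sits underneath (fd is assumed to be an event supporting address filters, e.g. Intel PT; the range and path are placeholders):

#include <linux/perf_event.h>
#include <sys/ioctl.h>

/* "filter <start>/<size>@<object>" restricts tracing to that range of
 * the named file; the filename handling of exactly this form is what
 * the hunk above repairs. */
static int set_addr_filter(int fd)
{
        return ioctl(fd, PERF_EVENT_IOC_SET_FILTER,
                     "filter 0x1000/0x2000@/usr/bin/placeholder");
}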
*/ + err = -EACCES; + if (!perfmon_capable() && !ptrace_may_access(task, PTRACE_MODE_READ_REALCREDS)) + goto err_cred; + } + if (move_group) { gctx = __perf_event_ctx_lock_double(group_leader, ctx); @@ -12048,7 +12151,10 @@ err_locked: if (move_group) perf_event_ctx_unlock(group_leader, gctx); mutex_unlock(&ctx->mutex); -/* err_file: */ +err_cred: + if (task) + mutex_unlock(&task->signal->exec_update_mutex); +err_file: fput(event_file); err_context: perf_unpin_context(ctx); @@ -12060,9 +12166,6 @@ err_alloc: */ if (!event_file) free_event(event); -err_cred: - if (task) - mutex_unlock(&task->signal->exec_update_mutex); err_task: if (task) put_task_struct(task); diff --git a/kernel/events/internal.h b/kernel/events/internal.h index fcbf5616a441..228801e20788 100644 --- a/kernel/events/internal.h +++ b/kernel/events/internal.h @@ -205,16 +205,12 @@ DEFINE_OUTPUT_COPY(__output_copy_user, arch_perf_out_copy_user) static inline int get_recursion_context(int *recursion) { - int rctx; - - if (unlikely(in_nmi())) - rctx = 3; - else if (in_irq()) - rctx = 2; - else if (in_softirq()) - rctx = 1; - else - rctx = 0; + unsigned int pc = preempt_count(); + unsigned char rctx = 0; + + rctx += !!(pc & (NMI_MASK)); + rctx += !!(pc & (NMI_MASK | HARDIRQ_MASK)); + rctx += !!(pc & (NMI_MASK | HARDIRQ_MASK | SOFTIRQ_OFFSET)); if (recursion[rctx]) return -1; diff --git a/kernel/events/ring_buffer.c b/kernel/events/ring_buffer.c index 192b8abc6330..ef91ae75ca56 100644 --- a/kernel/events/ring_buffer.c +++ b/kernel/events/ring_buffer.c @@ -147,6 +147,7 @@ ring_buffer_has_space(unsigned long head, unsigned long tail, static __always_inline int __perf_output_begin(struct perf_output_handle *handle, + struct perf_sample_data *data, struct perf_event *event, unsigned int size, bool backward) { @@ -237,18 +238,16 @@ __perf_output_begin(struct perf_output_handle *handle, handle->size = (1UL << page_shift) - offset; if (unlikely(have_lost)) { - struct perf_sample_data sample_data; - lost_event.header.size = sizeof(lost_event); lost_event.header.type = PERF_RECORD_LOST; lost_event.header.misc = 0; lost_event.id = event->id; lost_event.lost = local_xchg(&rb->lost, 0); - perf_event_header__init_id(&lost_event.header, - &sample_data, event); + /* XXX mostly redundant; @data is already fully initialized */ + perf_event_header__init_id(&lost_event.header, data, event); perf_output_put(handle, lost_event); - perf_event__output_id_sample(event, handle, &sample_data); + perf_event__output_id_sample(event, handle, data); } return 0; @@ -263,22 +262,25 @@ out: } int perf_output_begin_forward(struct perf_output_handle *handle, - struct perf_event *event, unsigned int size) + struct perf_sample_data *data, + struct perf_event *event, unsigned int size) { - return __perf_output_begin(handle, event, size, false); + return __perf_output_begin(handle, data, event, size, false); } int perf_output_begin_backward(struct perf_output_handle *handle, + struct perf_sample_data *data, struct perf_event *event, unsigned int size) { - return __perf_output_begin(handle, event, size, true); + return __perf_output_begin(handle, data, event, size, true); } int perf_output_begin(struct perf_output_handle *handle, + struct perf_sample_data *data, struct perf_event *event, unsigned int size) { - return __perf_output_begin(handle, event, size, + return __perf_output_begin(handle, data, event, size, unlikely(is_write_backward(event))); } diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index 00b0358739ab..bf9edd8d75be 100644 ---
a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -1973,7 +1973,7 @@ bool uprobe_deny_signal(void) WARN_ON_ONCE(utask->state != UTASK_SSTEP); - if (signal_pending(t)) { + if (task_sigpending(t)) { spin_lock_irq(&t->sighand->siglock); clear_tsk_thread_flag(t, TIF_SIGPENDING); spin_unlock_irq(&t->sighand->siglock); diff --git a/kernel/exit.c b/kernel/exit.c index 87a2d515de0d..3594291a8542 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -454,7 +454,10 @@ static void exit_mm(void) mmap_read_unlock(mm); self.task = current; - self.next = xchg(&core_state->dumper.next, &self); + if (self.task->flags & PF_SIGNALED) + self.next = xchg(&core_state->dumper.next, &self); + else + self.task = NULL; /* * Implies mb(), the result of xchg() must be visible * to core_state->dumper. @@ -475,10 +478,24 @@ static void exit_mm(void) BUG_ON(mm != current->active_mm); /* more a memory barrier than a real lock */ task_lock(current); + /* + * When a thread stops operating on an address space, the loop + * in membarrier_private_expedited() may not observe that + * tsk->mm, and the loop in membarrier_global_expedited() may + * not observe a MEMBARRIER_STATE_GLOBAL_EXPEDITED + * rq->membarrier_state, so those would not issue an IPI. + * Membarrier requires a memory barrier after accessing + * user-space memory, before clearing tsk->mm or the + * rq->membarrier_state. + */ + smp_mb__after_spinlock(); + local_irq_disable(); current->mm = NULL; - mmap_read_unlock(mm); + membarrier_update_current_mm(NULL); enter_lazy_tlb(mm, current); + local_irq_enable(); task_unlock(current); + mmap_read_unlock(mm); mm_update_next_owner(mm); mmput(mm); if (test_thread_flag(TIF_MEMDIE)) diff --git a/kernel/fail_function.c b/kernel/fail_function.c index 63b349168da7..b0b1ad93fa95 100644 --- a/kernel/fail_function.c +++ b/kernel/fail_function.c @@ -253,7 +253,7 @@ static ssize_t fei_write(struct file *file, const char __user *buffer, if (copy_from_user(buf, buffer, count)) { ret = -EFAULT; - goto out; + goto out_free; } buf[count] = '\0'; sym = strstrip(buf); @@ -307,8 +307,9 @@ static ssize_t fei_write(struct file *file, const char __user *buffer, ret = count; } out: - kfree(buf); mutex_unlock(&fei_lock); +out_free: + kfree(buf); return ret; } diff --git a/kernel/fork.c b/kernel/fork.c index 837b546528c8..4f44d87b82ef 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -385,7 +385,7 @@ static void account_kernel_stack(struct task_struct *tsk, int account) mod_lruvec_page_state(vm->pages[0], NR_KERNEL_STACK_KB, account * (THREAD_SIZE / 1024)); else - mod_lruvec_slab_state(stack, NR_KERNEL_STACK_KB, + mod_lruvec_kmem_state(stack, NR_KERNEL_STACK_KB, account * (THREAD_SIZE / 1024)); } @@ -404,9 +404,10 @@ static int memcg_charge_kernel_stack(struct task_struct *tsk) for (i = 0; i < THREAD_SIZE / PAGE_SIZE; i++) { /* - * If memcg_kmem_charge_page() fails, page->mem_cgroup - * pointer is NULL, and memcg_kmem_uncharge_page() in - * free_thread_stack() will ignore this page. + * If memcg_kmem_charge_page() fails, page's + * memory cgroup pointer is NULL, and + * memcg_kmem_uncharge_page() in free_thread_stack() + * will ignore this page. 
*/ ret = memcg_kmem_charge_page(vm->pages[i], GFP_KERNEL, 0); @@ -906,6 +907,7 @@ static struct task_struct *dup_task_struct(struct task_struct *orig, int node) clear_user_return_notifier(tsk); clear_tsk_need_resched(tsk); set_task_stack_end_magic(tsk); + clear_syscall_work_syscall_user_dispatch(tsk); #ifdef CONFIG_STACKPROTECTOR tsk->stack_canary = get_random_canary(); @@ -930,6 +932,7 @@ static struct task_struct *dup_task_struct(struct task_struct *orig, int node) account_kernel_stack(tsk, 1); kcov_task_init(tsk); + kmap_local_fork(tsk); #ifdef CONFIG_FAULT_INJECTION tsk->fail_nth = 0; @@ -1007,6 +1010,7 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p, mm->vmacache_seqnum = 0; atomic_set(&mm->mm_users, 1); atomic_set(&mm->mm_count, 1); + seqcount_init(&mm->write_protect_seq); mmap_init_lock(mm); INIT_LIST_HEAD(&mm->mmlist); mm->core_state = NULL; @@ -1625,7 +1629,7 @@ static void copy_seccomp(struct task_struct *p) * to manually enable the seccomp thread flag here. */ if (p->seccomp.mode != SECCOMP_MODE_DISABLED) - set_tsk_thread_flag(p, TIF_SECCOMP); + set_task_syscall_work(p, SECCOMP); #endif } @@ -2158,23 +2162,18 @@ static __latent_entropy struct task_struct *copy_process( * child regardless of CLONE_PTRACE. */ user_disable_single_step(p); - clear_tsk_thread_flag(p, TIF_SYSCALL_TRACE); -#ifdef TIF_SYSCALL_EMU - clear_tsk_thread_flag(p, TIF_SYSCALL_EMU); + clear_task_syscall_work(p, SYSCALL_TRACE); +#if defined(CONFIG_GENERIC_ENTRY) || defined(TIF_SYSCALL_EMU) + clear_task_syscall_work(p, SYSCALL_EMU); #endif clear_tsk_latency_tracing(p); /* ok, now we should be set up.. */ p->pid = pid_nr(pid); if (clone_flags & CLONE_THREAD) { - p->exit_signal = -1; p->group_leader = current->group_leader; p->tgid = current->tgid; } else { - if (clone_flags & CLONE_PARENT) - p->exit_signal = current->group_leader->exit_signal; - else - p->exit_signal = args->exit_signal; p->group_leader = p; p->tgid = p->pid; } @@ -2187,6 +2186,10 @@ static __latent_entropy struct task_struct *copy_process( INIT_LIST_HEAD(&p->thread_group); p->task_works = NULL; +#ifdef CONFIG_KRETPROBES + p->kretprobe_instances.first = NULL; +#endif + /* * Ensure that the cgroup subsystem policies allow the new process to be * forked. 
It should be noted that the new process's css_set can be changed @@ -2218,9 +2221,14 @@ static __latent_entropy struct task_struct *copy_process( if (clone_flags & (CLONE_PARENT|CLONE_THREAD)) { p->real_parent = current->real_parent; p->parent_exec_id = current->parent_exec_id; + if (clone_flags & CLONE_THREAD) + p->exit_signal = -1; + else + p->exit_signal = current->group_leader->exit_signal; } else { p->real_parent = current; p->parent_exec_id = current->self_exec_id; + p->exit_signal = args->exit_signal; } klp_copy_process(p); diff --git a/kernel/futex.c b/kernel/futex.c index be68ac0d49ad..c47d1015d759 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -310,8 +310,6 @@ static inline bool should_fail_futex(bool fshared) #ifdef CONFIG_COMPAT static void compat_exit_robust_list(struct task_struct *curr); -#else -static inline void compat_exit_robust_list(struct task_struct *curr) { } #endif /* @@ -788,8 +786,9 @@ static void put_pi_state(struct futex_pi_state *pi_state) */ if (pi_state->owner) { struct task_struct *owner; + unsigned long flags; - raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock); + raw_spin_lock_irqsave(&pi_state->pi_mutex.wait_lock, flags); owner = pi_state->owner; if (owner) { raw_spin_lock(&owner->pi_lock); @@ -797,7 +796,7 @@ static void put_pi_state(struct futex_pi_state *pi_state) raw_spin_unlock(&owner->pi_lock); } rt_mutex_proxy_unlock(&pi_state->pi_mutex, owner); - raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock); + raw_spin_unlock_irqrestore(&pi_state->pi_mutex.wait_lock, flags); } if (current->pi_state_cache) { @@ -1503,8 +1502,10 @@ static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_pi_state *pi_ */ newval = FUTEX_WAITERS | task_pid_vnr(new_owner); - if (unlikely(should_fail_futex(true))) + if (unlikely(should_fail_futex(true))) { ret = -EFAULT; + goto out_unlock; + } ret = cmpxchg_futex_value_locked(&curval, uaddr, uval, newval); if (!ret && (curval != uval)) { @@ -2378,10 +2379,22 @@ retry: } /* - * Since we just failed the trylock; there must be an owner. + * The trylock just failed, so either there is an owner or + * there is a higher priority waiter than this one. */ newowner = rt_mutex_owner(&pi_state->pi_mutex); - BUG_ON(!newowner); + /* + * If the higher priority waiter has not yet taken over the + * rtmutex then newowner is NULL. We can't return here with + * that state because it's inconsistent vs. the user space + * state. So drop the locks and try again. It's a valid + * situation and not any different from the other retry + * conditions. + */ + if (unlikely(!newowner)) { + err = -EAGAIN; + goto handle_err; + } } else { WARN_ON_ONCE(argowner != current); if (oldowner == current) { diff --git a/kernel/hung_task.c b/kernel/hung_task.c index ce76f490126c..396ebaebea3f 100644 --- a/kernel/hung_task.c +++ b/kernel/hung_task.c @@ -225,8 +225,7 @@ static long hung_timeout_jiffies(unsigned long last_checked, * Process updating of timeout sysctl */ int proc_dohung_task_timeout_secs(struct ctl_table *table, int write, - void __user *buffer, - size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { int ret; diff --git a/kernel/irq/Kconfig b/kernel/irq/Kconfig index 10a5aff4eecc..d79ef2493a28 100644 --- a/kernel/irq/Kconfig +++ b/kernel/irq/Kconfig @@ -26,11 +26,6 @@ config GENERIC_IRQ_SHOW_LEVEL config GENERIC_IRQ_EFFECTIVE_AFF_MASK bool -# Facility to allocate a hardware interrupt. This is legacy support -# and should not be used in new code. Use irq domains instead. 
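As an aside for readers following the removal just above: the irq-domain style of allocation that the deleted help text recommends looks roughly like the sketch below. It is illustrative only, not code from this series; the demo_* names, the 16-entry domain size, and the mapping choices are assumptions, while irq_domain_add_linear(), irq_create_mapping(), dummy_irq_chip and irq_domain_xlate_onecell() are the long-standing upstream interfaces.

	static int demo_irq_map(struct irq_domain *d, unsigned int virq,
				irq_hw_number_t hw)
	{
		/* Give each mapped hwirq a chip and a flow handler. */
		irq_set_chip_and_handler(virq, &dummy_irq_chip, handle_simple_irq);
		return 0;
	}

	static const struct irq_domain_ops demo_domain_ops = {
		.map	= demo_irq_map,
		.xlate	= irq_domain_xlate_onecell,
	};

	/* Instead of irq_alloc_hwirqs()/irq_free_hwirqs(): */
	domain = irq_domain_add_linear(np, 16, &demo_domain_ops, NULL);
	virq = irq_create_mapping(domain, 3);	/* map hwirq 3 to a Linux irq */
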
-config GENERIC_IRQ_LEGACY_ALLOC_HWIRQ - bool - # Support for delayed migration from interrupt context config GENERIC_PENDING_IRQ bool @@ -82,6 +77,7 @@ config IRQ_FASTEOI_HIERARCHY_HANDLERS # Generic IRQ IPI support config GENERIC_IRQ_IPI bool + select IRQ_DOMAIN_HIERARCHY # Generic MSI interrupt support config GENERIC_MSI_IRQ diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c index b9b9618e1aca..6d89e33fe3aa 100644 --- a/kernel/irq/chip.c +++ b/kernel/irq/chip.c @@ -61,7 +61,7 @@ int irq_set_chip(unsigned int irq, struct irq_chip *chip) EXPORT_SYMBOL(irq_set_chip); /** - * irq_set_type - set the irq trigger type for an irq + * irq_set_irq_type - set the irq trigger type for an irq * @irq: irq number * @type: IRQ_TYPE_{LEVEL,EDGE}_* value - see include/linux/irq.h */ @@ -945,33 +945,6 @@ void handle_percpu_devid_irq(struct irq_desc *desc) } /** - * handle_percpu_devid_fasteoi_ipi - Per CPU local IPI handler with per cpu - * dev ids - * @desc: the interrupt description structure for this irq - * - * The biggest difference with the IRQ version is that the interrupt is - * EOIed early, as the IPI could result in a context switch, and we need to - * make sure the IPI can fire again. We also assume that the arch code has - * registered an action. If not, we are positively doomed. - */ -void handle_percpu_devid_fasteoi_ipi(struct irq_desc *desc) -{ - struct irq_chip *chip = irq_desc_get_chip(desc); - struct irqaction *action = desc->action; - unsigned int irq = irq_desc_get_irq(desc); - irqreturn_t res; - - __kstat_incr_irqs_this_cpu(desc); - - if (chip->irq_eoi) - chip->irq_eoi(&desc->irq_data); - - trace_irq_handler_entry(irq, action); - res = action->handler(irq, raw_cpu_ptr(action->percpu_dev_id)); - trace_irq_handler_exit(irq, action, res); -} - -/** * handle_percpu_devid_fasteoi_nmi - Per CPU local NMI handler with per cpu * dev ids * @desc: the interrupt description structure for this irq diff --git a/kernel/irq/generic-chip.c b/kernel/irq/generic-chip.c index e2999a070a99..a23ac2bbf433 100644 --- a/kernel/irq/generic-chip.c +++ b/kernel/irq/generic-chip.c @@ -269,7 +269,7 @@ irq_gc_init_mask_cache(struct irq_chip_generic *gc, enum irq_gc_flags flags) } /** - * __irq_alloc_domain_generic_chip - Allocate generic chips for an irq domain + * __irq_alloc_domain_generic_chips - Allocate generic chips for an irq domain * @d: irq domain for which to allocate chips * @irqs_per_chip: Number of interrupts each chip handles (max 32) * @num_ct: Number of irq_chip_type instances associated with this diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c index 1a7723604399..e810eb9906ea 100644 --- a/kernel/irq/irqdesc.c +++ b/kernel/irq/irqdesc.c @@ -810,57 +810,6 @@ unlock: } EXPORT_SYMBOL_GPL(__irq_alloc_descs); -#ifdef CONFIG_GENERIC_IRQ_LEGACY_ALLOC_HWIRQ -/** - * irq_alloc_hwirqs - Allocate an irq descriptor and initialize the hardware - * @cnt: number of interrupts to allocate - * @node: node on which to allocate - * - * Returns an interrupt number > 0 or 0, if the allocation fails. 
- */ -unsigned int irq_alloc_hwirqs(int cnt, int node) -{ - int i, irq = __irq_alloc_descs(-1, 0, cnt, node, NULL, NULL); - - if (irq < 0) - return 0; - - for (i = irq; cnt > 0; i++, cnt--) { - if (arch_setup_hwirq(i, node)) - goto err; - irq_clear_status_flags(i, _IRQ_NOREQUEST); - } - return irq; - -err: - for (i--; i >= irq; i--) { - irq_set_status_flags(i, _IRQ_NOREQUEST | _IRQ_NOPROBE); - arch_teardown_hwirq(i); - } - irq_free_descs(irq, cnt); - return 0; -} -EXPORT_SYMBOL_GPL(irq_alloc_hwirqs); - -/** - * irq_free_hwirqs - Free irq descriptor and cleanup the hardware - * @from: Free from irq number - * @cnt: number of interrupts to free - * - */ -void irq_free_hwirqs(unsigned int from, int cnt) -{ - int i, j; - - for (i = from, j = cnt; j > 0; i++, j--) { - irq_set_status_flags(i, _IRQ_NOREQUEST | _IRQ_NOPROBE); - arch_teardown_hwirq(i); - } - irq_free_descs(from, cnt); -} -EXPORT_SYMBOL_GPL(irq_free_hwirqs); -#endif - /** * irq_get_next_irq - get next allocated irq number * @offset: where to start the search diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c index cf8b374b892d..6aacd342cd14 100644 --- a/kernel/irq/irqdomain.c +++ b/kernel/irq/irqdomain.c @@ -42,7 +42,16 @@ static inline void debugfs_add_domain_dir(struct irq_domain *d) { } static inline void debugfs_remove_domain_dir(struct irq_domain *d) { } #endif -const struct fwnode_operations irqchip_fwnode_ops; +static const char *irqchip_fwnode_get_name(const struct fwnode_handle *fwnode) +{ + struct irqchip_fwid *fwid = container_of(fwnode, struct irqchip_fwid, fwnode); + + return fwid->name; +} + +const struct fwnode_operations irqchip_fwnode_ops = { + .get_name = irqchip_fwnode_get_name, +}; EXPORT_SYMBOL_GPL(irqchip_fwnode_ops); /** @@ -91,7 +100,7 @@ struct fwnode_handle *__irq_domain_alloc_fwnode(unsigned int type, int id, fwid->type = type; fwid->name = n; fwid->pa = pa; - fwid->fwnode.ops = &irqchip_fwnode_ops; + fwnode_init(&fwid->fwnode, &irqchip_fwnode_ops); return &fwid->fwnode; } EXPORT_SYMBOL_GPL(__irq_domain_alloc_fwnode); @@ -351,16 +360,27 @@ struct irq_domain *irq_domain_add_legacy(struct device_node *of_node, const struct irq_domain_ops *ops, void *host_data) { + return irq_domain_create_legacy(of_node_to_fwnode(of_node), size, + first_irq, first_hwirq, ops, host_data); +} +EXPORT_SYMBOL_GPL(irq_domain_add_legacy); + +struct irq_domain *irq_domain_create_legacy(struct fwnode_handle *fwnode, + unsigned int size, + unsigned int first_irq, + irq_hw_number_t first_hwirq, + const struct irq_domain_ops *ops, + void *host_data) +{ struct irq_domain *domain; - domain = __irq_domain_add(of_node_to_fwnode(of_node), first_hwirq + size, - first_hwirq + size, 0, ops, host_data); + domain = __irq_domain_add(fwnode, first_hwirq + size, first_hwirq + size, 0, ops, host_data); if (domain) irq_domain_associate_many(domain, first_irq, first_hwirq, size); return domain; } -EXPORT_SYMBOL_GPL(irq_domain_add_legacy); +EXPORT_SYMBOL_GPL(irq_domain_create_legacy); /** * irq_find_matching_fwspec() - Locates a domain for a given fwspec @@ -485,7 +505,7 @@ static void irq_domain_set_mapping(struct irq_domain *domain, } } -void irq_domain_disassociate(struct irq_domain *domain, unsigned int irq) +static void irq_domain_disassociate(struct irq_domain *domain, unsigned int irq) { struct irq_data *irq_data = irq_get_irq_data(irq); irq_hw_number_t hwirq; @@ -624,17 +644,19 @@ unsigned int irq_create_direct_mapping(struct irq_domain *domain) EXPORT_SYMBOL_GPL(irq_create_direct_mapping); /** - * irq_create_mapping() - Map a 
hardware interrupt into linux irq space + * irq_create_mapping_affinity() - Map a hardware interrupt into linux irq space * @domain: domain owning this hardware interrupt or NULL for default domain * @hwirq: hardware irq number in that domain space + * @affinity: irq affinity * * Only one mapping per hardware interrupt is permitted. Returns a linux * irq number. * If the sense/trigger is to be specified, set_irq_type() should be called * on the number returned from that call. */ -unsigned int irq_create_mapping(struct irq_domain *domain, - irq_hw_number_t hwirq) +unsigned int irq_create_mapping_affinity(struct irq_domain *domain, + irq_hw_number_t hwirq, + const struct irq_affinity_desc *affinity) { struct device_node *of_node; int virq; @@ -660,7 +682,8 @@ unsigned int irq_create_mapping(struct irq_domain *domain, } /* Allocate a virtual interrupt number */ - virq = irq_domain_alloc_descs(-1, 1, hwirq, of_node_to_nid(of_node), NULL); + virq = irq_domain_alloc_descs(-1, 1, hwirq, of_node_to_nid(of_node), + affinity); if (virq <= 0) { pr_debug("-> virq allocation failed\n"); return 0; @@ -676,7 +699,7 @@ unsigned int irq_create_mapping(struct irq_domain *domain, return virq; } -EXPORT_SYMBOL_GPL(irq_create_mapping); +EXPORT_SYMBOL_GPL(irq_create_mapping_affinity); /** * irq_create_strict_mappings() - Map a range of hw irqs to fixed linux irqs @@ -737,7 +760,7 @@ static void of_phandle_args_to_fwspec(struct device_node *np, const u32 *args, { int i; - fwspec->fwnode = np ? &np->fwnode : NULL; + fwspec->fwnode = of_node_to_fwnode(np); fwspec->param_count = count; for (i = 0; i < count; i++) @@ -1370,8 +1393,15 @@ static void irq_domain_free_irqs_hierarchy(struct irq_domain *domain, unsigned int irq_base, unsigned int nr_irqs) { - if (domain->ops->free) - domain->ops->free(domain, irq_base, nr_irqs); + unsigned int i; + + if (!domain->ops->free) + return; + + for (i = 0; i < nr_irqs; i++) { + if (irq_domain_get_irq_data(domain, irq_base + i)) + domain->ops->free(domain, irq_base + i, 1); + } } int irq_domain_alloc_irqs_hierarchy(struct irq_domain *domain, diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index c460e0496006..c826ba4141fe 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -371,6 +371,76 @@ int irq_set_affinity_locked(struct irq_data *data, const struct cpumask *mask, return ret; } +/** + * irq_update_affinity_desc - Update affinity management for an interrupt + * @irq: The interrupt number to update + * @affinity: Pointer to the affinity descriptor + * + * This interface can be used to configure the affinity management of + * interrupts which have been allocated already. + * + * There are certain limitations on when it may be used - attempts to use it + * for when the kernel is configured for generic IRQ reservation mode (in + * config GENERIC_IRQ_RESERVATION_MODE) will fail, as it may conflict with + * managed/non-managed interrupt accounting. In addition, attempts to use it on + * an interrupt which is already started or which has already been configured + * as managed will also fail, as these mean invalid init state or double init. + */ +int irq_update_affinity_desc(unsigned int irq, + struct irq_affinity_desc *affinity) +{ + struct irq_desc *desc; + unsigned long flags; + bool activated; + int ret = 0; + + /* + * Supporting this with the reservation scheme used by x86 needs + * some more thought. Fail it for now. 
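A usage sketch for the new interface, before the diff continues into its body: a driver that has allocated an interrupt but not yet started it could pin the interrupt and mark it managed as below. This is an assumption-laden illustration, not code from this series; only irq_update_affinity_desc() and struct irq_affinity_desc come from the patch itself.

	static int demo_pin_irq(unsigned int irq, const struct cpumask *cpus)
	{
		struct irq_affinity_desc desc = { .is_managed = 1 };

		cpumask_copy(&desc.mask, cpus);
		/* Only valid while the interrupt is shut down and unmanaged. */
		return irq_update_affinity_desc(irq, &desc);
	}
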
+ */ + if (IS_ENABLED(CONFIG_GENERIC_IRQ_RESERVATION_MODE)) + return -EOPNOTSUPP; + + desc = irq_get_desc_buslock(irq, &flags, 0); + if (!desc) + return -EINVAL; + + /* Requires the interrupt to be shut down */ + if (irqd_is_started(&desc->irq_data)) { + ret = -EBUSY; + goto out_unlock; + } + + /* Interrupts which are already managed cannot be modified */ + if (irqd_affinity_is_managed(&desc->irq_data)) { + ret = -EBUSY; + goto out_unlock; + } + + /* + * Deactivate the interrupt. That's required to undo + * anything an earlier activation has established. + */ + activated = irqd_is_activated(&desc->irq_data); + if (activated) + irq_domain_deactivate_irq(&desc->irq_data); + + if (affinity->is_managed) { + irqd_set(&desc->irq_data, IRQD_AFFINITY_MANAGED); + irqd_set(&desc->irq_data, IRQD_MANAGED_SHUTDOWN); + } + + cpumask_copy(desc->irq_common_data.affinity, &affinity->mask); + + /* Restore the activation state */ + if (activated) + irq_domain_activate_irq(&desc->irq_data, false); + +out_unlock: + irq_put_desc_busunlock(desc, flags); + return ret; +} + int __irq_set_affinity(unsigned int irq, const struct cpumask *mask, bool force) { struct irq_desc *desc = irq_to_desc(irq); diff --git a/kernel/irq_work.c b/kernel/irq_work.c index eca83965b631..e8da1e71583a 100644 --- a/kernel/irq_work.c +++ b/kernel/irq_work.c @@ -31,10 +31,10 @@ static bool irq_work_claim(struct irq_work *work) { int oflags; - oflags = atomic_fetch_or(IRQ_WORK_CLAIMED | CSD_TYPE_IRQ_WORK, &work->flags); + oflags = atomic_fetch_or(IRQ_WORK_CLAIMED | CSD_TYPE_IRQ_WORK, &work->node.a_flags); /* * If the work is already pending, no need to raise the IPI. - * The pairing atomic_fetch_andnot() in irq_work_run() makes sure + * The pairing smp_mb() in irq_work_single() makes sure * everything we did before is visible. */ if (oflags & IRQ_WORK_PENDING) @@ -53,12 +53,12 @@ void __weak arch_irq_work_raise(void) static void __irq_work_queue_local(struct irq_work *work) { /* If the work is "lazy", handle it from next tick if any */ - if (atomic_read(&work->flags) & IRQ_WORK_LAZY) { - if (llist_add(&work->llnode, this_cpu_ptr(&lazy_list)) && + if (atomic_read(&work->node.a_flags) & IRQ_WORK_LAZY) { + if (llist_add(&work->node.llist, this_cpu_ptr(&lazy_list)) && tick_nohz_tick_stopped()) arch_irq_work_raise(); } else { - if (llist_add(&work->llnode, this_cpu_ptr(&raised_list))) + if (llist_add(&work->node.llist, this_cpu_ptr(&raised_list))) arch_irq_work_raise(); } } @@ -102,7 +102,7 @@ bool irq_work_queue_on(struct irq_work *work, int cpu) if (cpu != smp_processor_id()) { /* Arch remote IPI send/receive backend aren't NMI safe */ WARN_ON_ONCE(in_nmi()); - __smp_call_single_queue(cpu, &work->llnode); + __smp_call_single_queue(cpu, &work->node.llist); } else { __irq_work_queue_local(work); } @@ -136,23 +136,28 @@ void irq_work_single(void *arg) int flags; /* - * Clear the PENDING bit, after this point the @work - * can be re-used. - * Make it immediately visible so that other CPUs trying - * to claim that work don't rely on us to handle their data - * while we are in the middle of the func. + * Clear the PENDING bit, after this point the @work can be re-used. + * The PENDING bit acts as a lock, and we own it, so we can clear it + * without atomic ops. */ - flags = atomic_fetch_andnot(IRQ_WORK_PENDING, &work->flags); + flags = atomic_read(&work->node.a_flags); + flags &= ~IRQ_WORK_PENDING; + atomic_set(&work->node.a_flags, flags); + + /* + * See irq_work_claim(). 
+ */ + smp_mb(); - lockdep_irq_work_enter(work); + lockdep_irq_work_enter(flags); work->func(work); - lockdep_irq_work_exit(work); + lockdep_irq_work_exit(flags); + /* - * Clear the BUSY bit and return to the free state if - * no-one else claimed it meanwhile. + * Clear the BUSY bit, if set, and return to the free state if no-one + * else claimed it meanwhile. */ - flags &= ~IRQ_WORK_PENDING; - (void)atomic_cmpxchg(&work->flags, flags, flags & ~IRQ_WORK_BUSY); + (void)atomic_cmpxchg(&work->node.a_flags, flags, flags & ~IRQ_WORK_BUSY); } static void irq_work_run_list(struct llist_head *list) @@ -166,7 +171,7 @@ static void irq_work_run_list(struct llist_head *list) return; llnode = llist_del_all(list); - llist_for_each_entry_safe(work, tmp, llnode, llnode) + llist_for_each_entry_safe(work, tmp, llnode, node.llist) irq_work_single(work); } @@ -198,7 +203,7 @@ void irq_work_sync(struct irq_work *work) { lockdep_assert_irqs_enabled(); - while (atomic_read(&work->flags) & IRQ_WORK_BUSY) + while (irq_work_is_busy(work)) cpu_relax(); } EXPORT_SYMBOL_GPL(irq_work_sync); diff --git a/kernel/kcov.c b/kernel/kcov.c index 6b8368be89c8..80bfe71bbe13 100644 --- a/kernel/kcov.c +++ b/kernel/kcov.c @@ -1023,6 +1023,8 @@ EXPORT_SYMBOL(kcov_remote_stop); /* See the comment before kcov_remote_start() for usage details. */ u64 kcov_common_handle(void) { + if (!in_task()) + return 0; return current->kcov_handle; } EXPORT_SYMBOL(kcov_common_handle); diff --git a/kernel/kcsan/encoding.h b/kernel/kcsan/encoding.h index 1a6db2f797ac..7ee405524904 100644 --- a/kernel/kcsan/encoding.h +++ b/kernel/kcsan/encoding.h @@ -37,18 +37,20 @@ */ #define WATCHPOINT_ADDR_BITS (BITS_PER_LONG-1 - WATCHPOINT_SIZE_BITS) -/* - * Masks to set/retrieve the encoded data. - */ -#define WATCHPOINT_WRITE_MASK BIT(BITS_PER_LONG-1) -#define WATCHPOINT_SIZE_MASK \ - GENMASK(BITS_PER_LONG-2, BITS_PER_LONG-2 - WATCHPOINT_SIZE_BITS) -#define WATCHPOINT_ADDR_MASK \ - GENMASK(BITS_PER_LONG-3 - WATCHPOINT_SIZE_BITS, 0) +/* Bitmasks for the encoded watchpoint access information. */ +#define WATCHPOINT_WRITE_MASK BIT(BITS_PER_LONG-1) +#define WATCHPOINT_SIZE_MASK GENMASK(BITS_PER_LONG-2, WATCHPOINT_ADDR_BITS) +#define WATCHPOINT_ADDR_MASK GENMASK(WATCHPOINT_ADDR_BITS-1, 0) +static_assert(WATCHPOINT_ADDR_MASK == (1UL << WATCHPOINT_ADDR_BITS) - 1); +static_assert((WATCHPOINT_WRITE_MASK ^ WATCHPOINT_SIZE_MASK ^ WATCHPOINT_ADDR_MASK) == ~0UL); static inline bool check_encodable(unsigned long addr, size_t size) { - return size <= MAX_ENCODABLE_SIZE; + /* + * While we can encode addrs<PAGE_SIZE, avoid crashing with a NULL + * pointer deref inside KCSAN. 
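The tightened mask definitions in the kcsan/encoding.h hunk above make the watchpoint layout easy to sanity-check outside the kernel. The stand-alone sketch below mirrors them under two stated assumptions (a 64-bit build, and the 3 size bits used upstream); it demonstrates the same field independence that the new static_asserts guard.

	#include <assert.h>

	#define BITS_PER_LONG		64	/* assumption: 64-bit build */
	#define WATCHPOINT_SIZE_BITS	3	/* assumption: as upstream */
	#define WATCHPOINT_ADDR_BITS	(BITS_PER_LONG - 1 - WATCHPOINT_SIZE_BITS)
	#define WRITE_MASK		(1UL << (BITS_PER_LONG - 1))
	#define ADDR_MASK		((1UL << WATCHPOINT_ADDR_BITS) - 1)

	static unsigned long encode(unsigned long addr, unsigned long size, int write)
	{
		return (write ? WRITE_MASK : 0) |
		       (size << WATCHPOINT_ADDR_BITS) | (addr & ADDR_MASK);
	}

	int main(void)
	{
		unsigned long wp = encode(0xffff888012345678UL, 4, 1);

		/* Each field decodes independently of the other two. */
		assert(wp & WRITE_MASK);
		assert(((wp >> WATCHPOINT_ADDR_BITS) &
			((1UL << WATCHPOINT_SIZE_BITS) - 1)) == 4);
		assert((wp & ADDR_MASK) == (0xffff888012345678UL & ADDR_MASK));
		return 0;
	}
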
+ */ + return addr >= PAGE_SIZE && size <= MAX_ENCODABLE_SIZE; } static inline long diff --git a/kernel/kcsan/selftest.c b/kernel/kcsan/selftest.c index d98bc208d06d..9014a3a82cf9 100644 --- a/kernel/kcsan/selftest.c +++ b/kernel/kcsan/selftest.c @@ -33,6 +33,9 @@ static bool test_encode_decode(void) unsigned long addr; prandom_bytes(&addr, sizeof(addr)); + if (addr < PAGE_SIZE) + addr = PAGE_SIZE; + if (WARN_ON(!check_encodable(addr, size))) return false; diff --git a/kernel/kexec_core.c b/kernel/kexec_core.c index 8798a8183974..4f8efc278aa7 100644 --- a/kernel/kexec_core.c +++ b/kernel/kexec_core.c @@ -42,7 +42,6 @@ #include <asm/sections.h> #include <crypto/hash.h> -#include <crypto/sha.h> #include "kexec_internal.h" DEFINE_MUTEX(kexec_mutex); diff --git a/kernel/kexec_file.c b/kernel/kexec_file.c index e21f6b9234f7..b02086d70492 100644 --- a/kernel/kexec_file.c +++ b/kernel/kexec_file.c @@ -20,7 +20,7 @@ #include <linux/fs.h> #include <linux/ima.h> #include <crypto/hash.h> -#include <crypto/sha.h> +#include <crypto/sha2.h> #include <linux/elf.h> #include <linux/elfcore.h> #include <linux/kernel.h> diff --git a/kernel/kprobes.c b/kernel/kprobes.c index 8a12a25fa40d..f7fb5d135930 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c @@ -36,7 +36,6 @@ #include <linux/cpu.h> #include <linux/jump_label.h> #include <linux/perf_event.h> -#include <linux/static_call.h> #include <asm/sections.h> #include <asm/cacheflush.h> @@ -54,7 +53,6 @@ static int kprobes_initialized; * - RCU hlist traversal under disabling preempt (breakpoint handlers) */ static struct hlist_head kprobe_table[KPROBE_TABLE_SIZE]; -static struct hlist_head kretprobe_inst_table[KPROBE_TABLE_SIZE]; /* NOTE: change this value only with kprobe_mutex held */ static bool kprobes_all_disarmed; @@ -62,9 +60,6 @@ static bool kprobes_all_disarmed; /* This protects kprobe_table and optimizing_list */ static DEFINE_MUTEX(kprobe_mutex); static DEFINE_PER_CPU(struct kprobe *, kprobe_instance) = NULL; -static struct { - raw_spinlock_t lock ____cacheline_aligned_in_smp; -} kretprobe_table_locks[KPROBE_TABLE_SIZE]; kprobe_opcode_t * __weak kprobe_lookup_name(const char *name, unsigned int __unused) @@ -72,11 +67,6 @@ kprobe_opcode_t * __weak kprobe_lookup_name(const char *name, return ((kprobe_opcode_t *)(kallsyms_lookup_name(name))); } -static raw_spinlock_t *kretprobe_table_lock_ptr(unsigned long hash) -{ - return &(kretprobe_table_locks[hash].lock); -} - /* Blacklist -- list of struct kprobe_blacklist_entry */ static LIST_HEAD(kprobe_blacklist); @@ -1224,64 +1214,26 @@ void kprobes_inc_nmissed_count(struct kprobe *p) } NOKPROBE_SYMBOL(kprobes_inc_nmissed_count); -static void recycle_rp_inst(struct kretprobe_instance *ri) +static void free_rp_inst_rcu(struct rcu_head *head) { - struct kretprobe *rp = ri->rp; + struct kretprobe_instance *ri = container_of(head, struct kretprobe_instance, rcu); - /* remove rp inst off the rprobe_inst_table */ - hlist_del(&ri->hlist); - INIT_HLIST_NODE(&ri->hlist); - if (likely(rp)) { - raw_spin_lock(&rp->lock); - hlist_add_head(&ri->hlist, &rp->free_instances); - raw_spin_unlock(&rp->lock); - } else - kfree_rcu(ri, rcu); + if (refcount_dec_and_test(&ri->rph->ref)) + kfree(ri->rph); + kfree(ri); } -NOKPROBE_SYMBOL(recycle_rp_inst); +NOKPROBE_SYMBOL(free_rp_inst_rcu); -static void kretprobe_hash_lock(struct task_struct *tsk, - struct hlist_head **head, unsigned long *flags) -__acquires(hlist_lock) -{ - unsigned long hash = hash_ptr(tsk, KPROBE_HASH_BITS); - raw_spinlock_t *hlist_lock; - - *head = 
&kretprobe_inst_table[hash]; - hlist_lock = kretprobe_table_lock_ptr(hash); - raw_spin_lock_irqsave(hlist_lock, *flags); -} -NOKPROBE_SYMBOL(kretprobe_hash_lock); - -static void kretprobe_table_lock(unsigned long hash, - unsigned long *flags) -__acquires(hlist_lock) -{ - raw_spinlock_t *hlist_lock = kretprobe_table_lock_ptr(hash); - raw_spin_lock_irqsave(hlist_lock, *flags); -} -NOKPROBE_SYMBOL(kretprobe_table_lock); - -static void kretprobe_hash_unlock(struct task_struct *tsk, - unsigned long *flags) -__releases(hlist_lock) +static void recycle_rp_inst(struct kretprobe_instance *ri) { - unsigned long hash = hash_ptr(tsk, KPROBE_HASH_BITS); - raw_spinlock_t *hlist_lock; + struct kretprobe *rp = get_kretprobe(ri); - hlist_lock = kretprobe_table_lock_ptr(hash); - raw_spin_unlock_irqrestore(hlist_lock, *flags); -} -NOKPROBE_SYMBOL(kretprobe_hash_unlock); - -static void kretprobe_table_unlock(unsigned long hash, - unsigned long *flags) -__releases(hlist_lock) -{ - raw_spinlock_t *hlist_lock = kretprobe_table_lock_ptr(hash); - raw_spin_unlock_irqrestore(hlist_lock, *flags); + if (likely(rp)) { + freelist_add(&ri->freelist, &rp->freelist); + } else + call_rcu(&ri->rcu, free_rp_inst_rcu); } -NOKPROBE_SYMBOL(kretprobe_table_unlock); +NOKPROBE_SYMBOL(recycle_rp_inst); static struct kprobe kprobe_busy = { .addr = (void *) get_kprobe, @@ -1312,24 +1264,21 @@ void kprobe_busy_end(void) void kprobe_flush_task(struct task_struct *tk) { struct kretprobe_instance *ri; - struct hlist_head *head; - struct hlist_node *tmp; - unsigned long hash, flags = 0; + struct llist_node *node; + /* Early boot, not yet initialized. */ if (unlikely(!kprobes_initialized)) - /* Early boot. kretprobe_table_locks not yet initialized. */ return; kprobe_busy_begin(); - hash = hash_ptr(tk, KPROBE_HASH_BITS); - head = &kretprobe_inst_table[hash]; - kretprobe_table_lock(hash, &flags); - hlist_for_each_entry_safe(ri, tmp, head, hlist) { - if (ri->task == tk) - recycle_rp_inst(ri); + node = __llist_del_all(&tk->kretprobe_instances); + while (node) { + ri = container_of(node, struct kretprobe_instance, llist); + node = node->next; + + recycle_rp_inst(ri); } - kretprobe_table_unlock(hash, &flags); kprobe_busy_end(); } @@ -1338,37 +1287,23 @@ NOKPROBE_SYMBOL(kprobe_flush_task); static inline void free_rp_inst(struct kretprobe *rp) { struct kretprobe_instance *ri; - struct hlist_node *next; + struct freelist_node *node; + int count = 0; + + node = rp->freelist.head; + while (node) { + ri = container_of(node, struct kretprobe_instance, freelist); + node = node->next; - hlist_for_each_entry_safe(ri, next, &rp->free_instances, hlist) { - hlist_del(&ri->hlist); kfree(ri); + count++; } -} -static void cleanup_rp_inst(struct kretprobe *rp) -{ - unsigned long flags, hash; - struct kretprobe_instance *ri; - struct hlist_node *next; - struct hlist_head *head; - - /* To avoid recursive kretprobe by NMI, set kprobe busy here */ - kprobe_busy_begin(); - for (hash = 0; hash < KPROBE_TABLE_SIZE; hash++) { - kretprobe_table_lock(hash, &flags); - head = &kretprobe_inst_table[hash]; - hlist_for_each_entry_safe(ri, next, head, hlist) { - if (ri->rp == rp) - ri->rp = NULL; - } - kretprobe_table_unlock(hash, &flags); + if (refcount_sub_and_test(count, &rp->rph->ref)) { + kfree(rp->rph); + rp->rph = NULL; } - kprobe_busy_end(); - - free_rp_inst(rp); } -NOKPROBE_SYMBOL(cleanup_rp_inst); /* Add the new probe to ap->list */ static int add_new_kprobe(struct kprobe *ap, struct kprobe *p) @@ -1631,7 +1566,6 @@ static int check_kprobe_address_safe(struct kprobe 
*p, if (!kernel_text_address((unsigned long) p->addr) || within_kprobe_blacklist((unsigned long) p->addr) || jump_label_text_reserved(p->addr, p->addr) || - static_call_text_reserved(p->addr, p->addr) || find_bug((unsigned long)p->addr)) { ret = -EINVAL; goto out; @@ -1930,88 +1864,56 @@ unsigned long __kretprobe_trampoline_handler(struct pt_regs *regs, void *trampoline_address, void *frame_pointer) { - struct kretprobe_instance *ri = NULL, *last = NULL; - struct hlist_head *head; - struct hlist_node *tmp; - unsigned long flags; kprobe_opcode_t *correct_ret_addr = NULL; - bool skipped = false; + struct kretprobe_instance *ri = NULL; + struct llist_node *first, *node; + struct kretprobe *rp; - kretprobe_hash_lock(current, &head, &flags); + /* Find all nodes for this frame. */ + first = node = current->kretprobe_instances.first; + while (node) { + ri = container_of(node, struct kretprobe_instance, llist); - /* - * It is possible to have multiple instances associated with a given - * task either because multiple functions in the call path have - * return probes installed on them, and/or more than one - * return probe was registered for a target function. - * - * We can handle this because: - * - instances are always pushed into the head of the list - * - when multiple return probes are registered for the same - * function, the (chronologically) first instance's ret_addr - * will be the real return address, and all the rest will - * point to kretprobe_trampoline. - */ - hlist_for_each_entry(ri, head, hlist) { - if (ri->task != current) - /* another task is sharing our hash bucket */ - continue; - /* - * Return probes must be pushed on this hash list correct - * order (same as return order) so that it can be popped - * correctly. However, if we find it is pushed it incorrect - * order, this means we find a function which should not be - * probed, because the wrong order entry is pushed on the - * path of processing other kretprobe itself. - */ - if (ri->fp != frame_pointer) { - if (!skipped) - pr_warn("kretprobe is stacked incorrectly. Trying to fixup.\n"); - skipped = true; - continue; - } - - correct_ret_addr = ri->ret_addr; - if (skipped) - pr_warn("%ps must be blacklisted because of incorrect kretprobe order\n", - ri->rp->kp.addr); + BUG_ON(ri->fp != frame_pointer); - if (correct_ret_addr != trampoline_address) + if (ri->ret_addr != trampoline_address) { + correct_ret_addr = ri->ret_addr; /* * This is the real return address. Any other * instances associated with this task are for * other calls deeper on the call stack */ - break; + goto found; + } + + node = node->next; } + pr_err("Oops! Kretprobe fails to find correct return address.\n"); + BUG_ON(1); - BUG_ON(!correct_ret_addr || (correct_ret_addr == trampoline_address)); - last = ri; +found: + /* Unlink all nodes for this frame. */ + current->kretprobe_instances.first = node->next; + node->next = NULL; - hlist_for_each_entry_safe(ri, tmp, head, hlist) { - if (ri->task != current) - /* another task is sharing our hash bucket */ - continue; - if (ri->fp != frame_pointer) - continue; + /* Run them.. 
*/ + while (first) { + ri = container_of(first, struct kretprobe_instance, llist); + first = first->next; - if (ri->rp && ri->rp->handler) { + rp = get_kretprobe(ri); + if (rp && rp->handler) { struct kprobe *prev = kprobe_running(); - __this_cpu_write(current_kprobe, &ri->rp->kp); + __this_cpu_write(current_kprobe, &rp->kp); ri->ret_addr = correct_ret_addr; - ri->rp->handler(ri, regs); + rp->handler(ri, regs); __this_cpu_write(current_kprobe, prev); } recycle_rp_inst(ri); - - if (ri == last) - break; } - kretprobe_hash_unlock(current, &flags); - return (unsigned long)correct_ret_addr; } NOKPROBE_SYMBOL(__kretprobe_trampoline_handler) @@ -2023,39 +1925,26 @@ NOKPROBE_SYMBOL(__kretprobe_trampoline_handler) static int pre_handler_kretprobe(struct kprobe *p, struct pt_regs *regs) { struct kretprobe *rp = container_of(p, struct kretprobe, kp); - unsigned long hash, flags = 0; struct kretprobe_instance *ri; + struct freelist_node *fn; - /* TODO: consider to only swap the RA after the last pre_handler fired */ - hash = hash_ptr(current, KPROBE_HASH_BITS); - raw_spin_lock_irqsave(&rp->lock, flags); - if (!hlist_empty(&rp->free_instances)) { - ri = hlist_entry(rp->free_instances.first, - struct kretprobe_instance, hlist); - hlist_del(&ri->hlist); - raw_spin_unlock_irqrestore(&rp->lock, flags); - - ri->rp = rp; - ri->task = current; - - if (rp->entry_handler && rp->entry_handler(ri, regs)) { - raw_spin_lock_irqsave(&rp->lock, flags); - hlist_add_head(&ri->hlist, &rp->free_instances); - raw_spin_unlock_irqrestore(&rp->lock, flags); - return 0; - } + fn = freelist_try_get(&rp->freelist); + if (!fn) { + rp->nmissed++; + return 0; + } - arch_prepare_kretprobe(ri, regs); + ri = container_of(fn, struct kretprobe_instance, freelist); - /* XXX(hch): why is there no hlist_move_head? 
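For readers who have not used the API whose internals are being reworked here, a typical kretprobe registration looks like the sketch below. The probe target, names, and handler body are illustrative assumptions (compare samples/kprobes/kretprobe_example.c upstream); the .maxactive field is what sizes the per-probe instance pool that this patch moves off the global hash table and onto rp->freelist.

	static int demo_ret_handler(struct kretprobe_instance *ri,
				    struct pt_regs *regs)
	{
		pr_info("probed function returned %lu\n", regs_return_value(regs));
		return 0;
	}

	static struct kretprobe demo_krp = {
		.kp.symbol_name	= "kernel_clone",	/* assumed target */
		.handler	= demo_ret_handler,
		.maxactive	= 20,	/* instances pre-allocated at register time */
	};

	/* register_kretprobe(&demo_krp); ... unregister_kretprobe(&demo_krp); */
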
*/ - INIT_HLIST_NODE(&ri->hlist); - kretprobe_table_lock(hash, &flags); - hlist_add_head(&ri->hlist, &kretprobe_inst_table[hash]); - kretprobe_table_unlock(hash, &flags); - } else { - rp->nmissed++; - raw_spin_unlock_irqrestore(&rp->lock, flags); + if (rp->entry_handler && rp->entry_handler(ri, regs)) { + freelist_add(&ri->freelist, &rp->freelist); + return 0; } + + arch_prepare_kretprobe(ri, regs); + + __llist_add(&ri->llist, ¤t->kretprobe_instances); + return 0; } NOKPROBE_SYMBOL(pre_handler_kretprobe); @@ -2112,18 +2001,24 @@ int register_kretprobe(struct kretprobe *rp) rp->maxactive = num_possible_cpus(); #endif } - raw_spin_lock_init(&rp->lock); - INIT_HLIST_HEAD(&rp->free_instances); + rp->freelist.head = NULL; + rp->rph = kzalloc(sizeof(struct kretprobe_holder), GFP_KERNEL); + if (!rp->rph) + return -ENOMEM; + + rp->rph->rp = rp; for (i = 0; i < rp->maxactive; i++) { - inst = kmalloc(sizeof(struct kretprobe_instance) + + inst = kzalloc(sizeof(struct kretprobe_instance) + rp->data_size, GFP_KERNEL); if (inst == NULL) { + refcount_set(&rp->rph->ref, i); free_rp_inst(rp); return -ENOMEM; } - INIT_HLIST_NODE(&inst->hlist); - hlist_add_head(&inst->hlist, &rp->free_instances); + inst->rph = rp->rph; + freelist_add(&inst->freelist, &rp->freelist); } + refcount_set(&rp->rph->ref, i); rp->nmissed = 0; /* Establish function entry probe point */ @@ -2165,16 +2060,18 @@ void unregister_kretprobes(struct kretprobe **rps, int num) if (num <= 0) return; mutex_lock(&kprobe_mutex); - for (i = 0; i < num; i++) + for (i = 0; i < num; i++) { if (__unregister_kprobe_top(&rps[i]->kp) < 0) rps[i]->kp.addr = NULL; + rps[i]->rph->rp = NULL; + } mutex_unlock(&kprobe_mutex); synchronize_rcu(); for (i = 0; i < num; i++) { if (rps[i]->kp.addr) { __unregister_kprobe_bottom(&rps[i]->kp); - cleanup_rp_inst(rps[i]); + free_rp_inst(rps[i]); } } } @@ -2218,9 +2115,6 @@ static void kill_kprobe(struct kprobe *p) lockdep_assert_held(&kprobe_mutex); - if (WARN_ON_ONCE(kprobe_gone(p))) - return; - p->flags |= KPROBE_FLAG_GONE; if (kprobe_aggrprobe(p)) { /* @@ -2501,10 +2395,7 @@ static int kprobes_module_callback(struct notifier_block *nb, mutex_lock(&kprobe_mutex); for (i = 0; i < KPROBE_TABLE_SIZE; i++) { head = &kprobe_table[i]; - hlist_for_each_entry(p, head, hlist) { - if (kprobe_gone(p)) - continue; - + hlist_for_each_entry(p, head, hlist) if (within_module_init((unsigned long)p->addr, mod) || (checkcore && within_module_core((unsigned long)p->addr, mod))) { @@ -2521,7 +2412,6 @@ static int kprobes_module_callback(struct notifier_block *nb, */ kill_kprobe(p); } - } } if (val == MODULE_STATE_GOING) remove_module_kprobe_blacklist(mod); @@ -2566,11 +2456,8 @@ static int __init init_kprobes(void) /* FIXME allocate the probe table, currently defined statically */ /* initialize all list heads */ - for (i = 0; i < KPROBE_TABLE_SIZE; i++) { + for (i = 0; i < KPROBE_TABLE_SIZE; i++) INIT_HLIST_HEAD(&kprobe_table[i]); - INIT_HLIST_HEAD(&kretprobe_inst_table[i]); - raw_spin_lock_init(&(kretprobe_table_locks[i].lock)); - } err = populate_kprobe_blacklist(__start_kprobe_blacklist, __stop_kprobe_blacklist); diff --git a/kernel/kthread.c b/kernel/kthread.c index e29773c82b70..a5eceecd4513 100644 --- a/kernel/kthread.c +++ b/kernel/kthread.c @@ -704,8 +704,15 @@ repeat: raw_spin_unlock_irq(&worker->lock); if (work) { + kthread_work_func_t func = work->func; __set_current_state(TASK_RUNNING); + trace_sched_kthread_work_execute_start(work); work->func(work); + /* + * Avoid dereferencing work after this point. 
The trace + * event only cares about the address. + */ + trace_sched_kthread_work_execute_end(work, func); } else if (!freezing(current)) schedule(); @@ -786,7 +793,25 @@ EXPORT_SYMBOL(kthread_create_worker); * A good practice is to add the cpu number also into the worker name. * For example, use kthread_create_worker_on_cpu(cpu, "helper/%d", cpu). * - * Returns a pointer to the allocated worker on success, ERR_PTR(-ENOMEM) + * CPU hotplug: + * The kthread worker API is simple and generic. It just provides a way + * to create, use, and destroy workers. + * + * It is up to the API user how to handle CPU hotplug. They have to decide + * how to handle pending work items, prevent queuing new ones, and + * restore the functionality when the CPU goes off and on. There are a + * few catches: + * + * - CPU affinity gets lost when it is scheduled on an offline CPU. + * + * - The worker might not exist when the CPU was off when the user + * created the workers. + * + * Good practice is to implement two CPU hotplug callbacks and to + * destroy/create the worker when the CPU goes down/up. + * + * Return: + * The pointer to the allocated worker on success, ERR_PTR(-ENOMEM) * when the needed structures could not get allocated, and ERR_PTR(-EINTR) * when the worker was SIGKILLed. */ @@ -834,6 +859,8 @@ static void kthread_insert_work(struct kthread_worker *worker, { kthread_insert_work_sanity_check(worker, work); + trace_sched_kthread_work_queue_work(worker, work); + list_add_tail(&work->node, pos); work->worker = worker; if (!worker->current_work && likely(worker->task)) @@ -897,7 +924,8 @@ void kthread_delayed_work_timer_fn(struct timer_list *t) /* Move the work from worker->delayed_work_list. */ WARN_ON_ONCE(list_empty(&work->node)); list_del_init(&work->node); - kthread_insert_work(worker, work, &worker->work_list); + if (!work->canceling) + kthread_insert_work(worker, work, &worker->work_list); raw_spin_unlock_irqrestore(&worker->lock, flags); } @@ -1248,6 +1276,7 @@ void kthread_use_mm(struct mm_struct *mm) tsk->active_mm = mm; } tsk->mm = mm; + membarrier_update_current_mm(mm); switch_mm_irqs_off(active_mm, mm, tsk); local_irq_enable(); task_unlock(tsk); @@ -1255,8 +1284,19 @@ void kthread_use_mm(struct mm_struct *mm) finish_arch_post_lock_switch(); #endif + /* + * When a kthread starts operating on an address space, the loop + * in membarrier_{private,global}_expedited() may not observe + * that tsk->mm, and not issue an IPI. Membarrier requires a + * memory barrier after storing to tsk->mm, before accessing + * user-space memory. A full memory barrier for membarrier + * {PRIVATE,GLOBAL}_EXPEDITED is implicitly provided by + * mmdrop(), or explicitly with smp_mb(). + */ if (active_mm != mm) mmdrop(active_mm); + else + smp_mb(); to_kthread(tsk)->oldfs = force_uaccess_begin(); } @@ -1276,9 +1316,18 @@ void kthread_unuse_mm(struct mm_struct *mm) force_uaccess_end(to_kthread(tsk)->oldfs); task_lock(tsk); + /* + * When a kthread stops operating on an address space, the loop + * in membarrier_{private,global}_expedited() may not observe + * that tsk->mm, and not issue an IPI. Membarrier requires a + * memory barrier after accessing user-space memory, before + * clearing tsk->mm. 
+ */ + smp_mb__after_spinlock(); sync_mm_rss(mm); local_irq_disable(); tsk->mm = NULL; + membarrier_update_current_mm(NULL); /* active_mm is still 'mm' */ enter_lazy_tlb(mm, tsk); local_irq_enable(); diff --git a/kernel/locking/lock_events_list.h b/kernel/locking/lock_events_list.h index 239039d0ce21..97fb6f3f840a 100644 --- a/kernel/locking/lock_events_list.h +++ b/kernel/locking/lock_events_list.h @@ -56,13 +56,11 @@ LOCK_EVENT(rwsem_sleep_reader) /* # of reader sleeps */ LOCK_EVENT(rwsem_sleep_writer) /* # of writer sleeps */ LOCK_EVENT(rwsem_wake_reader) /* # of reader wakeups */ LOCK_EVENT(rwsem_wake_writer) /* # of writer wakeups */ -LOCK_EVENT(rwsem_opt_rlock) /* # of opt-acquired read locks */ -LOCK_EVENT(rwsem_opt_wlock) /* # of opt-acquired write locks */ +LOCK_EVENT(rwsem_opt_lock) /* # of opt-acquired write locks */ LOCK_EVENT(rwsem_opt_fail) /* # of failed optspins */ LOCK_EVENT(rwsem_opt_nospin) /* # of disabled optspins */ -LOCK_EVENT(rwsem_opt_norspin) /* # of disabled reader-only optspins */ -LOCK_EVENT(rwsem_opt_rlock2) /* # of opt-acquired 2ndary read locks */ LOCK_EVENT(rwsem_rlock) /* # of read locks acquired */ +LOCK_EVENT(rwsem_rlock_steal) /* # of read locks by lock stealing */ LOCK_EVENT(rwsem_rlock_fast) /* # of fast read locks acquired */ LOCK_EVENT(rwsem_rlock_fail) /* # of failed read lock acquisitions */ LOCK_EVENT(rwsem_rlock_handoff) /* # of read lock handoffs */ diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c index 3e99dfef8408..c1418b47f625 100644 --- a/kernel/locking/lockdep.c +++ b/kernel/locking/lockdep.c @@ -84,7 +84,7 @@ static inline bool lockdep_enabled(void) if (!debug_locks) return false; - if (raw_cpu_read(lockdep_recursion)) + if (this_cpu_read(lockdep_recursion)) return false; if (current->lockdep_recursion) @@ -108,19 +108,21 @@ static inline void lockdep_lock(void) { DEBUG_LOCKS_WARN_ON(!irqs_disabled()); + __this_cpu_inc(lockdep_recursion); arch_spin_lock(&__lock); __owner = current; - __this_cpu_inc(lockdep_recursion); } static inline void lockdep_unlock(void) { + DEBUG_LOCKS_WARN_ON(!irqs_disabled()); + if (debug_locks && DEBUG_LOCKS_WARN_ON(__owner != current)) return; - __this_cpu_dec(lockdep_recursion); __owner = NULL; arch_spin_unlock(&__lock); + __this_cpu_dec(lockdep_recursion); } static inline bool lockdep_assert_locked(void) @@ -2765,7 +2767,9 @@ print_deadlock_bug(struct task_struct *curr, struct held_lock *prev, * (Note that this has to be done separately, because the graph cannot * detect such classes of deadlocks.) * - * Returns: 0 on deadlock detected, 1 on OK, 2 on recursive read + * Returns: 0 on deadlock detected, 1 on OK, 2 if another lock with the same + * lock class is held but nest_lock is also held, i.e. we rely on the + * nest_lock to avoid the deadlock. */ static int check_deadlock(struct task_struct *curr, struct held_lock *next) @@ -2788,7 +2792,7 @@ check_deadlock(struct task_struct *curr, struct held_lock *next) * lock class (i.e. 
read_lock(lock)+read_lock(lock)): */ if ((next->read == 2) && prev->read) - return 2; + continue; /* * We're holding the nest_lock, which serializes this lock's @@ -3593,15 +3597,12 @@ static int validate_chain(struct task_struct *curr, if (!ret) return 0; /* - * Mark recursive read, as we jump over it when - * building dependencies (just like we jump over - * trylock entries): - */ - if (ret == 2) - hlock->read = 2; - /* * Add dependency only if this lock is not the head - * of the chain, and if it's not a secondary read-lock: + * of the chain, and if the new lock introduces no more + * lock dependency (because we already hold a lock with the + * same lock class) nor deadlock (because the nest_lock + * serializes nesting locks), see the comments for + * check_deadlock(). */ if (!chain_head && ret != 2) { if (!check_prevs_add(curr, hlock)) @@ -4057,7 +4058,7 @@ void lockdep_hardirqs_on_prepare(unsigned long ip) if (unlikely(in_nmi())) return; - if (unlikely(__this_cpu_read(lockdep_recursion))) + if (unlikely(this_cpu_read(lockdep_recursion))) return; if (unlikely(lockdep_hardirqs_enabled())) { @@ -4126,7 +4127,7 @@ void noinstr lockdep_hardirqs_on(unsigned long ip) goto skip_checks; } - if (unlikely(__this_cpu_read(lockdep_recursion))) + if (unlikely(this_cpu_read(lockdep_recursion))) return; if (lockdep_hardirqs_enabled()) { @@ -4396,6 +4397,9 @@ static int mark_lock(struct task_struct *curr, struct held_lock *this, if (unlikely(hlock_class(this)->usage_mask & new_mask)) goto unlock; + if (!hlock_class(this)->usage_mask) + debug_atomic_dec(nr_unused_locks); + hlock_class(this)->usage_mask |= new_mask; if (new_bit < LOCK_TRACE_STATES) { @@ -4403,19 +4407,10 @@ static int mark_lock(struct task_struct *curr, struct held_lock *this, return 0; } - switch (new_bit) { - case 0 ... LOCK_USED-1: + if (new_bit < LOCK_USED) { ret = mark_lock_irq(curr, this, new_bit); if (!ret) return 0; - break; - - case LOCK_USED: - debug_atomic_dec(nr_unused_locks); - break; - - default: - break; } unlock: diff --git a/kernel/locking/locktorture.c b/kernel/locking/locktorture.c index 62d215b2e39f..fd838cea3934 100644 --- a/kernel/locking/locktorture.c +++ b/kernel/locking/locktorture.c @@ -29,6 +29,7 @@ #include <linux/slab.h> #include <linux/percpu-rwsem.h> #include <linux/torture.h> +#include <linux/reboot.h> MODULE_LICENSE("GPL"); MODULE_AUTHOR("Paul E. 
McKenney <paulmck@linux.ibm.com>"); @@ -60,6 +61,7 @@ static struct task_struct **reader_tasks; static bool lock_is_write_held; static bool lock_is_read_held; +static unsigned long last_lock_release; struct lock_stress_stats { long n_lock_fail; @@ -74,6 +76,7 @@ static void lock_torture_cleanup(void); */ struct lock_torture_ops { void (*init)(void); + void (*exit)(void); int (*writelock)(void); void (*write_delay)(struct torture_random_state *trsp); void (*task_boost)(struct torture_random_state *trsp); @@ -90,12 +93,13 @@ struct lock_torture_cxt { int nrealwriters_stress; int nrealreaders_stress; bool debug_lock; + bool init_called; atomic_t n_lock_torture_errors; struct lock_torture_ops *cur_ops; struct lock_stress_stats *lwsa; /* writer statistics */ struct lock_stress_stats *lrsa; /* reader statistics */ }; -static struct lock_torture_cxt cxt = { 0, 0, false, +static struct lock_torture_cxt cxt = { 0, 0, false, false, ATOMIC_INIT(0), NULL, NULL}; /* @@ -571,6 +575,11 @@ static void torture_percpu_rwsem_init(void) BUG_ON(percpu_init_rwsem(&pcpu_rwsem)); } +static void torture_percpu_rwsem_exit(void) +{ + percpu_free_rwsem(&pcpu_rwsem); +} + static int torture_percpu_rwsem_down_write(void) __acquires(pcpu_rwsem) { percpu_down_write(&pcpu_rwsem); @@ -595,6 +604,7 @@ static void torture_percpu_rwsem_up_read(void) __releases(pcpu_rwsem) static struct lock_torture_ops percpu_rwsem_lock_ops = { .init = torture_percpu_rwsem_init, + .exit = torture_percpu_rwsem_exit, .writelock = torture_percpu_rwsem_down_write, .write_delay = torture_rwsem_write_delay, .task_boost = torture_boost_dummy, @@ -632,6 +642,7 @@ static int lock_torture_writer(void *arg) lwsp->n_lock_acquired++; cxt.cur_ops->write_delay(&rand); lock_is_write_held = false; + WRITE_ONCE(last_lock_release, jiffies); cxt.cur_ops->writeunlock(); stutter_wait("lock_torture_writer"); @@ -786,9 +797,10 @@ static void lock_torture_cleanup(void) /* * Indicates early cleanup, meaning that the test has not run, - * such as when passing bogus args when loading the module. As - * such, only perform the underlying torture-specific cleanups, - * and avoid anything related to locktorture. + * such as when passing bogus args when loading the module. + * However, cxt->cur_ops.init() may have been invoked, so besides + * performing the underlying torture-specific cleanups, cur_ops.exit() + * will be invoked if needed.
*/ if (!cxt.lwsa && !cxt.lrsa) goto end; @@ -828,6 +840,11 @@ static void lock_torture_cleanup(void) cxt.lrsa = NULL; end: + if (cxt.init_called) { + if (cxt.cur_ops->exit) + cxt.cur_ops->exit(); + cxt.init_called = false; + } torture_cleanup_end(); } @@ -868,14 +885,17 @@ static int __init lock_torture_init(void) goto unwind; } - if (nwriters_stress == 0 && nreaders_stress == 0) { + if (nwriters_stress == 0 && + (!cxt.cur_ops->readlock || nreaders_stress == 0)) { pr_alert("lock-torture: must run at least one locking thread\n"); firsterr = -EINVAL; goto unwind; } - if (cxt.cur_ops->init) + if (cxt.cur_ops->init) { cxt.cur_ops->init(); + cxt.init_called = true; + } if (nwriters_stress >= 0) cxt.nrealwriters_stress = nwriters_stress; @@ -1038,6 +1058,10 @@ static int __init lock_torture_init(void) unwind: torture_init_end(); lock_torture_cleanup(); + if (shutdown_secs) { + WARN_ON(!IS_MODULE(CONFIG_LOCK_TORTURE_TEST)); + kernel_power_off(); + } return firsterr; } diff --git a/kernel/locking/rwsem.c b/kernel/locking/rwsem.c index f11b9bd3431d..ba67600c7b2c 100644 --- a/kernel/locking/rwsem.c +++ b/kernel/locking/rwsem.c @@ -31,19 +31,13 @@ #include "lock_events.h" /* - * The least significant 3 bits of the owner value has the following + * The least significant 2 bits of the owner value has the following * meanings when set. * - Bit 0: RWSEM_READER_OWNED - The rwsem is owned by readers - * - Bit 1: RWSEM_RD_NONSPINNABLE - Readers cannot spin on this lock. - * - Bit 2: RWSEM_WR_NONSPINNABLE - Writers cannot spin on this lock. + * - Bit 1: RWSEM_NONSPINNABLE - Cannot spin on a reader-owned lock * - * When the rwsem is either owned by an anonymous writer, or it is - * reader-owned, but a spinning writer has timed out, both nonspinnable - * bits will be set to disable optimistic spinning by readers and writers. - * In the later case, the last unlocking reader should then check the - * writer nonspinnable bit and clear it only to give writers preference - * to acquire the lock via optimistic spinning, but not readers. Similar - * action is also done in the reader slowpath. + * When the rwsem is reader-owned and a spinning writer has timed out, + * the nonspinnable bit will be set to disable optimistic spinning. * When a writer acquires a rwsem, it puts its task_struct pointer * into the owner field. It is cleared after an unlock. @@ -59,46 +53,14 @@ * is involved. Ideally we would like to track all the readers that own * a rwsem, but the overhead is simply too big. * - * Reader optimistic spinning is helpful when the reader critical section - * is short and there aren't that many readers around. It makes readers - * relatively more preferred than writers. When a writer times out spinning - * on a reader-owned lock and set the nospinnable bits, there are two main - * reasons for that. - * - * 1) The reader critical section is long, perhaps the task sleeps after - * acquiring the read lock. - * 2) There are just too many readers contending the lock causing it to - * take a while to service all of them. - * - * In the former case, long reader critical section will impede the progress - * of writers which is usually more important for system performance. In - * the later case, reader optimistic spinning tends to make the reader - * groups that contain readers that acquire the lock together smaller - * leading to more of them. That may hurt performance in some cases. 
In - * other words, the setting of nonspinnable bits indicates that reader - * optimistic spinning may not be helpful for those workloads that cause - * it. - * - * Therefore, any writers that had observed the setting of the writer - * nonspinnable bit for a given rwsem after they fail to acquire the lock - * via optimistic spinning will set the reader nonspinnable bit once they - * acquire the write lock. Similarly, readers that observe the setting - * of reader nonspinnable bit at slowpath entry will set the reader - * nonspinnable bits when they acquire the read lock via the wakeup path. - * - * Once the reader nonspinnable bit is on, it will only be reset when - * a writer is able to acquire the rwsem in the fast path or somehow a - * reader or writer in the slowpath doesn't observe the nonspinable bit. - * - * This is to discourage reader optmistic spinning on that particular - * rwsem and make writers more preferred. This adaptive disabling of reader - * optimistic spinning will alleviate the negative side effect of this - * feature. + * A fast path reader optimistic lock stealing is supported when the rwsem + * is previously owned by a writer and the following conditions are met: + * - OSQ is empty + * - rwsem is not currently writer owned + * - the handoff isn't set. */ #define RWSEM_READER_OWNED (1UL << 0) -#define RWSEM_RD_NONSPINNABLE (1UL << 1) -#define RWSEM_WR_NONSPINNABLE (1UL << 2) -#define RWSEM_NONSPINNABLE (RWSEM_RD_NONSPINNABLE | RWSEM_WR_NONSPINNABLE) +#define RWSEM_NONSPINNABLE (1UL << 1) #define RWSEM_OWNER_FLAGS_MASK (RWSEM_READER_OWNED | RWSEM_NONSPINNABLE) #ifdef CONFIG_DEBUG_RWSEMS @@ -203,7 +165,7 @@ static inline void __rwsem_set_reader_owned(struct rw_semaphore *sem, struct task_struct *owner) { unsigned long val = (unsigned long)owner | RWSEM_READER_OWNED | - (atomic_long_read(&sem->owner) & RWSEM_RD_NONSPINNABLE); + (atomic_long_read(&sem->owner) & RWSEM_NONSPINNABLE); atomic_long_set(&sem->owner, val); } @@ -270,12 +232,31 @@ static inline void rwsem_set_nonspinnable(struct rw_semaphore *sem) owner | RWSEM_NONSPINNABLE)); } -static inline bool rwsem_read_trylock(struct rw_semaphore *sem) +static inline bool rwsem_read_trylock(struct rw_semaphore *sem, long *cntp) { - long cnt = atomic_long_add_return_acquire(RWSEM_READER_BIAS, &sem->count); - if (WARN_ON_ONCE(cnt < 0)) + *cntp = atomic_long_add_return_acquire(RWSEM_READER_BIAS, &sem->count); + + if (WARN_ON_ONCE(*cntp < 0)) rwsem_set_nonspinnable(sem); - return !(cnt & RWSEM_READ_FAILED_MASK); + + if (!(*cntp & RWSEM_READ_FAILED_MASK)) { + rwsem_set_reader_owned(sem); + return true; + } + + return false; +} + +static inline bool rwsem_write_trylock(struct rw_semaphore *sem) +{ + long tmp = RWSEM_UNLOCKED_VALUE; + + if (atomic_long_try_cmpxchg_acquire(&sem->count, &tmp, RWSEM_WRITER_LOCKED)) { + rwsem_set_owner(sem); + return true; + } + + return false; } /* @@ -353,7 +334,6 @@ struct rwsem_waiter { struct task_struct *task; enum rwsem_waiter_type type; unsigned long timeout; - unsigned long last_rowner; }; #define rwsem_first_waiter(sem) \ list_first_entry(&sem->wait_list, struct rwsem_waiter, list) @@ -467,10 +447,6 @@ static void rwsem_mark_wake(struct rw_semaphore *sem, * the reader is copied over. 
*/ owner = waiter->task; - if (waiter->last_rowner & RWSEM_RD_NONSPINNABLE) { - owner = (void *)((unsigned long)owner | RWSEM_RD_NONSPINNABLE); - lockevent_inc(rwsem_opt_norspin); - } __rwsem_set_reader_owned(sem, owner); } @@ -602,30 +578,6 @@ static inline bool rwsem_try_write_lock(struct rw_semaphore *sem, #ifdef CONFIG_RWSEM_SPIN_ON_OWNER /* - * Try to acquire read lock before the reader is put on wait queue. - * Lock acquisition isn't allowed if the rwsem is locked or a writer handoff - * is ongoing. - */ -static inline bool rwsem_try_read_lock_unqueued(struct rw_semaphore *sem) -{ - long count = atomic_long_read(&sem->count); - - if (count & (RWSEM_WRITER_MASK | RWSEM_FLAG_HANDOFF)) - return false; - - count = atomic_long_fetch_add_acquire(RWSEM_READER_BIAS, &sem->count); - if (!(count & (RWSEM_WRITER_MASK | RWSEM_FLAG_HANDOFF))) { - rwsem_set_reader_owned(sem); - lockevent_inc(rwsem_opt_rlock); - return true; - } - - /* Back out the change */ - atomic_long_add(-RWSEM_READER_BIAS, &sem->count); - return false; -} - -/* * Try to acquire write lock before the writer has been put on wait queue. */ static inline bool rwsem_try_write_lock_unqueued(struct rw_semaphore *sem) @@ -636,7 +588,7 @@ static inline bool rwsem_try_write_lock_unqueued(struct rw_semaphore *sem) if (atomic_long_try_cmpxchg_acquire(&sem->count, &count, count | RWSEM_WRITER_LOCKED)) { rwsem_set_owner(sem); - lockevent_inc(rwsem_opt_wlock); + lockevent_inc(rwsem_opt_lock); return true; } } @@ -652,8 +604,7 @@ static inline bool owner_on_cpu(struct task_struct *owner) return owner->on_cpu && !vcpu_is_preempted(task_cpu(owner)); } -static inline bool rwsem_can_spin_on_owner(struct rw_semaphore *sem, - unsigned long nonspinnable) +static inline bool rwsem_can_spin_on_owner(struct rw_semaphore *sem) { struct task_struct *owner; unsigned long flags; @@ -670,7 +621,7 @@ static inline bool rwsem_can_spin_on_owner(struct rw_semaphore *sem, /* * Don't check the read-owner as the entry may be stale. 
*/ - if ((flags & nonspinnable) || + if ((flags & RWSEM_NONSPINNABLE) || (owner && !(flags & RWSEM_READER_OWNED) && !owner_on_cpu(owner))) ret = false; rcu_read_unlock(); @@ -700,9 +651,9 @@ enum owner_state { #define OWNER_SPINNABLE (OWNER_NULL | OWNER_WRITER | OWNER_READER) static inline enum owner_state -rwsem_owner_state(struct task_struct *owner, unsigned long flags, unsigned long nonspinnable) +rwsem_owner_state(struct task_struct *owner, unsigned long flags) { - if (flags & nonspinnable) + if (flags & RWSEM_NONSPINNABLE) return OWNER_NONSPINNABLE; if (flags & RWSEM_READER_OWNED) @@ -712,14 +663,14 @@ rwsem_owner_state(struct task_struct *owner, unsigned long flags, unsigned long } static noinline enum owner_state -rwsem_spin_on_owner(struct rw_semaphore *sem, unsigned long nonspinnable) +rwsem_spin_on_owner(struct rw_semaphore *sem) { struct task_struct *new, *owner; unsigned long flags, new_flags; enum owner_state state; owner = rwsem_owner_flags(sem, &flags); - state = rwsem_owner_state(owner, flags, nonspinnable); + state = rwsem_owner_state(owner, flags); if (state != OWNER_WRITER) return state; @@ -733,7 +684,7 @@ rwsem_spin_on_owner(struct rw_semaphore *sem, unsigned long nonspinnable) */ new = rwsem_owner_flags(sem, &new_flags); if ((new != owner) || (new_flags != flags)) { - state = rwsem_owner_state(new, new_flags, nonspinnable); + state = rwsem_owner_state(new, new_flags); break; } @@ -782,14 +733,12 @@ static inline u64 rwsem_rspin_threshold(struct rw_semaphore *sem) return sched_clock() + delta; } -static bool rwsem_optimistic_spin(struct rw_semaphore *sem, bool wlock) +static bool rwsem_optimistic_spin(struct rw_semaphore *sem) { bool taken = false; int prev_owner_state = OWNER_NULL; int loop = 0; u64 rspin_threshold = 0; - unsigned long nonspinnable = wlock ? RWSEM_WR_NONSPINNABLE - : RWSEM_RD_NONSPINNABLE; preempt_disable(); @@ -806,15 +755,14 @@ static bool rwsem_optimistic_spin(struct rw_semaphore *sem, bool wlock) for (;;) { enum owner_state owner_state; - owner_state = rwsem_spin_on_owner(sem, nonspinnable); + owner_state = rwsem_spin_on_owner(sem); if (!(owner_state & OWNER_SPINNABLE)) break; /* * Try to acquire the lock */ - taken = wlock ? rwsem_try_write_lock_unqueued(sem) - : rwsem_try_read_lock_unqueued(sem); + taken = rwsem_try_write_lock_unqueued(sem); if (taken) break; @@ -822,7 +770,7 @@ static bool rwsem_optimistic_spin(struct rw_semaphore *sem, bool wlock) /* * Time-based reader-owned rwsem optimistic spinning */ - if (wlock && (owner_state == OWNER_READER)) { + if (owner_state == OWNER_READER) { /* * Re-initialize rspin_threshold every time when * the owner state changes from non-reader to reader. @@ -831,7 +779,7 @@ static bool rwsem_optimistic_spin(struct rw_semaphore *sem, bool wlock) * the beginning of the 2nd reader phase. */ if (prev_owner_state != OWNER_READER) { - if (rwsem_test_oflags(sem, nonspinnable)) + if (rwsem_test_oflags(sem, RWSEM_NONSPINNABLE)) break; rspin_threshold = rwsem_rspin_threshold(sem); loop = 0; @@ -907,78 +855,30 @@ done: } /* - * Clear the owner's RWSEM_WR_NONSPINNABLE bit if it is set. This should + * Clear the owner's RWSEM_NONSPINNABLE bit if it is set. This should * only be called when the reader count reaches 0. - * - * This give writers better chance to acquire the rwsem first before - * readers when the rwsem was being held by readers for a relatively long - * period of time. 
Race can happen that an optimistic spinner may have - * just stolen the rwsem and set the owner, but just clearing the - * RWSEM_WR_NONSPINNABLE bit will do no harm anyway. */ -static inline void clear_wr_nonspinnable(struct rw_semaphore *sem) +static inline void clear_nonspinnable(struct rw_semaphore *sem) { - if (rwsem_test_oflags(sem, RWSEM_WR_NONSPINNABLE)) - atomic_long_andnot(RWSEM_WR_NONSPINNABLE, &sem->owner); + if (rwsem_test_oflags(sem, RWSEM_NONSPINNABLE)) + atomic_long_andnot(RWSEM_NONSPINNABLE, &sem->owner); } -/* - * This function is called when the reader fails to acquire the lock via - * optimistic spinning. In this case we will still attempt to do a trylock - * when comparing the rwsem state right now with the state when entering - * the slowpath indicates that the reader is still in a valid reader phase. - * This happens when the following conditions are true: - * - * 1) The lock is currently reader owned, and - * 2) The lock is previously not reader-owned or the last read owner changes. - * - * In the former case, we have transitioned from a writer phase to a - * reader-phase while spinning. In the latter case, it means the reader - * phase hasn't ended when we entered the optimistic spinning loop. In - * both cases, the reader is eligible to acquire the lock. This is the - * secondary path where a read lock is acquired optimistically. - * - * The reader non-spinnable bit wasn't set at time of entry or it will - * not be here at all. - */ -static inline bool rwsem_reader_phase_trylock(struct rw_semaphore *sem, - unsigned long last_rowner) -{ - unsigned long owner = atomic_long_read(&sem->owner); - - if (!(owner & RWSEM_READER_OWNED)) - return false; - - if (((owner ^ last_rowner) & ~RWSEM_OWNER_FLAGS_MASK) && - rwsem_try_read_lock_unqueued(sem)) { - lockevent_inc(rwsem_opt_rlock2); - lockevent_add(rwsem_opt_fail, -1); - return true; - } - return false; -} #else -static inline bool rwsem_can_spin_on_owner(struct rw_semaphore *sem, - unsigned long nonspinnable) +static inline bool rwsem_can_spin_on_owner(struct rw_semaphore *sem) { return false; } -static inline bool rwsem_optimistic_spin(struct rw_semaphore *sem, bool wlock) +static inline bool rwsem_optimistic_spin(struct rw_semaphore *sem) { return false; } -static inline void clear_wr_nonspinnable(struct rw_semaphore *sem) { } - -static inline bool rwsem_reader_phase_trylock(struct rw_semaphore *sem, - unsigned long last_rowner) -{ - return false; -} +static inline void clear_nonspinnable(struct rw_semaphore *sem) { } static inline int -rwsem_spin_on_owner(struct rw_semaphore *sem, unsigned long nonspinnable) +rwsem_spin_on_owner(struct rw_semaphore *sem) { return 0; } @@ -989,36 +889,35 @@ rwsem_spin_on_owner(struct rw_semaphore *sem, unsigned long nonspinnable) * Wait for the read lock to be granted */ static struct rw_semaphore __sched * -rwsem_down_read_slowpath(struct rw_semaphore *sem, int state) +rwsem_down_read_slowpath(struct rw_semaphore *sem, long count, int state) { - long count, adjustment = -RWSEM_READER_BIAS; + long adjustment = -RWSEM_READER_BIAS; + long rcnt = (count >> RWSEM_READER_SHIFT); struct rwsem_waiter waiter; DEFINE_WAKE_Q(wake_q); bool wake = false; /* - * Save the current read-owner of rwsem, if available, and the - * reader nonspinnable bit. + * To prevent a constant stream of readers from starving a sleeping + * waiter, don't attempt optimistic lock stealing if the lock is + * currently owned by readers. 
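To make the new stealing rule concrete, here is a minimal userspace sketch of the test the read slowpath applies below; the flag and shift values mirror the definitions used in rwsem.c and are repeated here purely for illustration:

#include <stdbool.h>
#include <stdio.h>

#define RWSEM_WRITER_LOCKED	(1UL << 0)	/* count word */
#define RWSEM_FLAG_HANDOFF	(1UL << 2)	/* count word */
#define RWSEM_READER_SHIFT	8
#define RWSEM_READER_OWNED	(1UL << 0)	/* owner word */

/* True when a reader may steal the lock without queuing. */
static bool may_steal_read_lock(unsigned long count, unsigned long owner)
{
	unsigned long rcnt = count >> RWSEM_READER_SHIFT;

	/* Reader-owned with other active readers: go queue, so a
	 * sleeping waiter is not starved by a stream of readers. */
	if ((owner & RWSEM_READER_OWNED) && rcnt > 1 &&
	    !(count & RWSEM_WRITER_LOCKED))
		return false;

	/* Steal only if no writer holds the lock and no handoff is set. */
	return !(count & (RWSEM_WRITER_LOCKED | RWSEM_FLAG_HANDOFF));
}

int main(void)
{
	/* Sole reader (our own bias), writer-free: stealing is allowed. */
	printf("%d\n", may_steal_read_lock(1UL << RWSEM_READER_SHIFT, 0));
	return 0;
}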
*/ - waiter.last_rowner = atomic_long_read(&sem->owner); - if (!(waiter.last_rowner & RWSEM_READER_OWNED)) - waiter.last_rowner &= RWSEM_RD_NONSPINNABLE; - - if (!rwsem_can_spin_on_owner(sem, RWSEM_RD_NONSPINNABLE)) + if ((atomic_long_read(&sem->owner) & RWSEM_READER_OWNED) && + (rcnt > 1) && !(count & RWSEM_WRITER_LOCKED)) goto queue; /* - * Undo read bias from down_read() and do optimistic spinning. + * Reader optimistic lock stealing. */ - atomic_long_add(-RWSEM_READER_BIAS, &sem->count); - adjustment = 0; - if (rwsem_optimistic_spin(sem, false)) { - /* rwsem_optimistic_spin() implies ACQUIRE on success */ + if (!(count & (RWSEM_WRITER_LOCKED | RWSEM_FLAG_HANDOFF))) { + rwsem_set_reader_owned(sem); + lockevent_inc(rwsem_rlock_steal); + /* - * Wake up other readers in the wait list if the front - * waiter is a reader. + * Wake up other readers in the wait queue if it is + * the first reader. */ - if ((atomic_long_read(&sem->count) & RWSEM_FLAG_WAITERS)) { + if ((rcnt == 1) && (count & RWSEM_FLAG_WAITERS)) { raw_spin_lock_irq(&sem->wait_lock); if (!list_empty(&sem->wait_list)) rwsem_mark_wake(sem, RWSEM_WAKE_READ_OWNED, @@ -1027,9 +926,6 @@ rwsem_down_read_slowpath(struct rw_semaphore *sem, int state) wake_up_q(&wake_q); } return sem; - } else if (rwsem_reader_phase_trylock(sem, waiter.last_rowner)) { - /* rwsem_reader_phase_trylock() implies ACQUIRE on success */ - return sem; } queue: @@ -1045,7 +941,7 @@ queue: * exit the slowpath and return immediately as its * RWSEM_READER_BIAS has already been set in the count. */ - if (adjustment && !(atomic_long_read(&sem->count) & + if (!(atomic_long_read(&sem->count) & (RWSEM_WRITER_MASK | RWSEM_FLAG_HANDOFF))) { /* Provide lock ACQUIRE */ smp_acquire__after_ctrl_dep(); @@ -1059,10 +955,7 @@ queue: list_add_tail(&waiter.list, &sem->wait_list); /* we're now waiting on the lock, but no longer actively locking */ - if (adjustment) - count = atomic_long_add_return(adjustment, &sem->count); - else - count = atomic_long_read(&sem->count); + count = atomic_long_add_return(adjustment, &sem->count); /* * If there are no active locks, wake the front queued process(es). @@ -1071,7 +964,7 @@ queue: * wake our own waiter to join the existing active readers ! */ if (!(count & RWSEM_LOCK_MASK)) { - clear_wr_nonspinnable(sem); + clear_nonspinnable(sem); wake = true; } if (wake || (!(count & RWSEM_WRITER_MASK) && @@ -1117,46 +1010,24 @@ out_nolock: } /* - * This function is called by the a write lock owner. So the owner value - * won't get changed by others. - */ -static inline void rwsem_disable_reader_optspin(struct rw_semaphore *sem, - bool disable) -{ - if (unlikely(disable)) { - atomic_long_or(RWSEM_RD_NONSPINNABLE, &sem->owner); - lockevent_inc(rwsem_opt_norspin); - } -} - -/* * Wait until we successfully acquire the write lock */ static struct rw_semaphore * rwsem_down_write_slowpath(struct rw_semaphore *sem, int state) { long count; - bool disable_rspin; enum writer_wait_state wstate; struct rwsem_waiter waiter; struct rw_semaphore *ret = sem; DEFINE_WAKE_Q(wake_q); /* do optimistic spinning and steal lock if possible */ - if (rwsem_can_spin_on_owner(sem, RWSEM_WR_NONSPINNABLE) && - rwsem_optimistic_spin(sem, true)) { + if (rwsem_can_spin_on_owner(sem) && rwsem_optimistic_spin(sem)) { /* rwsem_optimistic_spin() implies ACQUIRE on success */ return sem; } /* - * Disable reader optimistic spinning for this rwsem after - * acquiring the write lock when the setting of the nonspinnable - * bits are observed. 
- */ - disable_rspin = atomic_long_read(&sem->owner) & RWSEM_NONSPINNABLE; - - /* * Optimistic spinning failed, proceed to the slowpath * and block until we can acquire the sem. */ @@ -1224,7 +1095,7 @@ wait: * without sleeping. */ if (wstate == WRITER_HANDOFF && - rwsem_spin_on_owner(sem, RWSEM_NONSPINNABLE) == OWNER_NULL) + rwsem_spin_on_owner(sem) == OWNER_NULL) goto trylock_again; /* Block until there are no active lockers. */ @@ -1266,7 +1137,6 @@ trylock_again: } __set_current_state(TASK_RUNNING); list_del(&waiter.list); - rwsem_disable_reader_optspin(sem, disable_rspin); raw_spin_unlock_irq(&sem->wait_lock); lockevent_inc(rwsem_wlock); @@ -1335,26 +1205,31 @@ static struct rw_semaphore *rwsem_downgrade_wake(struct rw_semaphore *sem) /* * lock for reading */ -static inline void __down_read(struct rw_semaphore *sem) +static inline int __down_read_common(struct rw_semaphore *sem, int state) { - if (!rwsem_read_trylock(sem)) { - rwsem_down_read_slowpath(sem, TASK_UNINTERRUPTIBLE); + long count; + + if (!rwsem_read_trylock(sem, &count)) { + if (IS_ERR(rwsem_down_read_slowpath(sem, count, state))) + return -EINTR; DEBUG_RWSEMS_WARN_ON(!is_rwsem_reader_owned(sem), sem); - } else { - rwsem_set_reader_owned(sem); } + return 0; +} + +static inline void __down_read(struct rw_semaphore *sem) +{ + __down_read_common(sem, TASK_UNINTERRUPTIBLE); +} + +static inline int __down_read_interruptible(struct rw_semaphore *sem) +{ + return __down_read_common(sem, TASK_INTERRUPTIBLE); } static inline int __down_read_killable(struct rw_semaphore *sem) { - if (!rwsem_read_trylock(sem)) { - if (IS_ERR(rwsem_down_read_slowpath(sem, TASK_KILLABLE))) - return -EINTR; - DEBUG_RWSEMS_WARN_ON(!is_rwsem_reader_owned(sem), sem); - } else { - rwsem_set_reader_owned(sem); - } - return 0; + return __down_read_common(sem, TASK_KILLABLE); } static inline int __down_read_trylock(struct rw_semaphore *sem) @@ -1380,44 +1255,30 @@ static inline int __down_read_trylock(struct rw_semaphore *sem) /* * lock for writing */ -static inline void __down_write(struct rw_semaphore *sem) +static inline int __down_write_common(struct rw_semaphore *sem, int state) { - long tmp = RWSEM_UNLOCKED_VALUE; + if (unlikely(!rwsem_write_trylock(sem))) { + if (IS_ERR(rwsem_down_write_slowpath(sem, state))) + return -EINTR; + } - if (unlikely(!atomic_long_try_cmpxchg_acquire(&sem->count, &tmp, - RWSEM_WRITER_LOCKED))) - rwsem_down_write_slowpath(sem, TASK_UNINTERRUPTIBLE); - else - rwsem_set_owner(sem); + return 0; } -static inline int __down_write_killable(struct rw_semaphore *sem) +static inline void __down_write(struct rw_semaphore *sem) { - long tmp = RWSEM_UNLOCKED_VALUE; + __down_write_common(sem, TASK_UNINTERRUPTIBLE); +} - if (unlikely(!atomic_long_try_cmpxchg_acquire(&sem->count, &tmp, - RWSEM_WRITER_LOCKED))) { - if (IS_ERR(rwsem_down_write_slowpath(sem, TASK_KILLABLE))) - return -EINTR; - } else { - rwsem_set_owner(sem); - } - return 0; +static inline int __down_write_killable(struct rw_semaphore *sem) +{ + return __down_write_common(sem, TASK_KILLABLE); } static inline int __down_write_trylock(struct rw_semaphore *sem) { - long tmp; - DEBUG_RWSEMS_WARN_ON(sem->magic != sem, sem); - - tmp = RWSEM_UNLOCKED_VALUE; - if (atomic_long_try_cmpxchg_acquire(&sem->count, &tmp, - RWSEM_WRITER_LOCKED)) { - rwsem_set_owner(sem); - return true; - } - return false; + return rwsem_write_trylock(sem); } /* @@ -1435,7 +1296,7 @@ static inline void __up_read(struct rw_semaphore *sem) DEBUG_RWSEMS_WARN_ON(tmp < 0, sem); if (unlikely((tmp & 
(RWSEM_LOCK_MASK|RWSEM_FLAG_WAITERS)) == RWSEM_FLAG_WAITERS)) { - clear_wr_nonspinnable(sem); + clear_nonspinnable(sem); rwsem_wake(sem, tmp); } } @@ -1495,6 +1356,20 @@ void __sched down_read(struct rw_semaphore *sem) } EXPORT_SYMBOL(down_read); +int __sched down_read_interruptible(struct rw_semaphore *sem) +{ + might_sleep(); + rwsem_acquire_read(&sem->dep_map, 0, 0, _RET_IP_); + + if (LOCK_CONTENDED_RETURN(sem, __down_read_trylock, __down_read_interruptible)) { + rwsem_release(&sem->dep_map, _RET_IP_); + return -EINTR; + } + + return 0; +} +EXPORT_SYMBOL(down_read_interruptible); + int __sched down_read_killable(struct rw_semaphore *sem) { might_sleep(); @@ -1605,6 +1480,20 @@ void down_read_nested(struct rw_semaphore *sem, int subclass) } EXPORT_SYMBOL(down_read_nested); +int down_read_killable_nested(struct rw_semaphore *sem, int subclass) +{ + might_sleep(); + rwsem_acquire_read(&sem->dep_map, subclass, 0, _RET_IP_); + + if (LOCK_CONTENDED_RETURN(sem, __down_read_trylock, __down_read_killable)) { + rwsem_release(&sem->dep_map, _RET_IP_); + return -EINTR; + } + + return 0; +} +EXPORT_SYMBOL(down_read_killable_nested); + void _down_write_nest_lock(struct rw_semaphore *sem, struct lockdep_map *nest) { might_sleep(); diff --git a/kernel/module.c b/kernel/module.c index a4fa44a652a7..c3a9e972d3b2 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -380,6 +380,35 @@ static void *section_objs(const struct load_info *info, return (void *)info->sechdrs[sec].sh_addr; } +/* Find a module section: 0 means not found. Ignores SHF_ALLOC flag. */ +static unsigned int find_any_sec(const struct load_info *info, const char *name) +{ + unsigned int i; + + for (i = 1; i < info->hdr->e_shnum; i++) { + Elf_Shdr *shdr = &info->sechdrs[i]; + if (strcmp(info->secstrings + shdr->sh_name, name) == 0) + return i; + } + return 0; +} + +/* + * Find a module section, or NULL. Fill in number of "objects" in section. + * Ignores SHF_ALLOC flag. + */ +static __maybe_unused void *any_section_objs(const struct load_info *info, + const char *name, + size_t object_size, + unsigned int *num) +{ + unsigned int sec = find_any_sec(info, name); + + /* Section 0 has sh_addr 0 and sh_size 0. */ + *num = info->sechdrs[sec].sh_size / object_size; + return (void *)info->sechdrs[sec].sh_addr; +} + /* Provided by the linker */ extern const struct kernel_symbol __start___ksymtab[]; extern const struct kernel_symbol __stop___ksymtab[]; @@ -3250,6 +3279,9 @@ static int find_module_sections(struct module *mod, struct load_info *info) sizeof(*mod->bpf_raw_events), &mod->num_bpf_raw_events); #endif +#ifdef CONFIG_DEBUG_INFO_BTF_MODULES + mod->btf_data = any_section_objs(info, ".BTF", 1, &mod->btf_data_size); +#endif #ifdef CONFIG_JUMP_LABEL mod->jump_entries = section_objs(info, "__jump_table", sizeof(*mod->jump_entries), @@ -3677,6 +3709,10 @@ static noinline int do_init_module(struct module *mod) mod->init_layout.ro_size = 0; mod->init_layout.ro_after_init_size = 0; mod->init_layout.text_size = 0; +#ifdef CONFIG_DEBUG_INFO_BTF_MODULES + /* .BTF is not SHF_ALLOC and will get removed, so sanitize pointer */ + mod->btf_data = NULL; +#endif /* * We want to free module_init, but be aware that kallsyms may be * walking this with preempt disabled. 
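Because find_any_sec() above deliberately ignores SHF_ALLOC, its core is nothing more than a name scan over the section-header table. The same loop in stand-alone form, with the generic Elf64 types standing in for the kernel's Elf_Shdr:

#include <elf.h>
#include <string.h>

/* Index 0 doubles as "not found": the ELF spec reserves section 0,
 * which is also why any_section_objs() can rely on a zero sh_addr
 * and sh_size for the miss case. */
static unsigned int find_sec_model(const Elf64_Ehdr *hdr,
				   const Elf64_Shdr *sechdrs,
				   const char *secstrings,
				   const char *name)
{
	unsigned int i;

	for (i = 1; i < hdr->e_shnum; i++)
		if (strcmp(secstrings + sechdrs[i].sh_name, name) == 0)
			return i;
	return 0;
}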
In all the failure paths, we diff --git a/kernel/nsproxy.c b/kernel/nsproxy.c index 12dd41b39a7f..abc01fcad8c7 100644 --- a/kernel/nsproxy.c +++ b/kernel/nsproxy.c @@ -153,7 +153,6 @@ int copy_namespaces(unsigned long flags, struct task_struct *tsk) struct nsproxy *old_ns = tsk->nsproxy; struct user_namespace *user_ns = task_cred_xxx(tsk, user_ns); struct nsproxy *new_ns; - int ret; if (likely(!(flags & (CLONE_NEWNS | CLONE_NEWUTS | CLONE_NEWIPC | CLONE_NEWPID | CLONE_NEWNET | @@ -173,18 +172,14 @@ int copy_namespaces(unsigned long flags, struct task_struct *tsk) * it along with CLONE_NEWIPC. */ if ((flags & (CLONE_NEWIPC | CLONE_SYSVSEM)) == - (CLONE_NEWIPC | CLONE_SYSVSEM)) + (CLONE_NEWIPC | CLONE_SYSVSEM)) return -EINVAL; new_ns = create_new_namespaces(flags, tsk, user_ns, tsk->fs); if (IS_ERR(new_ns)) return PTR_ERR(new_ns); - ret = timens_on_fork(new_ns, tsk); - if (ret) { - free_nsproxy(new_ns); - return ret; - } + timens_on_fork(new_ns, tsk); tsk->nsproxy = new_ns; return 0; @@ -250,8 +245,8 @@ void switch_task_namespaces(struct task_struct *p, struct nsproxy *new) p->nsproxy = new; task_unlock(p); - if (ns && atomic_dec_and_test(&ns->count)) - free_nsproxy(ns); + if (ns) + put_nsproxy(ns); } void exit_task_namespaces(struct task_struct *p) diff --git a/kernel/panic.c b/kernel/panic.c index 396142ee43fd..332736a72a58 100644 --- a/kernel/panic.c +++ b/kernel/panic.c @@ -605,7 +605,8 @@ void __warn(const char *file, int line, void *caller, unsigned taint, panic("panic_on_warn set ...\n"); } - dump_stack(); + if (!regs) + dump_stack(); print_irqtrace_events(current); diff --git a/kernel/params.c b/kernel/params.c index 3835fb82c64b..164d79330849 100644 --- a/kernel/params.c +++ b/kernel/params.c @@ -530,7 +530,7 @@ struct module_param_attrs { unsigned int num; struct attribute_group grp; - struct param_attribute attrs[0]; + struct param_attribute attrs[]; }; #ifdef CONFIG_SYSFS diff --git a/kernel/pid.c b/kernel/pid.c index a96bc4bf4f86..47466d0bbc5b 100644 --- a/kernel/pid.c +++ b/kernel/pid.c @@ -73,7 +73,7 @@ int pid_max_max = PID_MAX_LIMIT; * the scheme scales to up to 4 million PIDs, runtime. 
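The kref-to-refcount conversion in the pid-namespace hunks just below turns put_pid_ns() into a plain dec-and-test walk up the parent chain. A userspace sketch of that shape, with C11 atomics standing in for refcount_t:

#include <stdatomic.h>
#include <stdbool.h>
#include <stdlib.h>

struct pidns_model {
	atomic_int count;		/* ns.count (refcount_t) stand-in */
	struct pidns_model *parent;
};

static struct pidns_model init_ns = { .count = 2, .parent = NULL };

static bool dec_and_test(atomic_int *c)
{
	return atomic_fetch_sub(c, 1) == 1;	/* refcount_dec_and_test() */
}

static void put_ns_model(struct pidns_model *ns)
{
	while (ns != &init_ns) {
		struct pidns_model *parent = ns->parent;

		if (!dec_and_test(&ns->count))
			break;
		free(ns);			/* destroy_pid_namespace() */
		ns = parent;
	}
}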
*/ struct pid_namespace init_pid_ns = { - .kref = KREF_INIT(2), + .ns.count = REFCOUNT_INIT(2), .idr = IDR_INIT(init_pid_ns.idr), .pid_allocated = PIDNS_ADDING, .level = 0, diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c index 9de21803a8ae..ca43239a255a 100644 --- a/kernel/pid_namespace.c +++ b/kernel/pid_namespace.c @@ -102,7 +102,7 @@ static struct pid_namespace *create_pid_namespace(struct user_namespace *user_ns goto out_free_idr; ns->ns.ops = &pidns_operations; - kref_init(&ns->kref); + refcount_set(&ns->ns.count, 1); ns->level = level; ns->parent = get_pid_ns(parent_pid_ns); ns->user_ns = get_user_ns(user_ns); @@ -148,22 +148,15 @@ struct pid_namespace *copy_pid_ns(unsigned long flags, return create_pid_namespace(user_ns, old_ns); } -static void free_pid_ns(struct kref *kref) -{ - struct pid_namespace *ns; - - ns = container_of(kref, struct pid_namespace, kref); - destroy_pid_namespace(ns); -} - void put_pid_ns(struct pid_namespace *ns) { struct pid_namespace *parent; while (ns != &init_pid_ns) { parent = ns->parent; - if (!kref_put(&ns->kref, free_pid_ns)) + if (!refcount_dec_and_test(&ns->ns.count)) break; + destroy_pid_namespace(ns); ns = parent; } } diff --git a/kernel/power/energy_model.c b/kernel/power/energy_model.c index c1ff7fa030ab..1358fa4abfa8 100644 --- a/kernel/power/energy_model.c +++ b/kernel/power/energy_model.c @@ -52,6 +52,17 @@ static int em_debug_cpus_show(struct seq_file *s, void *unused) } DEFINE_SHOW_ATTRIBUTE(em_debug_cpus); +static int em_debug_units_show(struct seq_file *s, void *unused) +{ + struct em_perf_domain *pd = s->private; + char *units = pd->milliwatts ? "milliWatts" : "bogoWatts"; + + seq_printf(s, "%s\n", units); + + return 0; +} +DEFINE_SHOW_ATTRIBUTE(em_debug_units); + static void em_debug_create_pd(struct device *dev) { struct dentry *d; @@ -64,6 +75,8 @@ static void em_debug_create_pd(struct device *dev) debugfs_create_file("cpus", 0444, d, dev->em_pd->cpus, &em_debug_cpus_fops); + debugfs_create_file("units", 0444, d, dev->em_pd, &em_debug_units_fops); + /* Create a sub-directory for each performance state */ for (i = 0; i < dev->em_pd->nr_perf_states; i++) em_debug_create_ps(&dev->em_pd->table[i], d); @@ -130,7 +143,7 @@ static int em_create_perf_table(struct device *dev, struct em_perf_domain *pd, /* * The power returned by active_state() is expected to be - * positive, in milli-watts and to fit into 16 bits. + * positive and to fit into 16 bits. */ if (!power || power > EM_MAX_POWER) { dev_err(dev, "EM: invalid power: %lu\n", @@ -250,17 +263,24 @@ EXPORT_SYMBOL_GPL(em_cpu_get); * @cpus : Pointer to cpumask_t, which in case of a CPU device is * obligatory. It can be taken from i.e. 'policy->cpus'. For other * type of devices this should be set to NULL. + * @milliwatts : Flag indicating that the power values are in milliWatts or + * in some other scale. It must be set properly. * * Create Energy Model tables for a performance domain using the callbacks * defined in cb. * + * The @milliwatts is important to set with correct value. Some kernel + * sub-systems might rely on this flag and check if all devices in the EM are + * using the same scale. + * * If multiple clients register the same performance domain, all but the first * registration will be ignored. 
* * Return 0 on success */ int em_dev_register_perf_domain(struct device *dev, unsigned int nr_states, - struct em_data_callback *cb, cpumask_t *cpus) + struct em_data_callback *cb, cpumask_t *cpus, + bool milliwatts) { unsigned long cap, prev_cap = 0; int cpu, ret; @@ -313,6 +333,8 @@ int em_dev_register_perf_domain(struct device *dev, unsigned int nr_states, if (ret) goto unlock; + dev->em_pd->milliwatts = milliwatts; + em_debug_create_pd(dev); dev_info(dev, "EM: created perf domain\n"); diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c index 2fc7d509a34f..da0b41914177 100644 --- a/kernel/power/hibernate.c +++ b/kernel/power/hibernate.c @@ -326,7 +326,7 @@ static int create_image(int platform_mode) if (!in_suspend) { events_check_enabled = false; - clear_free_pages(); + clear_or_poison_free_pages(); } platform_leave(platform_mode); diff --git a/kernel/power/power.h b/kernel/power/power.h index 24f12d534515..778bf431ec02 100644 --- a/kernel/power/power.h +++ b/kernel/power/power.h @@ -106,7 +106,7 @@ extern int create_basic_memory_bitmaps(void); extern void free_basic_memory_bitmaps(void); extern int hibernate_preallocate_memory(void); -extern void clear_free_pages(void); +extern void clear_or_poison_free_pages(void); /** * Auxiliary structure used for reading the snapshot image data and diff --git a/kernel/power/process.c b/kernel/power/process.c index 4b6a54da7e65..45b054b7b5ec 100644 --- a/kernel/power/process.c +++ b/kernel/power/process.c @@ -146,7 +146,7 @@ int freeze_processes(void) BUG_ON(in_atomic()); /* - * Now that the whole userspace is frozen we need to disbale + * Now that the whole userspace is frozen we need to disable * the OOM killer to disallow any further interference with * killable tasks. There is no guarantee oom victims will * ever reach a point they go away we have to wait with a timeout. diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c index 46b1804c1ddf..d63560e1cf87 100644 --- a/kernel/power/snapshot.c +++ b/kernel/power/snapshot.c @@ -76,6 +76,40 @@ static inline void hibernate_restore_protect_page(void *page_address) {} static inline void hibernate_restore_unprotect_page(void *page_address) {} #endif /* CONFIG_STRICT_KERNEL_RWX && CONFIG_ARCH_HAS_SET_MEMORY */ + +/* + * The calls to set_direct_map_*() should not fail because remapping a page + * here means that we only update protection bits in an existing PTE. + * It is still worth to have a warning here if something changes and this + * will no longer be the case. 
+ */ +static inline void hibernate_map_page(struct page *page) +{ + if (IS_ENABLED(CONFIG_ARCH_HAS_SET_DIRECT_MAP)) { + int ret = set_direct_map_default_noflush(page); + + if (ret) + pr_warn_once("Failed to remap page\n"); + } else { + debug_pagealloc_map_pages(page, 1); + } +} + +static inline void hibernate_unmap_page(struct page *page) +{ + if (IS_ENABLED(CONFIG_ARCH_HAS_SET_DIRECT_MAP)) { + unsigned long addr = (unsigned long)page_address(page); + int ret = set_direct_map_invalid_noflush(page); + + if (ret) + pr_warn_once("Failed to remap page\n"); + + flush_tlb_kernel_range(addr, addr + PAGE_SIZE); + } else { + debug_pagealloc_unmap_pages(page, 1); + } +} + static int swsusp_page_is_free(struct page *); static void swsusp_set_page_forbidden(struct page *); static void swsusp_unset_page_forbidden(struct page *); @@ -1144,7 +1178,15 @@ void free_basic_memory_bitmaps(void) pr_debug("Basic memory bitmaps freed\n"); } -void clear_free_pages(void) +static void clear_or_poison_free_page(struct page *page) +{ + if (page_poisoning_enabled_static()) + __kernel_poison_pages(page, 1); + else if (want_init_on_free()) + clear_highpage(page); +} + +void clear_or_poison_free_pages(void) { struct memory_bitmap *bm = free_pages_map; unsigned long pfn; @@ -1152,12 +1194,12 @@ void clear_free_pages(void) if (WARN_ON(!(free_pages_map))) return; - if (IS_ENABLED(CONFIG_PAGE_POISONING_ZERO) || want_init_on_free()) { + if (page_poisoning_enabled() || want_init_on_free()) { memory_bm_position_reset(bm); pfn = memory_bm_next_pfn(bm); while (pfn != BM_END_OF_MAP) { if (pfn_valid(pfn)) - clear_highpage(pfn_to_page(pfn)); + clear_or_poison_free_page(pfn_to_page(pfn)); pfn = memory_bm_next_pfn(bm); } @@ -1355,9 +1397,9 @@ static void safe_copy_page(void *dst, struct page *s_page) if (kernel_page_present(s_page)) { do_copy_page(dst, page_address(s_page)); } else { - kernel_map_pages(s_page, 1, 1); + hibernate_map_page(s_page); do_copy_page(dst, page_address(s_page)); - kernel_map_pages(s_page, 1, 0); + hibernate_unmap_page(s_page); } } diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c index 32391acc806b..d8cae434f9eb 100644 --- a/kernel/power/suspend.c +++ b/kernel/power/suspend.c @@ -224,6 +224,7 @@ EXPORT_SYMBOL_GPL(suspend_set_ops); /** * suspend_valid_only_mem - Generic memory-only valid callback. + * @state: Target system sleep state. * * Platform drivers that implement mem suspend only and only need to check for * that in their .valid() callback can use this instead of rolling their own @@ -335,6 +336,7 @@ static int suspend_test(int level) /** * suspend_prepare - Prepare for entering system sleep state. + * @state: Target system sleep state. * * Common code run for every system sleep state that can be entered (except for * hibernation). Run suspend notifiers, allocate the "suspend" console and diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index fe64a49344bf..28713dda3f6b 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -528,8 +528,8 @@ static int log_store(u32 caller_id, int facility, int level, if (dev_info) memcpy(&r.info->dev_info, dev_info, sizeof(r.info->dev_info)); - /* insert message */ - if ((flags & LOG_CONT) || !(flags & LOG_NEWLINE)) + /* A message without a trailing newline can be continued. 
*/ + if (!(flags & LOG_NEWLINE)) prb_commit(&e); else prb_final_commit(&e); @@ -3025,10 +3025,8 @@ static void wake_up_klogd_work_func(struct irq_work *irq_work) wake_up_interruptible(&log_wait); } -static DEFINE_PER_CPU(struct irq_work, wake_up_klogd_work) = { - .func = wake_up_klogd_work_func, - .flags = ATOMIC_INIT(IRQ_WORK_LAZY), -}; +static DEFINE_PER_CPU(struct irq_work, wake_up_klogd_work) = + IRQ_WORK_INIT_LAZY(wake_up_klogd_work_func); void wake_up_klogd(void) { diff --git a/kernel/printk/printk_ringbuffer.c b/kernel/printk/printk_ringbuffer.c index 24a960a89aa8..74e25a1704f2 100644 --- a/kernel/printk/printk_ringbuffer.c +++ b/kernel/printk/printk_ringbuffer.c @@ -345,7 +345,7 @@ DESC_ID((id) - DESCS_COUNT(desc_ring)) */ struct prb_data_block { unsigned long id; - char data[0]; + char data[]; }; /* @@ -882,8 +882,6 @@ static bool desc_reserve(struct printk_ringbuffer *rb, unsigned long *id_out) head_id = atomic_long_read(&desc_ring->head_id); /* LMM(desc_reserve:A) */ do { - desc = to_desc(desc_ring, head_id); - id = DESC_ID(head_id + 1); id_prev_wrap = DESC_ID_PREV_WRAP(desc_ring, id); diff --git a/kernel/ptrace.c b/kernel/ptrace.c index 43d6179508d6..61db50f7ca86 100644 --- a/kernel/ptrace.c +++ b/kernel/ptrace.c @@ -57,7 +57,7 @@ int ptrace_access_vm(struct task_struct *tsk, unsigned long addr, return 0; } - ret = __access_remote_vm(tsk, mm, addr, buf, len, gup_flags); + ret = __access_remote_vm(mm, addr, buf, len, gup_flags); mmput(mm); return ret; @@ -117,9 +117,9 @@ void __ptrace_unlink(struct task_struct *child) const struct cred *old_cred; BUG_ON(!child->ptrace); - clear_tsk_thread_flag(child, TIF_SYSCALL_TRACE); -#ifdef TIF_SYSCALL_EMU - clear_tsk_thread_flag(child, TIF_SYSCALL_EMU); + clear_task_syscall_work(child, SYSCALL_TRACE); +#if defined(CONFIG_GENERIC_ENTRY) || defined(TIF_SYSCALL_EMU) + clear_task_syscall_work(child, SYSCALL_EMU); #endif child->parent = child->real_parent; @@ -264,17 +264,11 @@ static int ptrace_check_attach(struct task_struct *child, bool ignore_state) return ret; } -static bool ptrace_has_cap(const struct cred *cred, struct user_namespace *ns, - unsigned int mode) +static bool ptrace_has_cap(struct user_namespace *ns, unsigned int mode) { - int ret; - if (mode & PTRACE_MODE_NOAUDIT) - ret = security_capable(cred, ns, CAP_SYS_PTRACE, CAP_OPT_NOAUDIT); - else - ret = security_capable(cred, ns, CAP_SYS_PTRACE, CAP_OPT_NONE); - - return ret == 0; + if (mode & PTRACE_MODE_NOAUDIT) + return ns_capable_noaudit(ns, CAP_SYS_PTRACE); + return ns_capable(ns, CAP_SYS_PTRACE); } /* Returns 0 on success, -errno on denial.
*/ @@ -326,7 +320,7 @@ static int __ptrace_may_access(struct task_struct *task, unsigned int mode) gid_eq(caller_gid, tcred->sgid) && gid_eq(caller_gid, tcred->gid)) goto ok; - if (ptrace_has_cap(cred, tcred->user_ns, mode)) + if (ptrace_has_cap(tcred->user_ns, mode)) goto ok; rcu_read_unlock(); return -EPERM; @@ -345,7 +339,7 @@ ok: mm = task->mm; if (mm && ((get_dumpable(mm) != SUID_DUMP_USER) && - !ptrace_has_cap(cred, mm->user_ns, mode))) + !ptrace_has_cap(mm->user_ns, mode))) return -EPERM; return security_ptrace_access_check(task, mode); @@ -812,15 +806,15 @@ static int ptrace_resume(struct task_struct *child, long request, return -EIO; if (request == PTRACE_SYSCALL) - set_tsk_thread_flag(child, TIF_SYSCALL_TRACE); + set_task_syscall_work(child, SYSCALL_TRACE); else - clear_tsk_thread_flag(child, TIF_SYSCALL_TRACE); + clear_task_syscall_work(child, SYSCALL_TRACE); -#ifdef TIF_SYSCALL_EMU +#if defined(CONFIG_GENERIC_ENTRY) || defined(TIF_SYSCALL_EMU) if (request == PTRACE_SYSEMU || request == PTRACE_SYSEMU_SINGLESTEP) - set_tsk_thread_flag(child, TIF_SYSCALL_EMU); + set_task_syscall_work(child, SYSCALL_EMU); else - clear_tsk_thread_flag(child, TIF_SYSCALL_EMU); + clear_task_syscall_work(child, SYSCALL_EMU); #endif if (is_singleblock(request)) { diff --git a/kernel/rcu/Kconfig b/kernel/rcu/Kconfig index b71e21f73c40..cdc57b4f6d48 100644 --- a/kernel/rcu/Kconfig +++ b/kernel/rcu/Kconfig @@ -221,19 +221,23 @@ config RCU_NOCB_CPU Use this option to reduce OS jitter for aggressive HPC or real-time workloads. It can also be used to offload RCU callback invocation to energy-efficient CPUs in battery-powered - asymmetric multiprocessors. + asymmetric multiprocessors. The price of this reduced jitter + is that the overhead of call_rcu() increases and that some + workloads will incur significant increases in context-switch + rates. This option offloads callback invocation from the set of CPUs specified at boot time by the rcu_nocbs parameter. For each such CPU, a kthread ("rcuox/N") will be created to invoke callbacks, where the "N" is the CPU being offloaded, and where - the "p" for RCU-preempt (PREEMPTION kernels) and "s" for RCU-sched - (!PREEMPTION kernels). Nothing prevents this kthread from running - on the specified CPUs, but (1) the kthreads may be preempted - between each callback, and (2) affinity or cgroups can be used - to force the kthreads to run on whatever set of CPUs is desired. - - Say Y here if you want to help to debug reduced OS jitter. + the "x" is "p" for RCU-preempt (PREEMPTION kernels) and "s" for + RCU-sched (!PREEMPTION kernels). Nothing prevents this kthread + from running on the specified CPUs, but (1) the kthreads may be + preempted between each callback, and (2) affinity or cgroups can + be used to force the kthreads to run on whatever set of CPUs is + desired. + + Say Y here if you need reduced OS jitter, despite added overhead. Say N here if you are unsure. 
config TASKS_TRACE_RCU_READ_MB diff --git a/kernel/rcu/rcu.h b/kernel/rcu/rcu.h index e01cba5e4b52..59ef1ae6dc37 100644 --- a/kernel/rcu/rcu.h +++ b/kernel/rcu/rcu.h @@ -533,4 +533,20 @@ static inline bool rcu_is_nocb_cpu(int cpu) { return false; } static inline void rcu_bind_current_to_nocb(void) { } #endif +#if !defined(CONFIG_TINY_RCU) && defined(CONFIG_TASKS_RCU) +void show_rcu_tasks_classic_gp_kthread(void); +#else +static inline void show_rcu_tasks_classic_gp_kthread(void) {} +#endif +#if !defined(CONFIG_TINY_RCU) && defined(CONFIG_TASKS_RUDE_RCU) +void show_rcu_tasks_rude_gp_kthread(void); +#else +static inline void show_rcu_tasks_rude_gp_kthread(void) {} +#endif +#if !defined(CONFIG_TINY_RCU) && defined(CONFIG_TASKS_TRACE_RCU) +void show_rcu_tasks_trace_gp_kthread(void); +#else +static inline void show_rcu_tasks_trace_gp_kthread(void) {} +#endif + #endif /* __LINUX_RCU_H */ diff --git a/kernel/rcu/rcu_segcblist.h b/kernel/rcu/rcu_segcblist.h index 5c293afc07b8..492262bcb591 100644 --- a/kernel/rcu/rcu_segcblist.h +++ b/kernel/rcu/rcu_segcblist.h @@ -62,7 +62,7 @@ static inline bool rcu_segcblist_is_enabled(struct rcu_segcblist *rsclp) /* Is the specified rcu_segcblist offloaded? */ static inline bool rcu_segcblist_is_offloaded(struct rcu_segcblist *rsclp) { - return rsclp->offloaded; + return IS_ENABLED(CONFIG_RCU_NOCB_CPU) && rsclp->offloaded; } /* diff --git a/kernel/rcu/rcuscale.c b/kernel/rcu/rcuscale.c index 2819b95479af..06491d5530db 100644 --- a/kernel/rcu/rcuscale.c +++ b/kernel/rcu/rcuscale.c @@ -38,6 +38,7 @@ #include <asm/byteorder.h> #include <linux/torture.h> #include <linux/vmalloc.h> +#include <linux/rcupdate_trace.h> #include "rcu.h" @@ -294,6 +295,35 @@ static struct rcu_scale_ops tasks_ops = { .name = "tasks" }; +/* + * Definitions for RCU-tasks-trace scalability testing. 
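The two hooks defined next plug RCU Tasks Trace into rcuscale's generic ops table; the readlock() return value exists so flavors with per-slot state (SRCU, for instance) fit the same interface. The shape of that interface, reduced to a compilable sketch with dummy callbacks:

#include <stdio.h>

struct scale_ops_model {
	int (*readlock)(void);
	void (*readunlock)(int idx);
	const char *name;
};

static int model_read_lock(void) { return 0; }
static void model_read_unlock(int idx) { (void)idx; }

static const struct scale_ops_model tasks_tracing_model = {
	.readlock	= model_read_lock,
	.readunlock	= model_read_unlock,
	.name		= "tasks-tracing",
};

int main(void)
{
	int idx = tasks_tracing_model.readlock();

	/* ...the measured read-side critical section goes here... */
	tasks_tracing_model.readunlock(idx);
	printf("%s ok\n", tasks_tracing_model.name);
	return 0;
}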
+ */ + +static int tasks_trace_scale_read_lock(void) +{ + rcu_read_lock_trace(); + return 0; +} + +static void tasks_trace_scale_read_unlock(int idx) +{ + rcu_read_unlock_trace(); +} + +static struct rcu_scale_ops tasks_tracing_ops = { + .ptype = RCU_TASKS_FLAVOR, + .init = rcu_sync_scale_init, + .readlock = tasks_trace_scale_read_lock, + .readunlock = tasks_trace_scale_read_unlock, + .get_gp_seq = rcu_no_completed, + .gp_diff = rcu_seq_diff, + .async = call_rcu_tasks_trace, + .gp_barrier = rcu_barrier_tasks_trace, + .sync = synchronize_rcu_tasks_trace, + .exp_sync = synchronize_rcu_tasks_trace, + .name = "tasks-tracing" +}; + static unsigned long rcuscale_seq_diff(unsigned long new, unsigned long old) { if (!cur_ops->gp_diff) @@ -754,7 +784,7 @@ rcu_scale_init(void) long i; int firsterr = 0; static struct rcu_scale_ops *scale_ops[] = { - &rcu_ops, &srcu_ops, &srcud_ops, &tasks_ops, + &rcu_ops, &srcu_ops, &srcud_ops, &tasks_ops, &tasks_tracing_ops }; if (!torture_init_begin(scale_type, verbose)) @@ -772,7 +802,6 @@ rcu_scale_init(void) for (i = 0; i < ARRAY_SIZE(scale_ops); i++) pr_cont(" %s", scale_ops[i]->name); pr_cont("\n"); - WARN_ON(!IS_MODULE(CONFIG_RCU_SCALE_TEST)); firsterr = -EINVAL; cur_ops = NULL; goto unwind; @@ -846,6 +875,10 @@ rcu_scale_init(void) unwind: torture_init_end(); rcu_scale_cleanup(); + if (shutdown) { + WARN_ON(!IS_MODULE(CONFIG_RCU_SCALE_TEST)); + kernel_power_off(); + } return firsterr; } diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index 916ea4f66e4b..528ed10b78fd 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -317,6 +317,7 @@ struct rcu_torture_ops { void (*cb_barrier)(void); void (*fqs)(void); void (*stats)(void); + void (*gp_kthread_dbg)(void); int (*stall_dur)(void); int irq_capable; int can_boost; @@ -466,6 +467,7 @@ static struct rcu_torture_ops rcu_ops = { .cb_barrier = rcu_barrier, .fqs = rcu_force_quiescent_state, .stats = NULL, + .gp_kthread_dbg = show_rcu_gp_kthreads, .stall_dur = rcu_jiffies_till_stall_check, .irq_capable = 1, .can_boost = rcu_can_boost(), @@ -693,6 +695,7 @@ static struct rcu_torture_ops tasks_ops = { .exp_sync = synchronize_rcu_mult_test, .call = call_rcu_tasks, .cb_barrier = rcu_barrier_tasks, + .gp_kthread_dbg = show_rcu_tasks_classic_gp_kthread, .fqs = NULL, .stats = NULL, .irq_capable = 1, @@ -762,6 +765,7 @@ static struct rcu_torture_ops tasks_rude_ops = { .exp_sync = synchronize_rcu_tasks_rude, .call = call_rcu_tasks_rude, .cb_barrier = rcu_barrier_tasks_rude, + .gp_kthread_dbg = show_rcu_tasks_rude_gp_kthread, .fqs = NULL, .stats = NULL, .irq_capable = 1, @@ -800,6 +804,7 @@ static struct rcu_torture_ops tasks_tracing_ops = { .exp_sync = synchronize_rcu_tasks_trace, .call = call_rcu_tasks_trace, .cb_barrier = rcu_barrier_tasks_trace, + .gp_kthread_dbg = show_rcu_tasks_trace_gp_kthread, .fqs = NULL, .stats = NULL, .irq_capable = 1, @@ -912,7 +917,8 @@ static int rcu_torture_boost(void *arg) oldstarttime = boost_starttime; while (time_before(jiffies, oldstarttime)) { schedule_timeout_interruptible(oldstarttime - jiffies); - stutter_wait("rcu_torture_boost"); + if (stutter_wait("rcu_torture_boost")) + sched_set_fifo_low(current); if (torture_must_stop()) goto checkwait; } @@ -932,7 +938,8 @@ static int rcu_torture_boost(void *arg) jiffies); call_rcu_time = jiffies; } - stutter_wait("rcu_torture_boost"); + if (stutter_wait("rcu_torture_boost")) + sched_set_fifo_low(current); if (torture_must_stop()) goto checkwait; } @@ -964,7 +971,8 @@ static int rcu_torture_boost(void *arg) } /* Go 
do the stutter. */ -checkwait: stutter_wait("rcu_torture_boost"); +checkwait: if (stutter_wait("rcu_torture_boost")) + sched_set_fifo_low(current); } while (!torture_must_stop()); /* Clean up and exit. */ @@ -987,6 +995,7 @@ rcu_torture_fqs(void *arg) { unsigned long fqs_resume_time; int fqs_burst_remaining; + int oldnice = task_nice(current); VERBOSE_TOROUT_STRING("rcu_torture_fqs task started"); do { @@ -1002,7 +1011,8 @@ rcu_torture_fqs(void *arg) udelay(fqs_holdoff); fqs_burst_remaining -= fqs_holdoff; } - stutter_wait("rcu_torture_fqs"); + if (stutter_wait("rcu_torture_fqs")) + sched_set_normal(current, oldnice); } while (!torture_must_stop()); torture_kthread_stopping("rcu_torture_fqs"); return 0; @@ -1022,9 +1032,11 @@ rcu_torture_writer(void *arg) bool gp_cond1 = gp_cond, gp_exp1 = gp_exp, gp_normal1 = gp_normal; bool gp_sync1 = gp_sync; int i; + int oldnice = task_nice(current); struct rcu_torture *rp; struct rcu_torture *old_rp; static DEFINE_TORTURE_RANDOM(rand); + bool stutter_waited; int synctype[] = { RTWS_DEF_FREE, RTWS_EXP_SYNC, RTWS_COND_GET, RTWS_SYNC }; int nsynctypes = 0; @@ -1143,7 +1155,8 @@ rcu_torture_writer(void *arg) !rcu_gp_is_normal(); } rcu_torture_writer_state = RTWS_STUTTER; - if (stutter_wait("rcu_torture_writer") && + stutter_waited = stutter_wait("rcu_torture_writer"); + if (stutter_waited && !READ_ONCE(rcu_fwd_cb_nodelay) && !cur_ops->slow_gps && !torture_must_stop() && @@ -1155,6 +1168,8 @@ rcu_torture_writer(void *arg) rcu_ftrace_dump(DUMP_ALL); WARN(1, "%s: rtort_pipe_count: %d\n", __func__, rcu_tortures[i].rtort_pipe_count); } + if (stutter_waited) + sched_set_normal(current, oldnice); } while (!torture_must_stop()); rcu_torture_current = NULL; // Let stats task know that we are done. /* Reset expediting back to unexpedited. */ @@ -1594,7 +1609,8 @@ rcu_torture_stats_print(void) sched_show_task(wtp); splatted = true; } - show_rcu_gp_kthreads(); + if (cur_ops->gp_kthread_dbg) + cur_ops->gp_kthread_dbg(); rcu_ftrace_dump(DUMP_ALL); } rtcv_snap = rcu_torture_current_version; @@ -1913,7 +1929,9 @@ static void rcu_torture_fwd_prog_nr(struct rcu_fwd *rfp, unsigned long stopat; static DEFINE_TORTURE_RANDOM(trs); - if (cur_ops->call && cur_ops->sync && cur_ops->cb_barrier) { + if (!cur_ops->sync) + return; // Cannot do need_resched() forward progress testing without ->sync. + if (cur_ops->call && cur_ops->cb_barrier) { init_rcu_head_on_stack(&fcs.rh); selfpropcb = true; } @@ -2103,6 +2121,7 @@ static struct notifier_block rcutorture_oom_nb = { /* Carry out grace-period forward-progress testing. */ static int rcu_torture_fwd_prog(void *args) { + int oldnice = task_nice(current); struct rcu_fwd *rfp = args; int tested = 0; int tested_tries = 0; @@ -2121,7 +2140,8 @@ static int rcu_torture_fwd_prog(void *args) rcu_torture_fwd_prog_cr(rfp); /* Avoid slow periods, better to test when busy. */ - stutter_wait("rcu_torture_fwd_prog"); + if (stutter_wait("rcu_torture_fwd_prog")) + sched_set_normal(current, oldnice); } while (!torture_must_stop()); /* Short runs might not contain a valid forward-progress attempt. */ WARN_ON(!tested && tested_tries >= 5); @@ -2137,8 +2157,8 @@ static int __init rcu_torture_fwd_prog_init(void) if (!fwd_progress) return 0; /* Not requested, so don't do it. 
*/ - if (!cur_ops->stall_dur || cur_ops->stall_dur() <= 0 || - cur_ops == &rcu_busted_ops) { + if ((!cur_ops->sync && !cur_ops->call) || + !cur_ops->stall_dur || cur_ops->stall_dur() <= 0 || cur_ops == &rcu_busted_ops) { VERBOSE_TOROUT_STRING("rcu_torture_fwd_prog_init: Disabled, unsupported by RCU flavor under test"); return 0; } @@ -2472,7 +2492,8 @@ rcu_torture_cleanup(void) return; } - show_rcu_gp_kthreads(); + if (cur_ops->gp_kthread_dbg) + cur_ops->gp_kthread_dbg(); rcu_torture_read_exit_cleanup(); rcu_torture_barrier_cleanup(); rcu_torture_fwd_prog_cleanup(); @@ -2484,13 +2505,13 @@ rcu_torture_cleanup(void) torture_stop_kthread(rcu_torture_reader, reader_tasks[i]); kfree(reader_tasks); + reader_tasks = NULL; } if (fakewriter_tasks) { - for (i = 0; i < nfakewriters; i++) { + for (i = 0; i < nfakewriters; i++) torture_stop_kthread(rcu_torture_fakewriter, fakewriter_tasks[i]); - } kfree(fakewriter_tasks); fakewriter_tasks = NULL; } @@ -2647,7 +2668,6 @@ rcu_torture_init(void) for (i = 0; i < ARRAY_SIZE(torture_ops); i++) pr_cont(" %s", torture_ops[i]->name); pr_cont("\n"); - WARN_ON(!IS_MODULE(CONFIG_RCU_TORTURE_TEST)); firsterr = -EINVAL; cur_ops = NULL; goto unwind; @@ -2815,6 +2835,10 @@ rcu_torture_init(void) unwind: torture_init_end(); rcu_torture_cleanup(); + if (shutdown_secs) { + WARN_ON(!IS_MODULE(CONFIG_RCU_TORTURE_TEST)); + kernel_power_off(); + } return firsterr; } diff --git a/kernel/rcu/refscale.c b/kernel/rcu/refscale.c index 952595c678b3..23ff36a66f97 100644 --- a/kernel/rcu/refscale.c +++ b/kernel/rcu/refscale.c @@ -658,7 +658,6 @@ ref_scale_init(void) for (i = 0; i < ARRAY_SIZE(scale_ops); i++) pr_cont(" %s", scale_ops[i]->name); pr_cont("\n"); - WARN_ON(!IS_MODULE(CONFIG_RCU_REF_SCALE_TEST)); firsterr = -EINVAL; cur_ops = NULL; goto unwind; @@ -681,6 +680,12 @@ ref_scale_init(void) // Reader tasks (default to ~75% of online CPUs). if (nreaders < 0) nreaders = (num_online_cpus() >> 1) + (num_online_cpus() >> 2); + if (WARN_ONCE(loops <= 0, "%s: loops = %ld, adjusted to 1\n", __func__, loops)) + loops = 1; + if (WARN_ONCE(nreaders <= 0, "%s: nreaders = %d, adjusted to 1\n", __func__, nreaders)) + nreaders = 1; + if (WARN_ONCE(nruns <= 0, "%s: nruns = %d, adjusted to 1\n", __func__, nruns)) + nruns = 1; reader_tasks = kcalloc(nreaders, sizeof(reader_tasks[0]), GFP_KERNEL); if (!reader_tasks) { @@ -712,6 +717,10 @@ ref_scale_init(void) unwind: torture_init_end(); ref_scale_cleanup(); + if (shutdown) { + WARN_ON(!IS_MODULE(CONFIG_RCU_REF_SCALE_TEST)); + kernel_power_off(); + } return firsterr; } diff --git a/kernel/rcu/srcutree.c b/kernel/rcu/srcutree.c index c13348ee80a5..0f23d20d485a 100644 --- a/kernel/rcu/srcutree.c +++ b/kernel/rcu/srcutree.c @@ -177,11 +177,13 @@ static int init_srcu_struct_fields(struct srcu_struct *ssp, bool is_static) INIT_DELAYED_WORK(&ssp->work, process_srcu); if (!is_static) ssp->sda = alloc_percpu(struct srcu_data); + if (!ssp->sda) + return -ENOMEM; init_srcu_struct_nodes(ssp, is_static); ssp->srcu_gp_seq_needed_exp = 0; ssp->srcu_last_gp_end = ktime_get_mono_fast_ns(); smp_store_release(&ssp->srcu_gp_seq_needed, 0); /* Init done. */ - return ssp->sda ? 
0 : -ENOMEM; + return 0; } #ifdef CONFIG_DEBUG_LOCK_ALLOC @@ -906,7 +908,7 @@ static void __synchronize_srcu(struct srcu_struct *ssp, bool do_norm) { struct rcu_synchronize rcu; - RCU_LOCKDEP_WARN(lock_is_held(&ssp->dep_map) || + RCU_LOCKDEP_WARN(lockdep_is_held(ssp) || lock_is_held(&rcu_bh_lock_map) || lock_is_held(&rcu_lock_map) || lock_is_held(&rcu_sched_lock_map), diff --git a/kernel/rcu/tasks.h b/kernel/rcu/tasks.h index d5d9f2d03e8a..35bdcfd84d42 100644 --- a/kernel/rcu/tasks.h +++ b/kernel/rcu/tasks.h @@ -290,7 +290,7 @@ static void show_rcu_tasks_generic_gp_kthread(struct rcu_tasks *rtp, char *s) ".C"[!!data_race(rtp->cbs_head)], s); } -#endif /* #ifndef CONFIG_TINY_RCU */ +#endif // #ifndef CONFIG_TINY_RCU static void exit_tasks_rcu_finish_trace(struct task_struct *t); @@ -335,23 +335,18 @@ static void rcu_tasks_wait_gp(struct rcu_tasks *rtp) // Start off with initial wait and slowly back off to 1 HZ wait. fract = rtp->init_fract; - if (fract > HZ) - fract = HZ; - for (;;) { + while (!list_empty(&holdouts)) { bool firstreport; bool needreport; int rtst; - if (list_empty(&holdouts)) - break; - /* Slowly back off waiting for holdouts */ set_tasks_gp_state(rtp, RTGS_WAIT_SCAN_HOLDOUTS); - schedule_timeout_idle(HZ/fract); + schedule_timeout_idle(fract); - if (fract > 1) - fract--; + if (fract < HZ) + fract++; rtst = READ_ONCE(rcu_task_stall_timeout); needreport = rtst > 0 && time_after(jiffies, lastreport + rtst); @@ -560,7 +555,7 @@ EXPORT_SYMBOL_GPL(rcu_barrier_tasks); static int __init rcu_spawn_tasks_kthread(void) { rcu_tasks.gp_sleep = HZ / 10; - rcu_tasks.init_fract = 10; + rcu_tasks.init_fract = HZ / 10; rcu_tasks.pregp_func = rcu_tasks_pregp_step; rcu_tasks.pertask_func = rcu_tasks_pertask; rcu_tasks.postscan_func = rcu_tasks_postscan; @@ -571,12 +566,13 @@ static int __init rcu_spawn_tasks_kthread(void) } core_initcall(rcu_spawn_tasks_kthread); -#ifndef CONFIG_TINY_RCU -static void show_rcu_tasks_classic_gp_kthread(void) +#if !defined(CONFIG_TINY_RCU) +void show_rcu_tasks_classic_gp_kthread(void) { show_rcu_tasks_generic_gp_kthread(&rcu_tasks, ""); } -#endif /* #ifndef CONFIG_TINY_RCU */ +EXPORT_SYMBOL_GPL(show_rcu_tasks_classic_gp_kthread); +#endif // !defined(CONFIG_TINY_RCU) /* Do the srcu_read_lock() for the above synchronize_srcu(). 
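The reworked holdout loop above also inverts the backoff arithmetic: rather than sleeping HZ/fract jiffies while decrementing fract, it sleeps fract jiffies directly and lengthens the sleep by one jiffy per scan, capped at HZ. A toy rendering of the resulting schedule (the HZ value and scan count are illustrative):

#include <stdio.h>

#define HZ 1000			/* assumed tick rate */

int main(void)
{
	int fract = HZ / 10;	/* e.g. the new rcu_tasks init_fract */
	int scans = 5;		/* stand-in for a dwindling holdout list */

	while (scans-- > 0) {
		printf("sleep %d jiffies, then rescan holdouts\n", fract);
		if (fract < HZ)
			fract++;
	}
	return 0;
}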
*/ void exit_tasks_rcu_start(void) __acquires(&tasks_rcu_exit_srcu) @@ -598,7 +594,6 @@ void exit_tasks_rcu_finish(void) __releases(&tasks_rcu_exit_srcu) } #else /* #ifdef CONFIG_TASKS_RCU */ -static inline void show_rcu_tasks_classic_gp_kthread(void) { } void exit_tasks_rcu_start(void) { } void exit_tasks_rcu_finish(void) { exit_tasks_rcu_finish_trace(current); } #endif /* #else #ifdef CONFIG_TASKS_RCU */ @@ -699,16 +694,14 @@ static int __init rcu_spawn_tasks_rude_kthread(void) } core_initcall(rcu_spawn_tasks_rude_kthread); -#ifndef CONFIG_TINY_RCU -static void show_rcu_tasks_rude_gp_kthread(void) +#if !defined(CONFIG_TINY_RCU) +void show_rcu_tasks_rude_gp_kthread(void) { show_rcu_tasks_generic_gp_kthread(&rcu_tasks_rude, ""); } -#endif /* #ifndef CONFIG_TINY_RCU */ - -#else /* #ifdef CONFIG_TASKS_RUDE_RCU */ -static void show_rcu_tasks_rude_gp_kthread(void) {} -#endif /* #else #ifdef CONFIG_TASKS_RUDE_RCU */ +EXPORT_SYMBOL_GPL(show_rcu_tasks_rude_gp_kthread); +#endif // !defined(CONFIG_TINY_RCU) +#endif /* #ifdef CONFIG_TASKS_RUDE_RCU */ //////////////////////////////////////////////////////////////////////// // @@ -1183,12 +1176,12 @@ static int __init rcu_spawn_tasks_trace_kthread(void) { if (IS_ENABLED(CONFIG_TASKS_TRACE_RCU_READ_MB)) { rcu_tasks_trace.gp_sleep = HZ / 10; - rcu_tasks_trace.init_fract = 10; + rcu_tasks_trace.init_fract = HZ / 10; } else { rcu_tasks_trace.gp_sleep = HZ / 200; if (rcu_tasks_trace.gp_sleep <= 0) rcu_tasks_trace.gp_sleep = 1; - rcu_tasks_trace.init_fract = HZ / 5; + rcu_tasks_trace.init_fract = HZ / 200; if (rcu_tasks_trace.init_fract <= 0) rcu_tasks_trace.init_fract = 1; } @@ -1202,8 +1195,8 @@ static int __init rcu_spawn_tasks_trace_kthread(void) } core_initcall(rcu_spawn_tasks_trace_kthread); -#ifndef CONFIG_TINY_RCU -static void show_rcu_tasks_trace_gp_kthread(void) +#if !defined(CONFIG_TINY_RCU) +void show_rcu_tasks_trace_gp_kthread(void) { char buf[64]; @@ -1213,11 +1206,11 @@ static void show_rcu_tasks_trace_gp_kthread(void) data_race(n_heavy_reader_attempts)); show_rcu_tasks_generic_gp_kthread(&rcu_tasks_trace, buf); } -#endif /* #ifndef CONFIG_TINY_RCU */ +EXPORT_SYMBOL_GPL(show_rcu_tasks_trace_gp_kthread); +#endif // !defined(CONFIG_TINY_RCU) #else /* #ifdef CONFIG_TASKS_TRACE_RCU */ static void exit_tasks_rcu_finish_trace(struct task_struct *t) { } -static inline void show_rcu_tasks_trace_gp_kthread(void) {} #endif /* #else #ifdef CONFIG_TASKS_TRACE_RCU */ #ifndef CONFIG_TINY_RCU diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 06895ef85d69..40e5e3dd253e 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -177,7 +177,7 @@ module_param(rcu_unlock_delay, int, 0444); * per-CPU. Object size is equal to one page. This value * can be changed at boot time. */ -static int rcu_min_cached_objs = 2; +static int rcu_min_cached_objs = 5; module_param(rcu_min_cached_objs, int, 0444); /* Retrieve RCU kthreads priority for rcutorture */ @@ -341,6 +341,14 @@ static bool rcu_dynticks_in_eqs(int snap) return !(snap & RCU_DYNTICK_CTRL_CTR); } +/* Return true if the specified CPU is currently idle from an RCU viewpoint. */ +bool rcu_is_idle_cpu(int cpu) +{ + struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu); + + return rcu_dynticks_in_eqs(rcu_dynticks_snap(rdp)); +} + /* * Return true if the CPU corresponding to the specified rcu_data * structure has spent some time in an extended quiescent state since @@ -409,7 +417,7 @@ bool rcu_eqs_special_set(int cpu) * * The caller must have disabled interrupts and must not be idle. 
*/ -void rcu_momentary_dyntick_idle(void) +notrace void rcu_momentary_dyntick_idle(void) { int special; @@ -546,12 +554,12 @@ static int param_set_next_fqs_jiffies(const char *val, const struct kernel_param return ret; } -static struct kernel_param_ops first_fqs_jiffies_ops = { +static const struct kernel_param_ops first_fqs_jiffies_ops = { .set = param_set_first_fqs_jiffies, .get = param_get_ulong, }; -static struct kernel_param_ops next_fqs_jiffies_ops = { +static const struct kernel_param_ops next_fqs_jiffies_ops = { .set = param_set_next_fqs_jiffies, .get = param_get_ulong, }; @@ -928,8 +936,8 @@ void __rcu_irq_enter_check_tick(void) { struct rcu_data *rdp = this_cpu_ptr(&rcu_data); - // Enabling the tick is unsafe in NMI handlers. - if (WARN_ON_ONCE(in_nmi())) + // If we're here from NMI there's nothing to do. + if (in_nmi()) return; RCU_LOCKDEP_WARN(rcu_dynticks_curr_cpu_in_eqs(), @@ -1093,8 +1101,11 @@ static void rcu_disable_urgency_upon_qs(struct rcu_data *rdp) * CPU can safely enter RCU read-side critical sections. In other words, * if the current CPU is not in its idle loop or is in an interrupt or * NMI handler, return true. + * + * Make notrace because it can be called by the internal functions of + * ftrace, and making this notrace removes unnecessary recursion calls. */ -bool rcu_is_watching(void) +notrace bool rcu_is_watching(void) { bool ret; @@ -1149,7 +1160,7 @@ bool rcu_lockdep_current_cpu_online(void) preempt_disable_notrace(); rdp = this_cpu_ptr(&rcu_data); rnp = rdp->mynode; - if (rdp->grpmask & rcu_rnp_online_cpus(rnp)) + if (rdp->grpmask & rcu_rnp_online_cpus(rnp) || READ_ONCE(rnp->ofl_seq) & 0x1) ret = true; preempt_enable_notrace(); return ret; @@ -1311,8 +1322,6 @@ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp) if (IS_ENABLED(CONFIG_IRQ_WORK) && !rdp->rcu_iw_pending && rdp->rcu_iw_gp_seq != rnp->gp_seq && (rnp->ffmask & rdp->grpmask)) { - init_irq_work(&rdp->rcu_iw, rcu_iw_handler); - atomic_set(&rdp->rcu_iw.flags, IRQ_WORK_HARD_IRQ); rdp->rcu_iw_pending = true; rdp->rcu_iw_gp_seq = rnp->gp_seq; irq_work_queue_on(&rdp->rcu_iw, rdp->cpu); @@ -1603,8 +1612,7 @@ static bool __note_gp_changes(struct rcu_node *rnp, struct rcu_data *rdp) { bool ret = false; bool need_qs; - const bool offloaded = IS_ENABLED(CONFIG_RCU_NOCB_CPU) && - rcu_segcblist_is_offloaded(&rdp->cblist); + const bool offloaded = rcu_segcblist_is_offloaded(&rdp->cblist); raw_lockdep_assert_held_rcu_node(rnp); @@ -1715,6 +1723,7 @@ static void rcu_strict_gp_boundary(void *unused) */ static bool rcu_gp_init(void) { + unsigned long firstseq; unsigned long flags; unsigned long oldmask; unsigned long mask; @@ -1758,6 +1767,12 @@ static bool rcu_gp_init(void) */ rcu_state.gp_state = RCU_GP_ONOFF; rcu_for_each_leaf_node(rnp) { + smp_mb(); // Pair with barriers used when updating ->ofl_seq to odd values. + firstseq = READ_ONCE(rnp->ofl_seq); + if (firstseq & 0x1) + while (firstseq == READ_ONCE(rnp->ofl_seq)) + schedule_timeout_idle(1); // Can't wake unless RCU is watching. + smp_mb(); // Pair with barriers used when updating ->ofl_seq to even values. raw_spin_lock(&rcu_state.ofl_lock); raw_spin_lock_irq_rcu_node(rnp); if (rnp->qsmaskinit == rnp->qsmaskinitnext && @@ -2048,8 +2063,7 @@ static void rcu_gp_cleanup(void) needgp = true; } /* Advance CBs to reduce false positives below. 
*/ - offloaded = IS_ENABLED(CONFIG_RCU_NOCB_CPU) && - rcu_segcblist_is_offloaded(&rdp->cblist); + offloaded = rcu_segcblist_is_offloaded(&rdp->cblist); if ((offloaded || !rcu_accelerate_cbs(rnp, rdp)) && needgp) { WRITE_ONCE(rcu_state.gp_flags, RCU_GP_FLAG_INIT); WRITE_ONCE(rcu_state.gp_req_activity, jiffies); @@ -2248,8 +2262,7 @@ rcu_report_qs_rdp(struct rcu_data *rdp) unsigned long flags; unsigned long mask; bool needwake = false; - const bool offloaded = IS_ENABLED(CONFIG_RCU_NOCB_CPU) && - rcu_segcblist_is_offloaded(&rdp->cblist); + const bool offloaded = rcu_segcblist_is_offloaded(&rdp->cblist); struct rcu_node *rnp; WARN_ON_ONCE(rdp->cpu != smp_processor_id()); @@ -2399,6 +2412,7 @@ int rcutree_dead_cpu(unsigned int cpu) if (!IS_ENABLED(CONFIG_HOTPLUG_CPU)) return 0; + WRITE_ONCE(rcu_state.n_online_cpus, rcu_state.n_online_cpus - 1); /* Adjust any no-longer-needed kthreads. */ rcu_boost_kthread_setaffinity(rnp, -1); /* Do any needed no-CB deferred wakeups from this CPU. */ @@ -2417,8 +2431,7 @@ static void rcu_do_batch(struct rcu_data *rdp) { int div; unsigned long flags; - const bool offloaded = IS_ENABLED(CONFIG_RCU_NOCB_CPU) && - rcu_segcblist_is_offloaded(&rdp->cblist); + const bool offloaded = rcu_segcblist_is_offloaded(&rdp->cblist); struct rcu_head *rhp; struct rcu_cblist rcl = RCU_CBLIST_INITIALIZER(rcl); long bl, count; @@ -2675,8 +2688,7 @@ static __latent_entropy void rcu_core(void) unsigned long flags; struct rcu_data *rdp = raw_cpu_ptr(&rcu_data); struct rcu_node *rnp = rdp->mynode; - const bool offloaded = IS_ENABLED(CONFIG_RCU_NOCB_CPU) && - rcu_segcblist_is_offloaded(&rdp->cblist); + const bool offloaded = rcu_segcblist_is_offloaded(&rdp->cblist); if (cpu_is_offline(smp_processor_id())) return; @@ -2978,8 +2990,7 @@ __call_rcu(struct rcu_head *head, rcu_callback_t func) rcu_segcblist_n_cbs(&rdp->cblist)); /* Go handle any RCU core processing required. */ - if (IS_ENABLED(CONFIG_RCU_NOCB_CPU) && - unlikely(rcu_segcblist_is_offloaded(&rdp->cblist))) { + if (unlikely(rcu_segcblist_is_offloaded(&rdp->cblist))) { __call_rcu_nocb_wake(rdp, was_alldone, flags); /* unlocks */ } else { __call_rcu_core(rdp, head, flags); @@ -3084,6 +3095,9 @@ struct kfree_rcu_cpu_work { * In order to save some per-cpu space the list is singular. * Even though it is lockless an access has to be protected by the * per-cpu lock. + * @page_cache_work: A work to refill the cache when it is empty + * @work_in_progress: Indicates that page_cache_work is running + * @hrtimer: A hrtimer for scheduling a page_cache_work * @nr_bkv_objs: number of allocated objects at @bkvcache. * * This is a per-CPU structure. 
The reason that it is not included in @@ -3100,6 +3114,11 @@ struct kfree_rcu_cpu { bool monitor_todo; bool initialized; int count; + + struct work_struct page_cache_work; + atomic_t work_in_progress; + struct hrtimer hrtimer; + struct llist_head bkvcache; int nr_bkv_objs; }; @@ -3217,10 +3236,10 @@ static void kfree_rcu_work(struct work_struct *work) } rcu_lock_release(&rcu_callback_map); - krcp = krc_this_cpu_lock(&flags); + raw_spin_lock_irqsave(&krcp->lock, flags); if (put_cached_bnode(krcp, bkvhead[i])) bkvhead[i] = NULL; - krc_this_cpu_unlock(krcp, flags); + raw_spin_unlock_irqrestore(&krcp->lock, flags); if (bkvhead[i]) free_page((unsigned long) bkvhead[i]); @@ -3347,6 +3366,57 @@ static void kfree_rcu_monitor(struct work_struct *work) raw_spin_unlock_irqrestore(&krcp->lock, flags); } +static enum hrtimer_restart +schedule_page_work_fn(struct hrtimer *t) +{ + struct kfree_rcu_cpu *krcp = + container_of(t, struct kfree_rcu_cpu, hrtimer); + + queue_work(system_highpri_wq, &krcp->page_cache_work); + return HRTIMER_NORESTART; +} + +static void fill_page_cache_func(struct work_struct *work) +{ + struct kvfree_rcu_bulk_data *bnode; + struct kfree_rcu_cpu *krcp = + container_of(work, struct kfree_rcu_cpu, + page_cache_work); + unsigned long flags; + bool pushed; + int i; + + for (i = 0; i < rcu_min_cached_objs; i++) { + bnode = (struct kvfree_rcu_bulk_data *) + __get_free_page(GFP_KERNEL | __GFP_NOWARN); + + if (bnode) { + raw_spin_lock_irqsave(&krcp->lock, flags); + pushed = put_cached_bnode(krcp, bnode); + raw_spin_unlock_irqrestore(&krcp->lock, flags); + + if (!pushed) { + free_page((unsigned long) bnode); + break; + } + } + } + + atomic_set(&krcp->work_in_progress, 0); +} + +static void +run_page_cache_worker(struct kfree_rcu_cpu *krcp) +{ + if (rcu_scheduler_active == RCU_SCHEDULER_RUNNING && + !atomic_xchg(&krcp->work_in_progress, 1)) { + hrtimer_init(&krcp->hrtimer, CLOCK_MONOTONIC, + HRTIMER_MODE_REL); + krcp->hrtimer.function = schedule_page_work_fn; + hrtimer_start(&krcp->hrtimer, 0, HRTIMER_MODE_REL); + } +} + static inline bool kvfree_call_rcu_add_ptr_to_bulk(struct kfree_rcu_cpu *krcp, void *ptr) { @@ -3363,32 +3433,8 @@ kvfree_call_rcu_add_ptr_to_bulk(struct kfree_rcu_cpu *krcp, void *ptr) if (!krcp->bkvhead[idx] || krcp->bkvhead[idx]->nr_records == KVFREE_BULK_MAX_ENTR) { bnode = get_cached_bnode(krcp); - if (!bnode) { - /* - * To keep this path working on raw non-preemptible - * sections, prevent the optional entry into the - * allocator as it uses sleeping locks. In fact, even - * if the caller of kfree_rcu() is preemptible, this - * path still is not, as krcp->lock is a raw spinlock. - * With additional page pre-allocation in the works, - * hitting this return is going to be much less likely. - */ - if (IS_ENABLED(CONFIG_PREEMPT_RT)) - return false; - - /* - * NOTE: For one argument of kvfree_rcu() we can - * drop the lock and get the page in sleepable - * context. That would allow to maintain an array - * for the CONFIG_PREEMPT_RT as well if no cached - * pages are available. - */ - bnode = (struct kvfree_rcu_bulk_data *) - __get_free_page(GFP_NOWAIT | __GFP_NOWARN); - } - /* Switch to emergency path. */ - if (unlikely(!bnode)) + if (!bnode) return false; /* Initialize the new block. */ @@ -3452,12 +3498,10 @@ void kvfree_call_rcu(struct rcu_head *head, rcu_callback_t func) goto unlock_return; } - /* - * Under high memory pressure GFP_NOWAIT can fail, - * in that case the emergency path is maintained. 
- */ success = kvfree_call_rcu_add_ptr_to_bulk(krcp, ptr); if (!success) { + run_page_cache_worker(krcp); + if (head == NULL) // Inline if kvfree_rcu(one_arg) call. goto unlock_return; @@ -3567,7 +3611,7 @@ void __init kfree_rcu_scheduler_running(void) * During early boot, any blocking grace-period wait automatically * implies a grace period. Later on, this is never the case for PREEMPTION. * - * Howevr, because a context switch is a grace period for !PREEMPTION, any + * However, because a context switch is a grace period for !PREEMPTION, any * blocking grace-period wait automatically implies a grace period if * there is only one CPU online at any point time during execution of * either synchronize_rcu() or synchronize_rcu_expedited(). It is OK to @@ -3583,7 +3627,20 @@ static int rcu_blocking_is_gp(void) return rcu_scheduler_active == RCU_SCHEDULER_INACTIVE; might_sleep(); /* Check for RCU read-side critical section. */ preempt_disable(); - ret = num_online_cpus() <= 1; + /* + * If the rcu_state.n_online_cpus counter is equal to one, + * there is only one CPU, and that CPU sees all prior accesses + * made by any CPU that was online at the time of its access. + * Furthermore, if this counter is equal to one, its value cannot + * change until after the preempt_enable() below. + * + * Furthermore, if rcu_state.n_online_cpus is equal to one here, + * all later CPUs (both this one and any that come online later + * on) are guaranteed to see all accesses prior to this point + * in the code, without the need for additional memory barriers. + * Those memory barriers are provided by CPU-hotplug code. + */ + ret = READ_ONCE(rcu_state.n_online_cpus) <= 1; preempt_enable(); return ret; } @@ -3628,7 +3685,7 @@ void synchronize_rcu(void) lock_is_held(&rcu_sched_lock_map), "Illegal synchronize_rcu() in RCU read-side critical section"); if (rcu_blocking_is_gp()) - return; + return; // Context allows vacuous grace periods. if (rcu_gp_is_expedited()) synchronize_rcu_expedited(); else @@ -3707,13 +3764,13 @@ static int rcu_pending(int user) return 1; /* Does this CPU have callbacks ready to invoke? */ - if (rcu_segcblist_ready_cbs(&rdp->cblist)) + if (!rcu_segcblist_is_offloaded(&rdp->cblist) && + rcu_segcblist_ready_cbs(&rdp->cblist)) return 1; /* Has RCU gone idle with this CPU needing another grace period? */ if (!gp_in_progress && rcu_segcblist_is_enabled(&rdp->cblist) && - (!IS_ENABLED(CONFIG_RCU_NOCB_CPU) || - !rcu_segcblist_is_offloaded(&rdp->cblist)) && + !rcu_segcblist_is_offloaded(&rdp->cblist) && !rcu_segcblist_restempty(&rdp->cblist, RCU_NEXT_READY_TAIL)) return 1; @@ -3964,11 +4021,13 @@ int rcutree_prepare_cpu(unsigned int cpu) rdp->cpu_no_qs.b.norm = true; rdp->core_needs_qs = false; rdp->rcu_iw_pending = false; + rdp->rcu_iw = IRQ_WORK_INIT_HARD(rcu_iw_handler); rdp->rcu_iw_gp_seq = rdp->gp_seq - 1; trace_rcu_grace_period(rcu_state.name, rdp->gp_seq, TPS("cpuonl")); raw_spin_unlock_irqrestore_rcu_node(rnp, flags); rcu_prepare_kthreads(cpu); rcu_spawn_cpu_nocb_kthread(cpu); + WRITE_ONCE(rcu_state.n_online_cpus, rcu_state.n_online_cpus + 1); return 0; } @@ -4057,6 +4116,9 @@ void rcu_cpu_starting(unsigned int cpu) rnp = rdp->mynode; mask = rdp->grpmask; + WRITE_ONCE(rnp->ofl_seq, rnp->ofl_seq + 1); + WARN_ON_ONCE(!(rnp->ofl_seq & 0x1)); + smp_mb(); // Pair with rcu_gp_cleanup()'s ->ofl_seq barrier(). 
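The ->ofl_seq updates above follow a seqcount-style protocol: the hotplug path makes the counter odd before touching the masks and even again afterwards, and rcu_gp_init() (earlier in this diff) waits while it observes an odd value. A compilable userspace analogue, with C11 sequentially consistent atomics standing in for the explicit smp_mb() pairs:

#include <stdatomic.h>
#include <stdio.h>

static atomic_ulong ofl_seq;

static void hotplug_update(void)        /* writer side, cf. rcu_cpu_starting() */
{
        atomic_fetch_add(&ofl_seq, 1);  /* now odd: update in progress */
        /* ... mutate the online masks here ... */
        atomic_fetch_add(&ofl_seq, 1);  /* now even: update complete */
}

static void gp_init_wait(void)          /* reader side, cf. rcu_gp_init() */
{
        unsigned long firstseq = atomic_load(&ofl_seq);

        if (firstseq & 0x1)             /* update in flight? */
                while (firstseq == atomic_load(&ofl_seq))
                        ;               /* the kernel sleeps here instead */
}

int main(void)
{
        hotplug_update();
        gp_init_wait();
        printf("ofl_seq settled at %lu\n", atomic_load(&ofl_seq));
        return 0;
}

The kernel cannot busy-wait at that point, hence the schedule_timeout_idle(1) in rcu_gp_init(), with the caveat noted in the diff that the sleeper cannot be woken unless RCU is watching.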
raw_spin_lock_irqsave_rcu_node(rnp, flags); WRITE_ONCE(rnp->qsmaskinitnext, rnp->qsmaskinitnext | mask); newcpu = !(rnp->expmaskinitnext & mask); @@ -4067,17 +4129,21 @@ void rcu_cpu_starting(unsigned int cpu) rcu_gpnum_ovf(rnp, rdp); /* Offline-induced counter wrap? */ rdp->rcu_onl_gp_seq = READ_ONCE(rcu_state.gp_seq); rdp->rcu_onl_gp_flags = READ_ONCE(rcu_state.gp_flags); - if (rnp->qsmask & mask) { /* RCU waiting on incoming CPU? */ + + /* An incoming CPU should never be blocking a grace period. */ + if (WARN_ON_ONCE(rnp->qsmask & mask)) { /* RCU waiting on incoming CPU? */ rcu_disable_urgency_upon_qs(rdp); /* Report QS -after- changing ->qsmaskinitnext! */ rcu_report_qs_rnp(mask, rnp, rnp->gp_seq, flags); } else { raw_spin_unlock_irqrestore_rcu_node(rnp, flags); } + smp_mb(); // Pair with rcu_gp_cleanup()'s ->ofl_seq barrier(). + WRITE_ONCE(rnp->ofl_seq, rnp->ofl_seq + 1); + WARN_ON_ONCE(rnp->ofl_seq & 0x1); smp_mb(); /* Ensure RCU read-side usage follows above initialization. */ } -#ifdef CONFIG_HOTPLUG_CPU /* * The outgoing function has no further need of RCU, so remove it from * the rcu_node tree's ->qsmaskinitnext bit masks. @@ -4101,6 +4167,9 @@ void rcu_report_dead(unsigned int cpu) /* Remove outgoing CPU from mask in the leaf rcu_node structure. */ mask = rdp->grpmask; + WRITE_ONCE(rnp->ofl_seq, rnp->ofl_seq + 1); + WARN_ON_ONCE(!(rnp->ofl_seq & 0x1)); + smp_mb(); // Pair with rcu_gp_cleanup()'s ->ofl_seq barrier(). raw_spin_lock(&rcu_state.ofl_lock); raw_spin_lock_irqsave_rcu_node(rnp, flags); /* Enforce GP memory-order guarantee. */ rdp->rcu_ofl_gp_seq = READ_ONCE(rcu_state.gp_seq); @@ -4113,10 +4182,14 @@ void rcu_report_dead(unsigned int cpu) WRITE_ONCE(rnp->qsmaskinitnext, rnp->qsmaskinitnext & ~mask); raw_spin_unlock_irqrestore_rcu_node(rnp, flags); raw_spin_unlock(&rcu_state.ofl_lock); + smp_mb(); // Pair with rcu_gp_cleanup()'s ->ofl_seq barrier(). + WRITE_ONCE(rnp->ofl_seq, rnp->ofl_seq + 1); + WARN_ON_ONCE(rnp->ofl_seq & 0x1); rdp->cpu_started = false; } +#ifdef CONFIG_HOTPLUG_CPU /* * The outgoing CPU has just passed through the dying-idle state, and we * are being invoked from the CPU that was IPIed to continue the offline @@ -4449,24 +4522,14 @@ static void __init kfree_rcu_batch_init(void) for_each_possible_cpu(cpu) { struct kfree_rcu_cpu *krcp = per_cpu_ptr(&krc, cpu); - struct kvfree_rcu_bulk_data *bnode; for (i = 0; i < KFREE_N_BATCHES; i++) { INIT_RCU_WORK(&krcp->krw_arr[i].rcu_work, kfree_rcu_work); krcp->krw_arr[i].krcp = krcp; } - for (i = 0; i < rcu_min_cached_objs; i++) { - bnode = (struct kvfree_rcu_bulk_data *) - __get_free_page(GFP_NOWAIT | __GFP_NOWARN); - - if (bnode) - put_cached_bnode(krcp, bnode); - else - pr_err("Failed to preallocate for %d CPU!\n", cpu); - } - INIT_DELAYED_WORK(&krcp->monitor_work, kfree_rcu_monitor); + INIT_WORK(&krcp->page_cache_work, fill_page_cache_func); krcp->initialized = true; } if (register_shrinker(&kfree_rcu_shrinker)) diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h index e4f66b8f7c47..7708ed161f4a 100644 --- a/kernel/rcu/tree.h +++ b/kernel/rcu/tree.h @@ -56,6 +56,7 @@ struct rcu_node { /* Initialized from ->qsmaskinitnext at the */ /* beginning of each grace period. */ unsigned long qsmaskinitnext; + unsigned long ofl_seq; /* CPU-hotplug operation sequence count. */ /* Online CPUs for next grace period. 
*/ unsigned long expmask; /* CPUs or groups that need to check in */ /* to allow the current expedited GP */ @@ -298,6 +299,7 @@ struct rcu_state { /* Hierarchy levels (+1 to */ /* shut bogus gcc warning) */ int ncpus; /* # CPUs seen so far. */ + int n_online_cpus; /* # CPUs online for RCU. */ /* The following fields are guarded by the root rcu_node's lock. */ diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index fd8a52e9a887..7e291ce0a1d6 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -628,7 +628,7 @@ static void rcu_read_unlock_special(struct task_struct *t) set_tsk_need_resched(current); set_preempt_need_resched(); if (IS_ENABLED(CONFIG_IRQ_WORK) && irqs_were_disabled && - !rdp->defer_qs_iw_pending && exp) { + !rdp->defer_qs_iw_pending && exp && cpu_online(rdp->cpu)) { // Get scheduler to re-evaluate and call hooks. // If !IRQ_WORK, FQS scan will eventually IPI. init_irq_work(&rdp->defer_qs_iw, diff --git a/kernel/rcu/tree_stall.h b/kernel/rcu/tree_stall.h index 0fde39b8daab..70d48c52fabc 100644 --- a/kernel/rcu/tree_stall.h +++ b/kernel/rcu/tree_stall.h @@ -13,6 +13,7 @@ /* panic() on RCU Stall sysctl. */ int sysctl_panic_on_rcu_stall __read_mostly; +int sysctl_max_rcu_stall_to_panic __read_mostly; #ifdef CONFIG_PROVE_RCU #define RCU_STALL_DELAY_DELTA (5 * HZ) @@ -106,6 +107,11 @@ early_initcall(check_cpu_stall_init); /* If so specified via sysctl, panic, yielding cleaner stall-warning output. */ static void panic_on_rcu_stall(void) { + static int cpu_stall; + + if (++cpu_stall < sysctl_max_rcu_stall_to_panic) + return; + if (sysctl_panic_on_rcu_stall) panic("RCU Stall\n"); } @@ -249,13 +255,16 @@ static bool check_slow_task(struct task_struct *t, void *arg) /* * Scan the current list of tasks blocked within RCU read-side critical - * sections, printing out the tid of each. + * sections, printing out the tid of each of the first few of them. */ -static int rcu_print_task_stall(struct rcu_node *rnp) +static int rcu_print_task_stall(struct rcu_node *rnp, unsigned long flags) + __releases(rnp->lock) { + int i = 0; int ndetected = 0; struct rcu_stall_chk_rdr rscr; struct task_struct *t; + struct task_struct *ts[8]; if (!rcu_preempt_blocked_readers_cgp(rnp)) return 0; @@ -264,6 +273,14 @@ static int rcu_print_task_stall(struct rcu_node *rnp) t = list_entry(rnp->gp_tasks->prev, struct task_struct, rcu_node_entry); list_for_each_entry_continue(t, &rnp->blkd_tasks, rcu_node_entry) { + get_task_struct(t); + ts[i++] = t; + if (i >= ARRAY_SIZE(ts)) + break; + } + raw_spin_unlock_irqrestore_rcu_node(rnp, flags); + while (i) { + t = ts[--i]; if (!try_invoke_on_locked_down_task(t, check_slow_task, &rscr)) pr_cont(" P%d", t->pid); else @@ -273,6 +290,7 @@ static int rcu_print_task_stall(struct rcu_node *rnp) ".q"[rscr.rs.b.need_qs], ".e"[rscr.rs.b.exp_hint], ".l"[rscr.on_blkd_list]); + put_task_struct(t); ndetected++; } pr_cont("\n"); @@ -293,8 +311,9 @@ static void rcu_print_detail_task_stall_rnp(struct rcu_node *rnp) * Because preemptible RCU does not exist, we never have to check for * tasks blocked within RCU read-side critical sections. 
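The reworked rcu_print_task_stall() takes up to eight task references while rnp->lock is held, drops the lock, and only then does the slow printing, putting each reference as it goes. The shape of that pattern in plain C, with a mutex standing in for the raw spinlock and a counter for get_task_struct()/put_task_struct() (all names illustrative):

#include <pthread.h>
#include <stdio.h>

#define NTASKS 3

struct task { int pid; int refs; };

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static struct task tasks[NTASKS] = { { 101, 1 }, { 102, 1 }, { 103, 1 } };

int main(void)
{
        struct task *ts[8];
        int i = 0;

        pthread_mutex_lock(&lock);      /* cf. raw_spin_lock_irqsave_rcu_node() */
        for (int j = 0; j < NTASKS && i < 8; j++) {
                tasks[j].refs++;        /* cf. get_task_struct() */
                ts[i++] = &tasks[j];
        }
        pthread_mutex_unlock(&lock);    /* drop the lock before the slow work */

        while (i) {                     /* consume the snapshot ... */
                struct task *t = ts[--i];       /* ... the ref keeps it valid */
                printf(" P%d", t->pid);
                t->refs--;              /* cf. put_task_struct() */
        }
        printf("\n");
        return 0;
}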
*/ -static int rcu_print_task_stall(struct rcu_node *rnp) +static int rcu_print_task_stall(struct rcu_node *rnp, unsigned long flags) { + raw_spin_unlock_irqrestore_rcu_node(rnp, flags); return 0; } #endif /* #else #ifdef CONFIG_PREEMPT_RCU */ @@ -472,7 +491,6 @@ static void print_other_cpu_stall(unsigned long gp_seq, unsigned long gps) pr_err("INFO: %s detected stalls on CPUs/tasks:\n", rcu_state.name); rcu_for_each_leaf_node(rnp) { raw_spin_lock_irqsave_rcu_node(rnp, flags); - ndetected += rcu_print_task_stall(rnp); if (rnp->qsmask != 0) { for_each_leaf_node_possible_cpu(rnp, cpu) if (rnp->qsmask & leaf_node_cpu_bit(rnp, cpu)) { @@ -480,7 +498,7 @@ static void print_other_cpu_stall(unsigned long gp_seq, unsigned long gps) ndetected++; } } - raw_spin_unlock_irqrestore_rcu_node(rnp, flags); + ndetected += rcu_print_task_stall(rnp, flags); // Releases rnp->lock. } for_each_possible_cpu(cpu) diff --git a/kernel/reboot.c b/kernel/reboot.c index e7b78d5ae1ab..2a18b76ffc06 100644 --- a/kernel/reboot.c +++ b/kernel/reboot.c @@ -244,6 +244,8 @@ void migrate_to_reboot_cpu(void) void kernel_restart(char *cmd) { kernel_restart_prepare(cmd); + if (pm_power_off_prepare) + pm_power_off_prepare(); migrate_to_reboot_cpu(); syscore_shutdown(); if (!cmd) @@ -551,22 +553,22 @@ static int __init reboot_setup(char *str) break; case 's': - { - int rc; - - if (isdigit(*(str+1))) { - rc = kstrtoint(str+1, 0, &reboot_cpu); - if (rc) - return rc; - } else if (str[1] == 'm' && str[2] == 'p' && - isdigit(*(str+3))) { - rc = kstrtoint(str+3, 0, &reboot_cpu); - if (rc) - return rc; - } else + if (isdigit(*(str+1))) + reboot_cpu = simple_strtoul(str+1, NULL, 0); + else if (str[1] == 'm' && str[2] == 'p' && + isdigit(*(str+3))) + reboot_cpu = simple_strtoul(str+3, NULL, 0); + else *mode = REBOOT_SOFT; + if (reboot_cpu >= num_possible_cpus()) { + pr_err("Ignoring the CPU number in reboot= option. " + "CPU %d exceeds possible cpu number %d\n", + reboot_cpu, num_possible_cpus()); + reboot_cpu = 0; + break; + } break; - } + case 'g': *mode = REBOOT_GPIO; break; diff --git a/kernel/resource.c b/kernel/resource.c index 3ae2f56cc79d..82df80417489 100644 --- a/kernel/resource.c +++ b/kernel/resource.c @@ -557,13 +557,13 @@ int region_intersects(resource_size_t start, size_t size, unsigned long flags, } read_unlock(&resource_lock); - if (other == 0) - return type ? 
REGION_INTERSECTS : REGION_DISJOINT; + if (type == 0) + return REGION_DISJOINT; - if (type) - return REGION_MIXED; + if (other == 0) + return REGION_INTERSECTS; - return REGION_DISJOINT; + return REGION_MIXED; } EXPORT_SYMBOL_GPL(region_intersects); diff --git a/kernel/resource_kunit.c b/kernel/resource_kunit.c new file mode 100644 index 000000000000..58ab9f914602 --- /dev/null +++ b/kernel/resource_kunit.c @@ -0,0 +1,152 @@ +// SPDX-License-Identifier: GPL-2.0+ +/* + * Test cases for API provided by resource.c and ioport.h + */ + +#include <kunit/test.h> +#include <linux/ioport.h> +#include <linux/kernel.h> +#include <linux/string.h> + +#define R0_START 0x0000 +#define R0_END 0xffff +#define R1_START 0x1234 +#define R1_END 0x2345 +#define R2_START 0x4567 +#define R2_END 0x5678 +#define R3_START 0x6789 +#define R3_END 0x789a +#define R4_START 0x2000 +#define R4_END 0x7000 + +static struct resource r0 = { .start = R0_START, .end = R0_END }; +static struct resource r1 = { .start = R1_START, .end = R1_END }; +static struct resource r2 = { .start = R2_START, .end = R2_END }; +static struct resource r3 = { .start = R3_START, .end = R3_END }; +static struct resource r4 = { .start = R4_START, .end = R4_END }; + +struct result { + struct resource *r1; + struct resource *r2; + struct resource r; + bool ret; +}; + +static struct result results_for_union[] = { + { + .r1 = &r1, .r2 = &r0, .r.start = R0_START, .r.end = R0_END, .ret = true, + }, { + .r1 = &r2, .r2 = &r0, .r.start = R0_START, .r.end = R0_END, .ret = true, + }, { + .r1 = &r3, .r2 = &r0, .r.start = R0_START, .r.end = R0_END, .ret = true, + }, { + .r1 = &r4, .r2 = &r0, .r.start = R0_START, .r.end = R0_END, .ret = true, + }, { + .r1 = &r2, .r2 = &r1, .ret = false, + }, { + .r1 = &r3, .r2 = &r1, .ret = false, + }, { + .r1 = &r4, .r2 = &r1, .r.start = R1_START, .r.end = R4_END, .ret = true, + }, { + .r1 = &r2, .r2 = &r3, .ret = false, + }, { + .r1 = &r2, .r2 = &r4, .r.start = R4_START, .r.end = R4_END, .ret = true, + }, { + .r1 = &r3, .r2 = &r4, .r.start = R4_START, .r.end = R3_END, .ret = true, + }, +}; + +static struct result results_for_intersection[] = { + { + .r1 = &r1, .r2 = &r0, .r.start = R1_START, .r.end = R1_END, .ret = true, + }, { + .r1 = &r2, .r2 = &r0, .r.start = R2_START, .r.end = R2_END, .ret = true, + }, { + .r1 = &r3, .r2 = &r0, .r.start = R3_START, .r.end = R3_END, .ret = true, + }, { + .r1 = &r4, .r2 = &r0, .r.start = R4_START, .r.end = R4_END, .ret = true, + }, { + .r1 = &r2, .r2 = &r1, .ret = false, + }, { + .r1 = &r3, .r2 = &r1, .ret = false, + }, { + .r1 = &r4, .r2 = &r1, .r.start = R4_START, .r.end = R1_END, .ret = true, + }, { + .r1 = &r2, .r2 = &r3, .ret = false, + }, { + .r1 = &r2, .r2 = &r4, .r.start = R2_START, .r.end = R2_END, .ret = true, + }, { + .r1 = &r3, .r2 = &r4, .r.start = R3_START, .r.end = R4_END, .ret = true, + }, +}; + +static void resource_do_test(struct kunit *test, bool ret, struct resource *r, + bool exp_ret, struct resource *exp_r, + struct resource *r1, struct resource *r2) +{ + KUNIT_EXPECT_EQ_MSG(test, ret, exp_ret, "Resources %pR %pR", r1, r2); + KUNIT_EXPECT_EQ_MSG(test, r->start, exp_r->start, "Start elements are not equal"); + KUNIT_EXPECT_EQ_MSG(test, r->end, exp_r->end, "End elements are not equal"); +} + +static void resource_do_union_test(struct kunit *test, struct result *r) +{ + struct resource result; + bool ret; + + memset(&result, 0, sizeof(result)); + ret = resource_union(r->r1, r->r2, &result); + resource_do_test(test, ret, &result, r->ret, &r->r, r->r1, r->r2); + + 
memset(&result, 0, sizeof(result)); + ret = resource_union(r->r2, r->r1, &result); + resource_do_test(test, ret, &result, r->ret, &r->r, r->r2, r->r1); +} + +static void resource_test_union(struct kunit *test) +{ + struct result *r = results_for_union; + unsigned int i = 0; + + do { + resource_do_union_test(test, &r[i]); + } while (++i < ARRAY_SIZE(results_for_union)); +} + +static void resource_do_intersection_test(struct kunit *test, struct result *r) +{ + struct resource result; + bool ret; + + memset(&result, 0, sizeof(result)); + ret = resource_intersection(r->r1, r->r2, &result); + resource_do_test(test, ret, &result, r->ret, &r->r, r->r1, r->r2); + + memset(&result, 0, sizeof(result)); + ret = resource_intersection(r->r2, r->r1, &result); + resource_do_test(test, ret, &result, r->ret, &r->r, r->r2, r->r1); +} + +static void resource_test_intersection(struct kunit *test) +{ + struct result *r = results_for_intersection; + unsigned int i = 0; + + do { + resource_do_intersection_test(test, &r[i]); + } while (++i < ARRAY_SIZE(results_for_intersection)); +} + +static struct kunit_case resource_test_cases[] = { + KUNIT_CASE(resource_test_union), + KUNIT_CASE(resource_test_intersection), + {} +}; + +static struct kunit_suite resource_test_suite = { + .name = "resource", + .test_cases = resource_test_cases, +}; +kunit_test_suite(resource_test_suite); + +MODULE_LICENSE("GPL"); diff --git a/kernel/scftorture.c b/kernel/scftorture.c index 554a521ee235..d55a9f8cda3d 100644 --- a/kernel/scftorture.c +++ b/kernel/scftorture.c @@ -59,9 +59,10 @@ torture_param(int, onoff_holdoff, 0, "Time after boot before CPU hotplugs (s)"); torture_param(int, onoff_interval, 0, "Time between CPU hotplugs (s), 0=disable"); torture_param(int, shutdown_secs, 0, "Shutdown time (ms), <= zero to disable."); torture_param(int, stat_interval, 60, "Number of seconds between stats printk()s."); -torture_param(int, stutter_cpus, 5, "Number of jiffies to change CPUs under test, 0=disable"); +torture_param(int, stutter, 5, "Number of jiffies to run/halt test, 0=disable"); torture_param(bool, use_cpus_read_lock, 0, "Use cpus_read_lock() to exclude CPU hotplug."); torture_param(int, verbose, 0, "Enable verbose debugging printk()s"); +torture_param(int, weight_resched, -1, "Testing weight for resched_cpu() operations."); torture_param(int, weight_single, -1, "Testing weight for single-CPU no-wait operations."); torture_param(int, weight_single_wait, -1, "Testing weight for single-CPU operations."); torture_param(int, weight_many, -1, "Testing weight for multi-CPU no-wait operations."); @@ -82,6 +83,7 @@ torture_param(bool, shutdown, SCFTORT_SHUTDOWN, "Shutdown at end of torture test struct scf_statistics { struct task_struct *task; int cpu; + long long n_resched; long long n_single; long long n_single_ofl; long long n_single_wait; @@ -97,12 +99,15 @@ static struct task_struct *scf_torture_stats_task; static DEFINE_PER_CPU(long long, scf_invoked_count); // Data for random primitive selection -#define SCF_PRIM_SINGLE 0 -#define SCF_PRIM_MANY 1 -#define SCF_PRIM_ALL 2 -#define SCF_NPRIMS (2 * 3) // Need wait and no-wait versions of each. +#define SCF_PRIM_RESCHED 0 +#define SCF_PRIM_SINGLE 1 +#define SCF_PRIM_MANY 2 +#define SCF_PRIM_ALL 3 +#define SCF_NPRIMS 7 // Need wait and no-wait versions of each, + // except for SCF_PRIM_RESCHED. 
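The results_for_union[] and results_for_intersection[] tables in resource_kunit.c above drive one checker through every (r1, r2) pair in both argument orders, which is the cheapest way to pin down a symmetric API. The same table-driven style works as a standalone userspace harness; a self-contained sketch with a simplified union that, like resource_overlaps(), only merges overlapping ranges (res_union() is illustrative, not the kernel helper):

#include <stdbool.h>
#include <stdio.h>

struct res { unsigned long start, end; };

static bool res_union(const struct res *a, const struct res *b, struct res *out)
{
        if (a->start > b->end || b->start > a->end)
                return false;           /* disjoint: no union */
        out->start = a->start < b->start ? a->start : b->start;
        out->end = a->end > b->end ? a->end : b->end;
        return true;
}

static const struct { struct res a, b; bool ret; } table[] = {
        { { 0x1234, 0x2345 }, { 0x2000, 0x7000 }, true },   /* cf. r1 + r4 */
        { { 0x1234, 0x2345 }, { 0x4567, 0x5678 }, false },  /* cf. r1 + r2 */
};

int main(void)
{
        for (unsigned int i = 0; i < sizeof(table) / sizeof(table[0]); i++) {
                struct res out = { 0, 0 };
                bool ret = res_union(&table[i].a, &table[i].b, &out);

                printf("case %u: ret=%d expected=%d [%#lx-%#lx]\n",
                       i, ret, table[i].ret, out.start, out.end);
        }
        return 0;
}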
static char *scf_prim_name[] = { + "resched_cpu", "smp_call_function_single", "smp_call_function_many", "smp_call_function", @@ -136,6 +141,8 @@ static char *bangstr = ""; static DEFINE_TORTURE_RANDOM_PERCPU(scf_torture_rand); +extern void resched_cpu(int cpu); // An alternative IPI vector. + // Print torture statistics. Caller must ensure serialization. static void scf_torture_stats_print(void) { @@ -148,6 +155,7 @@ static void scf_torture_stats_print(void) for_each_possible_cpu(cpu) invoked_count += data_race(per_cpu(scf_invoked_count, cpu)); for (i = 0; i < nthreads; i++) { + scfs.n_resched += scf_stats_p[i].n_resched; scfs.n_single += scf_stats_p[i].n_single; scfs.n_single_ofl += scf_stats_p[i].n_single_ofl; scfs.n_single_wait += scf_stats_p[i].n_single_wait; @@ -160,8 +168,8 @@ static void scf_torture_stats_print(void) if (atomic_read(&n_errs) || atomic_read(&n_mb_in_errs) || atomic_read(&n_mb_out_errs) || atomic_read(&n_alloc_errs)) bangstr = "!!! "; - pr_alert("%s %sscf_invoked_count %s: %lld single: %lld/%lld single_ofl: %lld/%lld many: %lld/%lld all: %lld/%lld ", - SCFTORT_FLAG, bangstr, isdone ? "VER" : "ver", invoked_count, + pr_alert("%s %sscf_invoked_count %s: %lld resched: %lld single: %lld/%lld single_ofl: %lld/%lld many: %lld/%lld all: %lld/%lld ", + SCFTORT_FLAG, bangstr, isdone ? "VER" : "ver", invoked_count, scfs.n_resched, scfs.n_single, scfs.n_single_wait, scfs.n_single_ofl, scfs.n_single_wait_ofl, scfs.n_many, scfs.n_many_wait, scfs.n_all, scfs.n_all_wait); torture_onoff_stats(); @@ -314,6 +322,13 @@ static void scftorture_invoke_one(struct scf_statistics *scfp, struct torture_ra } } switch (scfsp->scfs_prim) { + case SCF_PRIM_RESCHED: + if (IS_BUILTIN(CONFIG_SCF_TORTURE_TEST)) { + cpu = torture_random(trsp) % nr_cpu_ids; + scfp->n_resched++; + resched_cpu(cpu); + } + break; case SCF_PRIM_SINGLE: cpu = torture_random(trsp) % nr_cpu_ids; if (scfsp->scfs_wait) @@ -421,6 +436,7 @@ static int scftorture_invoker(void *arg) was_offline = false; } cond_resched(); + stutter_wait("scftorture_invoker"); } while (!torture_must_stop()); VERBOSE_SCFTORTOUT("scftorture_invoker %d ended", scfp->cpu); @@ -433,8 +449,8 @@ static void scftorture_print_module_parms(const char *tag) { pr_alert(SCFTORT_FLAG - "--- %s: verbose=%d holdoff=%d longwait=%d nthreads=%d onoff_holdoff=%d onoff_interval=%d shutdown_secs=%d stat_interval=%d stutter_cpus=%d use_cpus_read_lock=%d, weight_single=%d, weight_single_wait=%d, weight_many=%d, weight_many_wait=%d, weight_all=%d, weight_all_wait=%d\n", tag, - verbose, holdoff, longwait, nthreads, onoff_holdoff, onoff_interval, shutdown, stat_interval, stutter_cpus, use_cpus_read_lock, weight_single, weight_single_wait, weight_many, weight_many_wait, weight_all, weight_all_wait); + "--- %s: verbose=%d holdoff=%d longwait=%d nthreads=%d onoff_holdoff=%d onoff_interval=%d shutdown_secs=%d stat_interval=%d stutter=%d use_cpus_read_lock=%d, weight_resched=%d, weight_single=%d, weight_single_wait=%d, weight_many=%d, weight_many_wait=%d, weight_all=%d, weight_all_wait=%d\n", tag, + verbose, holdoff, longwait, nthreads, onoff_holdoff, onoff_interval, shutdown, stat_interval, stutter, use_cpus_read_lock, weight_resched, weight_single, weight_single_wait, weight_many, weight_many_wait, weight_all, weight_all_wait); } static void scf_cleanup_handler(void *unused) @@ -475,6 +491,7 @@ static int __init scf_torture_init(void) { long i; int firsterr = 0; + unsigned long weight_resched1 = weight_resched; unsigned long weight_single1 = weight_single; unsigned long 
weight_single_wait1 = weight_single_wait; unsigned long weight_many1 = weight_many; @@ -487,9 +504,10 @@ static int __init scf_torture_init(void) scftorture_print_module_parms("Start of test"); - if (weight_single == -1 && weight_single_wait == -1 && + if (weight_resched == -1 && weight_single == -1 && weight_single_wait == -1 && weight_many == -1 && weight_many_wait == -1 && weight_all == -1 && weight_all_wait == -1) { + weight_resched1 = 2 * nr_cpu_ids; weight_single1 = 2 * nr_cpu_ids; weight_single_wait1 = 2 * nr_cpu_ids; weight_many1 = 2; @@ -497,6 +515,8 @@ static int __init scf_torture_init(void) weight_all1 = 1; weight_all_wait1 = 1; } else { + if (weight_resched == -1) + weight_resched1 = 0; if (weight_single == -1) weight_single1 = 0; if (weight_single_wait == -1) @@ -517,6 +537,10 @@ static int __init scf_torture_init(void) firsterr = -EINVAL; goto unwind; } + if (IS_BUILTIN(CONFIG_SCF_TORTURE_TEST)) + scf_sel_add(weight_resched1, SCF_PRIM_RESCHED, false); + else if (weight_resched1) + VERBOSE_SCFTORTOUT_ERRSTRING("built as module, weight_resched ignored"); scf_sel_add(weight_single1, SCF_PRIM_SINGLE, false); scf_sel_add(weight_single_wait1, SCF_PRIM_SINGLE, true); scf_sel_add(weight_many1, SCF_PRIM_MANY, false); @@ -535,6 +559,11 @@ static int __init scf_torture_init(void) if (firsterr) goto unwind; } + if (stutter > 0) { + firsterr = torture_stutter_init(stutter, stutter); + if (firsterr) + goto unwind; + } // Worker tasks invoking smp_call_function(). if (nthreads < 0) diff --git a/kernel/sched/core.c b/kernel/sched/core.c index d2003a7d5ab5..21b548b69455 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -97,7 +97,7 @@ int sysctl_sched_rt_runtime = 950000; * * Normal scheduling state is serialized by rq->lock. __schedule() takes the * local CPU's rq->lock, it optionally removes the task from the runqueue and - * always looks at the local rq data structures to find the most elegible task + * always looks at the local rq data structures to find the most eligible task * to run next. * * Task enqueue is also under rq->lock, possibly taken from another CPU. @@ -320,14 +320,6 @@ void update_rq_clock(struct rq *rq) update_rq_clock_task(rq, delta); } -static inline void -rq_csd_init(struct rq *rq, call_single_data_t *csd, smp_call_func_t func) -{ - csd->flags = 0; - csd->func = func; - csd->info = rq; -} - #ifdef CONFIG_SCHED_HRTICK /* * Use HR-timers to deliver accurate preemption points. @@ -428,7 +420,7 @@ void hrtick_start(struct rq *rq, u64 delay) static void hrtick_rq_init(struct rq *rq) { #ifdef CONFIG_SMP - rq_csd_init(rq, &rq->hrtick_csd, __hrtick_start); + INIT_CSD(&rq->hrtick_csd, __hrtick_start, rq); #endif hrtimer_init(&rq->hrtick_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_HARD); rq->hrtick_timer.function = hrtick; @@ -518,7 +510,7 @@ static bool __wake_q_add(struct wake_q_head *head, struct task_struct *task) /* * Atomically grab the task, if ->wake_q is !nil already it means - * its already queued (either by us or someone else) and will get the + * it's already queued (either by us or someone else) and will get the * wakeup due to that. * * In order to ensure that a pending wakeup will observe our pending @@ -769,7 +761,7 @@ bool sched_can_stop_tick(struct rq *rq) return false; /* - * If there are more than one RR tasks, we need the tick to effect the + * If there are more than one RR tasks, we need the tick to affect the * actual RR behaviour. 
*/ if (rq->rt.rr_nr_running) { @@ -1187,14 +1179,14 @@ static inline void uclamp_rq_dec_id(struct rq *rq, struct task_struct *p, * accounting was performed at enqueue time and we can just return * here. * - * Need to be careful of the following enqeueue/dequeue ordering + * Need to be careful of the following enqueue/dequeue ordering * problem too * * enqueue(taskA) * // sched_uclamp_used gets enabled * enqueue(taskB) * dequeue(taskA) - * // Must not decrement bukcet->tasks here + * // Must not decrement bucket->tasks here * dequeue(taskB) * * where we could end up with stale data in uc_se and @@ -1413,17 +1405,24 @@ done: static int uclamp_validate(struct task_struct *p, const struct sched_attr *attr) { - unsigned int lower_bound = p->uclamp_req[UCLAMP_MIN].value; - unsigned int upper_bound = p->uclamp_req[UCLAMP_MAX].value; + int util_min = p->uclamp_req[UCLAMP_MIN].value; + int util_max = p->uclamp_req[UCLAMP_MAX].value; - if (attr->sched_flags & SCHED_FLAG_UTIL_CLAMP_MIN) - lower_bound = attr->sched_util_min; - if (attr->sched_flags & SCHED_FLAG_UTIL_CLAMP_MAX) - upper_bound = attr->sched_util_max; + if (attr->sched_flags & SCHED_FLAG_UTIL_CLAMP_MIN) { + util_min = attr->sched_util_min; - if (lower_bound > upper_bound) - return -EINVAL; - if (upper_bound > SCHED_CAPACITY_SCALE) + if (util_min + 1 > SCHED_CAPACITY_SCALE + 1) + return -EINVAL; + } + + if (attr->sched_flags & SCHED_FLAG_UTIL_CLAMP_MAX) { + util_max = attr->sched_util_max; + + if (util_max + 1 > SCHED_CAPACITY_SCALE + 1) + return -EINVAL; + } + + if (util_min != -1 && util_max != -1 && util_min > util_max) return -EINVAL; /* @@ -1438,20 +1437,41 @@ static int uclamp_validate(struct task_struct *p, return 0; } +static bool uclamp_reset(const struct sched_attr *attr, + enum uclamp_id clamp_id, + struct uclamp_se *uc_se) +{ + /* Reset on sched class change for a non user-defined clamp value. */ + if (likely(!(attr->sched_flags & SCHED_FLAG_UTIL_CLAMP)) && + !uc_se->user_defined) + return true; + + /* Reset on sched_util_{min,max} == -1. */ + if (clamp_id == UCLAMP_MIN && + attr->sched_flags & SCHED_FLAG_UTIL_CLAMP_MIN && + attr->sched_util_min == -1) { + return true; + } + + if (clamp_id == UCLAMP_MAX && + attr->sched_flags & SCHED_FLAG_UTIL_CLAMP_MAX && + attr->sched_util_max == -1) { + return true; + } + + return false; +} + static void __setscheduler_uclamp(struct task_struct *p, const struct sched_attr *attr) { enum uclamp_id clamp_id; - /* - * On scheduling class change, reset to default clamps for tasks - * without a task-specific value. - */ for_each_clamp_id(clamp_id) { struct uclamp_se *uc_se = &p->uclamp_req[clamp_id]; + unsigned int value; - /* Keep using defined clamps across class changes */ - if (uc_se->user_defined) + if (!uclamp_reset(attr, clamp_id, uc_se)) continue; /* @@ -1459,21 +1479,25 @@ static void __setscheduler_uclamp(struct task_struct *p, * at runtime. 
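The odd-looking util_min + 1 > SCHED_CAPACITY_SCALE + 1 comparison above is how a signed -1 sentinel (meaning "reset to default") coexists with the range check: -1 + 1 == 0 always passes, while any value above SCHED_CAPACITY_SCALE still fails. A quick runnable restatement of the idiom (1024 is the kernel's SCHED_CAPACITY_SCALE):

#include <stdio.h>

#define SCHED_CAPACITY_SCALE 1024

static int clamp_value_ok(int v)
{
        /* -1 (reset) passes; 0..SCHED_CAPACITY_SCALE passes; larger fails,
         * mirroring the arithmetic in uclamp_validate(). */
        return !(v + 1 > SCHED_CAPACITY_SCALE + 1);
}

int main(void)
{
        int samples[] = { -1, 0, 512, 1024, 1025 };

        for (unsigned int i = 0; i < sizeof(samples) / sizeof(samples[0]); i++)
                printf("%5d -> %s\n", samples[i],
                       clamp_value_ok(samples[i]) ? "ok" : "-EINVAL");
        return 0;
}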
*/ if (unlikely(rt_task(p) && clamp_id == UCLAMP_MIN)) - __uclamp_update_util_min_rt_default(p); + value = sysctl_sched_uclamp_util_min_rt_default; else - uclamp_se_set(uc_se, uclamp_none(clamp_id), false); + value = uclamp_none(clamp_id); + + uclamp_se_set(uc_se, value, false); } if (likely(!(attr->sched_flags & SCHED_FLAG_UTIL_CLAMP))) return; - if (attr->sched_flags & SCHED_FLAG_UTIL_CLAMP_MIN) { + if (attr->sched_flags & SCHED_FLAG_UTIL_CLAMP_MIN && + attr->sched_util_min != -1) { uclamp_se_set(&p->uclamp_req[UCLAMP_MIN], attr->sched_util_min, true); } - if (attr->sched_flags & SCHED_FLAG_UTIL_CLAMP_MAX) { + if (attr->sched_flags & SCHED_FLAG_UTIL_CLAMP_MAX && + attr->sched_util_max != -1) { uclamp_se_set(&p->uclamp_req[UCLAMP_MAX], attr->sched_util_max, true); } @@ -1696,6 +1720,76 @@ void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags) #ifdef CONFIG_SMP +static void +__do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask, u32 flags); + +static int __set_cpus_allowed_ptr(struct task_struct *p, + const struct cpumask *new_mask, + u32 flags); + +static void migrate_disable_switch(struct rq *rq, struct task_struct *p) +{ + if (likely(!p->migration_disabled)) + return; + + if (p->cpus_ptr != &p->cpus_mask) + return; + + /* + * Violates locking rules! see comment in __do_set_cpus_allowed(). + */ + __do_set_cpus_allowed(p, cpumask_of(rq->cpu), SCA_MIGRATE_DISABLE); +} + +void migrate_disable(void) +{ + struct task_struct *p = current; + + if (p->migration_disabled) { + p->migration_disabled++; + return; + } + + preempt_disable(); + this_rq()->nr_pinned++; + p->migration_disabled = 1; + preempt_enable(); +} +EXPORT_SYMBOL_GPL(migrate_disable); + +void migrate_enable(void) +{ + struct task_struct *p = current; + + if (p->migration_disabled > 1) { + p->migration_disabled--; + return; + } + + /* + * Ensure stop_task runs either before or after this, and that + * __set_cpus_allowed_ptr(SCA_MIGRATE_ENABLE) doesn't schedule(). + */ + preempt_disable(); + if (p->cpus_ptr != &p->cpus_mask) + __set_cpus_allowed_ptr(p, &p->cpus_mask, SCA_MIGRATE_ENABLE); + /* + * Mustn't clear migration_disabled() until cpus_ptr points back at the + * regular cpus_mask, otherwise things that race (eg. + * select_fallback_rq) get confused. + */ + barrier(); + p->migration_disabled = 0; + this_rq()->nr_pinned--; + preempt_enable(); +} +EXPORT_SYMBOL_GPL(migrate_enable); + +static inline bool rq_has_pinned_tasks(struct rq *rq) +{ + return rq->nr_pinned; +} + /* * Per-CPU kthreads are allowed to run on !active && online CPUs, see * __set_cpus_allowed_ptr() and select_fallback_rq(). 
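migrate_disable()/migrate_enable() nest by way of a plain per-task counter; only the outermost transitions (0 to 1, and back to 0) pin and unpin the task, and only the outermost enable pays for the possible cpumask fixup. The counting scheme reduces to a few lines of portable C (the structure and helpers here are illustrative, not the kernel's):

#include <assert.h>
#include <stdio.h>

static struct { int migration_disabled; int nr_pinned; } cur;

static void migrate_disable(void)
{
        if (cur.migration_disabled++)
                return;                 /* nested call: just count */
        cur.nr_pinned++;                /* outermost: pin to this CPU */
}

static void migrate_enable(void)
{
        if (--cur.migration_disabled)
                return;                 /* still nested */
        cur.nr_pinned--;                /* outermost enable: unpin */
}

int main(void)
{
        migrate_disable();
        migrate_disable();              /* nests without extra pinning */
        migrate_enable();
        migrate_enable();
        assert(cur.migration_disabled == 0 && cur.nr_pinned == 0);
        printf("balanced\n");
        return 0;
}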
@@ -1705,7 +1799,7 @@ static inline bool is_cpu_allowed(struct task_struct *p, int cpu) if (!cpumask_test_cpu(cpu, p->cpus_ptr)) return false; - if (is_per_cpu_kthread(p)) + if (is_per_cpu_kthread(p) || is_migration_disabled(p)) return cpu_online(cpu); return cpu_active(cpu); @@ -1750,8 +1844,16 @@ static struct rq *move_queued_task(struct rq *rq, struct rq_flags *rf, } struct migration_arg { - struct task_struct *task; - int dest_cpu; + struct task_struct *task; + int dest_cpu; + struct set_affinity_pending *pending; +}; + +struct set_affinity_pending { + refcount_t refs; + struct completion done; + struct cpu_stop_work stop_work; + struct migration_arg arg; }; /* @@ -1783,16 +1885,19 @@ static struct rq *__migrate_task(struct rq *rq, struct rq_flags *rf, */ static int migration_cpu_stop(void *data) { + struct set_affinity_pending *pending; struct migration_arg *arg = data; struct task_struct *p = arg->task; + int dest_cpu = arg->dest_cpu; struct rq *rq = this_rq(); + bool complete = false; struct rq_flags rf; /* * The original target CPU might have gone down and we might * be on another CPU but it doesn't matter. */ - local_irq_disable(); + local_irq_save(rf.flags); /* * We need to explicitly wake pending tasks before running * __migrate_task() such that we will not miss enforcing cpus_ptr @@ -1802,21 +1907,137 @@ static int migration_cpu_stop(void *data) raw_spin_lock(&p->pi_lock); rq_lock(rq, &rf); + + pending = p->migration_pending; /* * If task_rq(p) != rq, it cannot be migrated here, because we're * holding rq->lock, if p->on_rq == 0 it cannot get enqueued because * we're holding p->pi_lock. */ if (task_rq(p) == rq) { + if (is_migration_disabled(p)) + goto out; + + if (pending) { + p->migration_pending = NULL; + complete = true; + } + + /* migrate_enable() -- we must not race against SCA */ + if (dest_cpu < 0) { + /* + * When this was migrate_enable() but we no longer + * have a @pending, a concurrent SCA 'fixed' things + * and we should be valid again. Nothing to do. + */ + if (!pending) { + WARN_ON_ONCE(!cpumask_test_cpu(task_cpu(p), &p->cpus_mask)); + goto out; + } + + dest_cpu = cpumask_any_distribute(&p->cpus_mask); + } + if (task_on_rq_queued(p)) - rq = __migrate_task(rq, &rf, p, arg->dest_cpu); + rq = __migrate_task(rq, &rf, p, dest_cpu); else - p->wake_cpu = arg->dest_cpu; + p->wake_cpu = dest_cpu; + + } else if (dest_cpu < 0 || pending) { + /* + * This happens when we get migrated between migrate_enable()'s + * preempt_enable() and scheduling the stopper task. At that + * point we're a regular task again and not current anymore. + * + * A !PREEMPT kernel has a giant hole here, which makes it far + * more likely. + */ + + /* + * The task moved before the stopper got to run. We're holding + * ->pi_lock, so the allowed mask is stable - if it got + * somewhere allowed, we're done. + */ + if (pending && cpumask_test_cpu(task_cpu(p), p->cpus_ptr)) { + p->migration_pending = NULL; + complete = true; + goto out; + } + + /* + * When this was migrate_enable() but we no longer have an + * @pending, a concurrent SCA 'fixed' things and we should be + * valid again. Nothing to do. + */ + if (!pending) { + WARN_ON_ONCE(!cpumask_test_cpu(task_cpu(p), &p->cpus_mask)); + goto out; + } + + /* + * When migrate_enable() hits a rq mis-match we can't reliably + * determine is_migration_disabled() and so have to chase after + * it. 
+ */ + task_rq_unlock(rq, p, &rf); + stop_one_cpu_nowait(task_cpu(p), migration_cpu_stop, + &pending->arg, &pending->stop_work); + return 0; } - rq_unlock(rq, &rf); - raw_spin_unlock(&p->pi_lock); +out: + task_rq_unlock(rq, p, &rf); + + if (complete) + complete_all(&pending->done); + + /* For pending->{arg,stop_work} */ + pending = arg->pending; + if (pending && refcount_dec_and_test(&pending->refs)) + wake_up_var(&pending->refs); - local_irq_enable(); + return 0; +} + +int push_cpu_stop(void *arg) +{ + struct rq *lowest_rq = NULL, *rq = this_rq(); + struct task_struct *p = arg; + + raw_spin_lock_irq(&p->pi_lock); + raw_spin_lock(&rq->lock); + + if (task_rq(p) != rq) + goto out_unlock; + + if (is_migration_disabled(p)) { + p->migration_flags |= MDF_PUSH; + goto out_unlock; + } + + p->migration_flags &= ~MDF_PUSH; + + if (p->sched_class->find_lock_rq) + lowest_rq = p->sched_class->find_lock_rq(p, rq); + + if (!lowest_rq) + goto out_unlock; + + // XXX validate p is still the highest prio task + if (task_rq(p) == rq) { + deactivate_task(rq, p, 0); + set_task_cpu(p, lowest_rq->cpu); + activate_task(lowest_rq, p, 0); + resched_curr(lowest_rq); + } + + double_unlock_balance(rq, lowest_rq); + +out_unlock: + rq->push_busy = false; + raw_spin_unlock(&rq->lock); + raw_spin_unlock_irq(&p->pi_lock); + + put_task_struct(p); return 0; } @@ -1824,18 +2045,39 @@ static int migration_cpu_stop(void *data) * sched_class::set_cpus_allowed must do the below, but is not required to * actually call this function. */ -void set_cpus_allowed_common(struct task_struct *p, const struct cpumask *new_mask) +void set_cpus_allowed_common(struct task_struct *p, const struct cpumask *new_mask, u32 flags) { + if (flags & (SCA_MIGRATE_ENABLE | SCA_MIGRATE_DISABLE)) { + p->cpus_ptr = new_mask; + return; + } + cpumask_copy(&p->cpus_mask, new_mask); p->nr_cpus_allowed = cpumask_weight(new_mask); } -void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask) +static void +__do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask, u32 flags) { struct rq *rq = task_rq(p); bool queued, running; - lockdep_assert_held(&p->pi_lock); + /* + * This here violates the locking rules for affinity, since we're only + * supposed to change these variables while holding both rq->lock and + * p->pi_lock. + * + * HOWEVER, it magically works, because ttwu() is the only code that + * accesses these variables under p->pi_lock and only does so after + * smp_cond_load_acquire(&p->on_cpu, !VAL), and we're in __schedule() + * before finish_task(). + * + * XXX do further audits, this smells like something putrid. + */ + if (flags & SCA_MIGRATE_DISABLE) + SCHED_WARN_ON(!p->on_cpu); + else + lockdep_assert_held(&p->pi_lock); queued = task_on_rq_queued(p); running = task_current(rq, p); @@ -1851,7 +2093,7 @@ void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask) if (running) put_prev_task(rq, p); - p->sched_class->set_cpus_allowed(p, new_mask); + p->sched_class->set_cpus_allowed(p, new_mask, flags); if (queued) enqueue_task(rq, p, ENQUEUE_RESTORE | ENQUEUE_NOCLOCK); @@ -1859,6 +2101,208 @@ void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask) set_next_task(rq, p); } +void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask) +{ + __do_set_cpus_allowed(p, new_mask, 0); +} + +/* + * This function is wildly self concurrent; here be dragons. 
+ * + * + * When given a valid mask, __set_cpus_allowed_ptr() must block until the + * designated task is enqueued on an allowed CPU. If that task is currently + * running, we have to kick it out using the CPU stopper. + * + * Migrate-Disable comes along and tramples all over our nice sandcastle. + * Consider: + * + * Initial conditions: P0->cpus_mask = [0, 1] + * + * P0@CPU0 P1 + * + * migrate_disable(); + * <preempted> + * set_cpus_allowed_ptr(P0, [1]); + * + * P1 *cannot* return from this set_cpus_allowed_ptr() call until P0 executes + * its outermost migrate_enable() (i.e. it exits its Migrate-Disable region). + * This means we need the following scheme: + * + * P0@CPU0 P1 + * + * migrate_disable(); + * <preempted> + * set_cpus_allowed_ptr(P0, [1]); + * <blocks> + * <resumes> + * migrate_enable(); + * __set_cpus_allowed_ptr(); + * <wakes local stopper> + * `--> <woken on migration completion> + * + * Now the fun stuff: there may be several P1-like tasks, i.e. multiple + * concurrent set_cpus_allowed_ptr(P0, [*]) calls. CPU affinity changes of any + * task p are serialized by p->pi_lock, which we can leverage: the one that + * should come into effect at the end of the Migrate-Disable region is the last + * one. This means we only need to track a single cpumask (i.e. p->cpus_mask), + * but we still need to properly signal those waiting tasks at the appropriate + * moment. + * + * This is implemented using struct set_affinity_pending. The first + * __set_cpus_allowed_ptr() caller within a given Migrate-Disable region will + * setup an instance of that struct and install it on the targeted task_struct. + * Any and all further callers will reuse that instance. Those then wait for + * a completion signaled at the tail of the CPU stopper callback (1), triggered + * on the end of the Migrate-Disable region (i.e. outermost migrate_enable()). + * + * + * (1) In the cases covered above. There is one more where the completion is + * signaled within affine_move_task() itself: when a subsequent affinity request + * cancels the need for an active migration. Consider: + * + * Initial conditions: P0->cpus_mask = [0, 1] + * + * P0@CPU0 P1 P2 + * + * migrate_disable(); + * <preempted> + * set_cpus_allowed_ptr(P0, [1]); + * <blocks> + * set_cpus_allowed_ptr(P0, [0, 1]); + * <signal completion> + * <awakes> + * + * Note that the above is safe vs a concurrent migrate_enable(), as any + * pending affinity completion is preceded by an uninstallation of + * p->migration_pending done with p->pi_lock held. + */ +static int affine_move_task(struct rq *rq, struct task_struct *p, struct rq_flags *rf, + int dest_cpu, unsigned int flags) +{ + struct set_affinity_pending my_pending = { }, *pending = NULL; + struct migration_arg arg = { + .task = p, + .dest_cpu = dest_cpu, + }; + bool complete = false; + + /* Can the task run on the task's current CPU? 
If so, we're done */ + if (cpumask_test_cpu(task_cpu(p), &p->cpus_mask)) { + struct task_struct *push_task = NULL; + + if ((flags & SCA_MIGRATE_ENABLE) && + (p->migration_flags & MDF_PUSH) && !rq->push_busy) { + rq->push_busy = true; + push_task = get_task_struct(p); + } + + pending = p->migration_pending; + if (pending) { + refcount_inc(&pending->refs); + p->migration_pending = NULL; + complete = true; + } + task_rq_unlock(rq, p, rf); + + if (push_task) { + stop_one_cpu_nowait(rq->cpu, push_cpu_stop, + p, &rq->push_work); + } + + if (complete) + goto do_complete; + + return 0; + } + + if (!(flags & SCA_MIGRATE_ENABLE)) { + /* serialized by p->pi_lock */ + if (!p->migration_pending) { + /* Install the request */ + refcount_set(&my_pending.refs, 1); + init_completion(&my_pending.done); + p->migration_pending = &my_pending; + } else { + pending = p->migration_pending; + refcount_inc(&pending->refs); + } + } + pending = p->migration_pending; + /* + * - !MIGRATE_ENABLE: + * we'll have installed a pending if there wasn't one already. + * + * - MIGRATE_ENABLE: + * we're here because the current CPU isn't matching anymore, + * the only way that can happen is because of a concurrent + * set_cpus_allowed_ptr() call, which should then still be + * pending completion. + * + * Either way, we really should have a @pending here. + */ + if (WARN_ON_ONCE(!pending)) { + task_rq_unlock(rq, p, rf); + return -EINVAL; + } + + if (flags & SCA_MIGRATE_ENABLE) { + + refcount_inc(&pending->refs); /* pending->{arg,stop_work} */ + p->migration_flags &= ~MDF_PUSH; + task_rq_unlock(rq, p, rf); + + pending->arg = (struct migration_arg) { + .task = p, + .dest_cpu = -1, + .pending = pending, + }; + + stop_one_cpu_nowait(cpu_of(rq), migration_cpu_stop, + &pending->arg, &pending->stop_work); + + return 0; + } + + if (task_running(rq, p) || p->state == TASK_WAKING) { + /* + * Lessen races (and headaches) by delegating + * is_migration_disabled(p) checks to the stopper, which will + * run on the same CPU as said p. + */ + task_rq_unlock(rq, p, rf); + stop_one_cpu(cpu_of(rq), migration_cpu_stop, &arg); + + } else { + + if (!is_migration_disabled(p)) { + if (task_on_rq_queued(p)) + rq = move_queued_task(rq, rf, p, dest_cpu); + + p->migration_pending = NULL; + complete = true; + } + task_rq_unlock(rq, p, rf); + +do_complete: + if (complete) + complete_all(&pending->done); + } + + wait_for_completion(&pending->done); + + if (refcount_dec_and_test(&pending->refs)) + wake_up_var(&pending->refs); + + /* + * Block the original owner of &pending until all subsequent callers + * have seen the completion and decremented the refcount + */ + wait_var_event(&my_pending.refs, !refcount_read(&my_pending.refs)); + + return 0; +} + /* * Change a given task's CPU affinity. Migrate the thread to a * proper CPU and schedule it away if the CPU it's executing on @@ -1869,7 +2313,8 @@ void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask) * call is not atomic; no spinlocks may be held. 
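affine_move_task() coordinates its waiters with a refcounted struct set_affinity_pending plus a completion: the stopper signals the completion, every user drops a reference, and the stack-allocated my_pending may only go out of scope once all references are gone, which is what the final wait_var_event() enforces. A compilable pthread sketch of that refcount-plus-completion shape (names mirror the kernel's, but the types are illustrative):

#include <pthread.h>
#include <stdatomic.h>
#include <stdio.h>

struct pending {
        atomic_int refs;
        pthread_mutex_t mtx;
        pthread_cond_t cond;
        int done;                       /* completion analogue */
};

static void put_pending(struct pending *p)
{
        /* cf. refcount_dec_and_test() + wake_up_var(): the last dropper
         * notifies anyone waiting for the references to reach zero. */
        if (atomic_fetch_sub(&p->refs, 1) == 1) {
                pthread_mutex_lock(&p->mtx);
                pthread_cond_broadcast(&p->cond);
                pthread_mutex_unlock(&p->mtx);
        }
}

static void *stopper(void *arg)         /* cf. migration_cpu_stop() */
{
        struct pending *p = arg;

        pthread_mutex_lock(&p->mtx);
        p->done = 1;                    /* cf. complete_all(&pending->done) */
        pthread_cond_broadcast(&p->cond);
        pthread_mutex_unlock(&p->mtx);
        put_pending(p);                 /* drop the stopper's reference */
        return NULL;
}

int main(void)
{
        struct pending my_pending = {
                .mtx = PTHREAD_MUTEX_INITIALIZER,
                .cond = PTHREAD_COND_INITIALIZER,
        };
        pthread_t th;

        atomic_init(&my_pending.refs, 2);       /* owner + stopper */
        pthread_create(&th, NULL, stopper, &my_pending);

        pthread_mutex_lock(&my_pending.mtx);    /* cf. wait_for_completion() */
        while (!my_pending.done)
                pthread_cond_wait(&my_pending.cond, &my_pending.mtx);
        pthread_mutex_unlock(&my_pending.mtx);

        put_pending(&my_pending);               /* drop the owner's reference */
        pthread_join(th, NULL);                 /* keeps my_pending alive long
                                                 * enough, cf. wait_var_event() */
        printf("affinity change complete\n");
        return 0;
}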
*/ static int __set_cpus_allowed_ptr(struct task_struct *p, - const struct cpumask *new_mask, bool check) + const struct cpumask *new_mask, + u32 flags) { const struct cpumask *cpu_valid_mask = cpu_active_mask; unsigned int dest_cpu; @@ -1880,9 +2325,14 @@ static int __set_cpus_allowed_ptr(struct task_struct *p, rq = task_rq_lock(p, &rf); update_rq_clock(rq); - if (p->flags & PF_KTHREAD) { + if (p->flags & PF_KTHREAD || is_migration_disabled(p)) { /* - * Kernel threads are allowed on online && !active CPUs + * Kernel threads are allowed on online && !active CPUs. + * + * Specifically, migration_disabled() tasks must not fail the + * cpumask_any_and_distribute() pick below, esp. so on + * SCA_MIGRATE_ENABLE, otherwise we'll not call + * set_cpus_allowed_common() and actually reset p->cpus_ptr. */ cpu_valid_mask = cpu_online_mask; } @@ -1891,13 +2341,22 @@ static int __set_cpus_allowed_ptr(struct task_struct *p, * Must re-check here, to close a race against __kthread_bind(), * sched_setaffinity() is not guaranteed to observe the flag. */ - if (check && (p->flags & PF_NO_SETAFFINITY)) { + if ((flags & SCA_CHECK) && (p->flags & PF_NO_SETAFFINITY)) { ret = -EINVAL; goto out; } - if (cpumask_equal(&p->cpus_mask, new_mask)) - goto out; + if (!(flags & SCA_MIGRATE_ENABLE)) { + if (cpumask_equal(&p->cpus_mask, new_mask)) + goto out; + + if (WARN_ON_ONCE(p == current && + is_migration_disabled(p) && + !cpumask_test_cpu(task_cpu(p), new_mask))) { + ret = -EBUSY; + goto out; + } + } /* * Picking a ~random cpu helps in cases where we are changing affinity @@ -1910,7 +2369,7 @@ static int __set_cpus_allowed_ptr(struct task_struct *p, goto out; } - do_set_cpus_allowed(p, new_mask); + __do_set_cpus_allowed(p, new_mask, flags); if (p->flags & PF_KTHREAD) { /* @@ -1922,23 +2381,8 @@ static int __set_cpus_allowed_ptr(struct task_struct *p, p->nr_cpus_allowed != 1); } - /* Can the task run on the task's current CPU? If so, we're done */ - if (cpumask_test_cpu(task_cpu(p), new_mask)) - goto out; + return affine_move_task(rq, p, &rf, dest_cpu, flags); - if (task_running(rq, p) || p->state == TASK_WAKING) { - struct migration_arg arg = { p, dest_cpu }; - /* Need help from migration thread: drop lock and wait. */ - task_rq_unlock(rq, p, &rf); - stop_one_cpu(cpu_of(rq), migration_cpu_stop, &arg); - return 0; - } else if (task_on_rq_queued(p)) { - /* - * OK, since we're going to drop the lock immediately - * afterwards anyway. - */ - rq = move_queued_task(rq, &rf, p, dest_cpu); - } out: task_rq_unlock(rq, p, &rf); @@ -1947,7 +2391,7 @@ out: int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask) { - return __set_cpus_allowed_ptr(p, new_mask, false); + return __set_cpus_allowed_ptr(p, new_mask, 0); } EXPORT_SYMBOL_GPL(set_cpus_allowed_ptr); @@ -1988,6 +2432,8 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu) * Clearly, migrating tasks to offline CPUs is a fairly daft thing. */ WARN_ON_ONCE(!cpu_online(new_cpu)); + + WARN_ON_ONCE(is_migration_disabled(p)); #endif trace_sched_migrate_task(p, new_cpu); @@ -2318,6 +2764,12 @@ static int select_fallback_rq(int cpu, struct task_struct *p) } fallthrough; case possible: + /* + * XXX When called from select_task_rq() we only + * hold p->pi_lock and again violate locking order. + * + * More yuck to audit. + */ do_set_cpus_allowed(p, cpu_possible_mask); state = fail; break; @@ -2348,12 +2800,12 @@ out: * The caller (fork, wakeup) owns p->pi_lock, ->cpus_ptr is stable. 
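Threading a u32 flags word (SCA_CHECK, SCA_MIGRATE_ENABLE, SCA_MIGRATE_DISABLE) through __set_cpus_allowed_ptr() in place of the old bool check is the usual trick for keeping an internal API extensible without growing its argument list. A minimal illustration of the conversion (generic names, not the scheduler's):

#include <stdio.h>

#define FOO_CHECK       0x01    /* was: bool check */
#define FOO_VERBOSE     0x02    /* later addition, no new parameter needed */

static int foo_apply(int value, unsigned int flags)
{
        if ((flags & FOO_CHECK) && value < 0)
                return -1;
        if (flags & FOO_VERBOSE)
                printf("applying %d\n", value);
        return 0;
}

int main(void)
{
        foo_apply(42, 0);                       /* old foo_apply(42, false) */
        if (foo_apply(-1, FOO_CHECK | FOO_VERBOSE))
                printf("rejected\n");           /* old foo_apply(-1, true) */
        return 0;
}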
*/ static inline -int select_task_rq(struct task_struct *p, int cpu, int sd_flags, int wake_flags) +int select_task_rq(struct task_struct *p, int cpu, int wake_flags) { lockdep_assert_held(&p->pi_lock); - if (p->nr_cpus_allowed > 1) - cpu = p->sched_class->select_task_rq(p, cpu, sd_flags, wake_flags); + if (p->nr_cpus_allowed > 1 && !is_migration_disabled(p)) + cpu = p->sched_class->select_task_rq(p, cpu, wake_flags); else cpu = cpumask_any(p->cpus_ptr); @@ -2375,6 +2827,7 @@ int select_task_rq(struct task_struct *p, int cpu, int sd_flags, int wake_flags) void sched_set_stop_task(int cpu, struct task_struct *stop) { + static struct lock_class_key stop_pi_lock; struct sched_param param = { .sched_priority = MAX_RT_PRIO - 1 }; struct task_struct *old_stop = cpu_rq(cpu)->stop; @@ -2390,6 +2843,20 @@ void sched_set_stop_task(int cpu, struct task_struct *stop) sched_setscheduler_nocheck(stop, SCHED_FIFO, ¶m); stop->sched_class = &stop_sched_class; + + /* + * The PI code calls rt_mutex_setprio() with ->pi_lock held to + * adjust the effective priority of a task. As a result, + * rt_mutex_setprio() can trigger (RT) balancing operations, + * which can then trigger wakeups of the stop thread to push + * around the current task. + * + * The stop task itself will never be part of the PI-chain, it + * never blocks, therefore that ->pi_lock recursion is safe. + * Tell lockdep about this by placing the stop->pi_lock in its + * own class. + */ + lockdep_set_class(&stop->pi_lock, &stop_pi_lock); } cpu_rq(cpu)->stop = stop; @@ -2403,15 +2870,23 @@ void sched_set_stop_task(int cpu, struct task_struct *stop) } } -#else +#else /* CONFIG_SMP */ static inline int __set_cpus_allowed_ptr(struct task_struct *p, - const struct cpumask *new_mask, bool check) + const struct cpumask *new_mask, + u32 flags) { return set_cpus_allowed_ptr(p, new_mask); } -#endif /* CONFIG_SMP */ +static inline void migrate_disable_switch(struct rq *rq, struct task_struct *p) { } + +static inline bool rq_has_pinned_tasks(struct rq *rq) +{ + return false; +} + +#endif /* !CONFIG_SMP */ static void ttwu_stat(struct task_struct *p, int cpu, int wake_flags) @@ -2465,7 +2940,7 @@ static void ttwu_do_wakeup(struct rq *rq, struct task_struct *p, int wake_flags, #ifdef CONFIG_SMP if (p->sched_class->task_woken) { /* - * Our task @p is fully woken up and running; so its safe to + * Our task @p is fully woken up and running; so it's safe to * drop the rq->lock, hereafter rq is only used for statistics. */ rq_unpin_lock(rq, rf); @@ -2501,7 +2976,12 @@ ttwu_do_activate(struct rq *rq, struct task_struct *p, int wake_flags, #ifdef CONFIG_SMP if (wake_flags & WF_MIGRATED) en_flags |= ENQUEUE_MIGRATED; + else #endif + if (p->in_iowait) { + delayacct_blkio_end(p); + atomic_dec(&task_rq(p)->nr_iowait); + } activate_task(rq, p, en_flags); ttwu_do_wakeup(rq, p, wake_flags, rf); @@ -2888,11 +3368,6 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags) if (READ_ONCE(p->on_rq) && ttwu_runnable(p, wake_flags)) goto unlock; - if (p->in_iowait) { - delayacct_blkio_end(p); - atomic_dec(&task_rq(p)->nr_iowait); - } - #ifdef CONFIG_SMP /* * Ensure we load p->on_cpu _after_ p->on_rq, otherwise it would be @@ -2952,7 +3427,7 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags) /* * If the owning (remote) CPU is still in the middle of schedule() with - * this task as prev, wait until its done referencing the task. + * this task as prev, wait until it's done referencing the task. 
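The ->on_cpu handoff described in this comment is a release/acquire pairing: the previous owner clears ->on_cpu with a releasing store in finish_task(), and the waker spins with acquiring loads so that everything the remote schedule() did is visible before the wakeup proceeds. In portable C11 atomics (a compilable stand-in for smp_store_release()/smp_cond_load_acquire()):

#include <pthread.h>
#include <stdatomic.h>
#include <stdio.h>

static atomic_int on_cpu = 1;
static int task_state;                  /* data protected by the handoff */

static void *prev_owner(void *arg)      /* cf. schedule() -> finish_task() */
{
        (void)arg;
        task_state = 42;                /* must be visible to the waker ... */
        atomic_store_explicit(&on_cpu, 0, memory_order_release);
        return NULL;
}

int main(void)
{
        pthread_t th;

        pthread_create(&th, NULL, prev_owner, NULL);

        /* cf. smp_cond_load_acquire(&p->on_cpu, !VAL) */
        while (atomic_load_explicit(&on_cpu, memory_order_acquire))
                ;

        printf("task_state = %d\n", task_state);        /* ... and it is: 42 */
        pthread_join(th, NULL);
        return 0;
}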
* * Pairs with the smp_store_release() in finish_task(). * @@ -2961,8 +3436,13 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags) */ smp_cond_load_acquire(&p->on_cpu, !VAL); - cpu = select_task_rq(p, p->wake_cpu, SD_BALANCE_WAKE, wake_flags); + cpu = select_task_rq(p, p->wake_cpu, wake_flags | WF_TTWU); if (task_cpu(p) != cpu) { + if (p->in_iowait) { + delayacct_blkio_end(p); + atomic_dec(&task_rq(p)->nr_iowait); + } + wake_flags |= WF_MIGRATED; psi_ttwu_dequeue(p); set_task_cpu(p, cpu); @@ -3098,6 +3578,7 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p) init_numa_balancing(clone_flags, p); #ifdef CONFIG_SMP p->wake_entry.u_flags = CSD_TYPE_TTWU; + p->migration_pending = NULL; #endif } @@ -3344,7 +3825,7 @@ void wake_up_new_task(struct task_struct *p) */ p->recent_used_cpu = task_cpu(p); rseq_migrate(p); - __set_task_cpu(p, select_task_rq(p, task_cpu(p), SD_BALANCE_FORK, 0)); + __set_task_cpu(p, select_task_rq(p, task_cpu(p), WF_FORK)); #endif rq = __task_rq_lock(p, &rf); update_rq_clock(rq); @@ -3356,7 +3837,7 @@ void wake_up_new_task(struct task_struct *p) #ifdef CONFIG_SMP if (p->sched_class->task_woken) { /* - * Nothing relies on rq->lock after this, so its fine to + * Nothing relies on rq->lock after this, so it's fine to * drop it. */ rq_unpin_lock(rq, &rf); @@ -3485,6 +3966,90 @@ static inline void finish_task(struct task_struct *prev) #endif } +#ifdef CONFIG_SMP + +static void do_balance_callbacks(struct rq *rq, struct callback_head *head) +{ + void (*func)(struct rq *rq); + struct callback_head *next; + + lockdep_assert_held(&rq->lock); + + while (head) { + func = (void (*)(struct rq *))head->func; + next = head->next; + head->next = NULL; + head = next; + + func(rq); + } +} + +static inline struct callback_head *splice_balance_callbacks(struct rq *rq) +{ + struct callback_head *head = rq->balance_callback; + + lockdep_assert_held(&rq->lock); + if (head) { + rq->balance_callback = NULL; + rq->balance_flags &= ~BALANCE_WORK; + } + + return head; +} + +static void __balance_callbacks(struct rq *rq) +{ + do_balance_callbacks(rq, splice_balance_callbacks(rq)); +} + +static inline void balance_callbacks(struct rq *rq, struct callback_head *head) +{ + unsigned long flags; + + if (unlikely(head)) { + raw_spin_lock_irqsave(&rq->lock, flags); + do_balance_callbacks(rq, head); + raw_spin_unlock_irqrestore(&rq->lock, flags); + } +} + +static void balance_push(struct rq *rq); + +static inline void balance_switch(struct rq *rq) +{ + if (likely(!rq->balance_flags)) + return; + + if (rq->balance_flags & BALANCE_PUSH) { + balance_push(rq); + return; + } + + __balance_callbacks(rq); +} + +#else + +static inline void __balance_callbacks(struct rq *rq) +{ +} + +static inline struct callback_head *splice_balance_callbacks(struct rq *rq) +{ + return NULL; +} + +static inline void balance_callbacks(struct rq *rq, struct callback_head *head) +{ +} + +static inline void balance_switch(struct rq *rq) +{ +} + +#endif + static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next, struct rq_flags *rf) { @@ -3510,6 +4075,7 @@ static inline void finish_lock_switch(struct rq *rq) * prev into current: */ spin_acquire(&rq->lock.dep_map, 0, 0, _THIS_IP_); + balance_switch(rq); raw_spin_unlock_irq(&rq->lock); } @@ -3525,6 +4091,22 @@ static inline void finish_lock_switch(struct rq *rq) # define finish_arch_post_lock_switch() do { } while (0) #endif +static inline void kmap_local_sched_out(void) +{ +#ifdef CONFIG_KMAP_LOCAL + if 
(unlikely(current->kmap_ctrl.idx)) + __kmap_local_sched_out(); +#endif +} + +static inline void kmap_local_sched_in(void) +{ +#ifdef CONFIG_KMAP_LOCAL + if (unlikely(current->kmap_ctrl.idx)) + __kmap_local_sched_in(); +#endif +} + /** * prepare_task_switch - prepare to switch tasks * @rq: the runqueue preparing to switch @@ -3547,6 +4129,7 @@ prepare_task_switch(struct rq *rq, struct task_struct *prev, perf_event_task_sched_out(prev, next); rseq_preempt(prev); fire_sched_out_preempt_notifiers(prev, next); + kmap_local_sched_out(); prepare_task(next); prepare_arch_switch(next); } @@ -3613,6 +4196,14 @@ static struct rq *finish_task_switch(struct task_struct *prev) finish_lock_switch(rq); finish_arch_post_lock_switch(); kcov_finish_switch(current); + /* + * kmap_local_sched_out() is invoked with rq::lock held and + * interrupts disabled. There is no requirement for that, but the + * sched out code does not have an interrupt enabled section. + * Restoring the maps on sched in does not require interrupts being + * disabled either. + */ + kmap_local_sched_in(); fire_sched_in_preempt_notifiers(current); /* @@ -3651,43 +4242,6 @@ static struct rq *finish_task_switch(struct task_struct *prev) return rq; } -#ifdef CONFIG_SMP - -/* rq->lock is NOT held, but preemption is disabled */ -static void __balance_callback(struct rq *rq) -{ - struct callback_head *head, *next; - void (*func)(struct rq *rq); - unsigned long flags; - - raw_spin_lock_irqsave(&rq->lock, flags); - head = rq->balance_callback; - rq->balance_callback = NULL; - while (head) { - func = (void (*)(struct rq *))head->func; - next = head->next; - head->next = NULL; - head = next; - - func(rq); - } - raw_spin_unlock_irqrestore(&rq->lock, flags); -} - -static inline void balance_callback(struct rq *rq) -{ - if (unlikely(rq->balance_callback)) - __balance_callback(rq); -} - -#else - -static inline void balance_callback(struct rq *rq) -{ -} - -#endif - /** * schedule_tail - first thing a freshly forked thread must call. * @prev: the thread we just switched away from. @@ -3707,7 +4261,6 @@ asmlinkage __visible void schedule_tail(struct task_struct *prev) */ rq = finish_task_switch(prev); - balance_callback(rq); preempt_enable(); if (current->set_child_tid) @@ -3836,7 +4389,7 @@ unsigned long nr_iowait_cpu(int cpu) } /* - * IO-wait accounting, and how its mostly bollocks (on SMP). + * IO-wait accounting, and how it's mostly bollocks (on SMP). * * The idea behind IO-wait account is to account the idle time that we could * have spend running if it were not for IO. 
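
Before the IO-wait comment continues: the nr_iowait handling moved around in the hunks above only ever touches the runqueue the task went to sleep on. A toy model of that bookkeeping (hypothetical types, not the kernel's):

    #include <stdatomic.h>

    /* Toy runqueue: one iowait counter per CPU. */
    struct toy_rq { atomic_int nr_iowait; };
    static struct toy_rq rq[4];

    /* A task blocking on IO charges the CPU it sleeps on... */
    static void toy_io_sleep(int cpu)
    {
        atomic_fetch_add(&rq[cpu].nr_iowait, 1);
    }

    /*
     * ...but the wakeup may place it on another CPU, so the decrement
     * must hit the rq the task slept on. That is why the patch issues
     * atomic_dec(&task_rq(p)->nr_iowait) before set_task_cpu() changes
     * the task's rq, and why a per-CPU nr_iowait reading fluctuates as
     * sleepers migrate -- the point of the comment that follows.
     */
    static void toy_io_wake(int slept_on)
    {
        atomic_fetch_sub(&rq[slept_on].nr_iowait, 1);
    }
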
That is, if we were to improve the @@ -3888,7 +4441,7 @@ void sched_exec(void) int dest_cpu; raw_spin_lock_irqsave(&p->pi_lock, flags); - dest_cpu = p->sched_class->select_task_rq(p, task_cpu(p), SD_BALANCE_EXEC, 0); + dest_cpu = p->sched_class->select_task_rq(p, task_cpu(p), WF_EXEC); if (dest_cpu == smp_processor_id()) goto unlock; @@ -4291,6 +4844,7 @@ static inline void schedule_debug(struct task_struct *prev, bool preempt) preempt_count_set(PREEMPT_DISABLED); } rcu_sleep_check(); + SCHED_WARN_ON(ct_state() == CONTEXT_USER); profile_hit(SCHED_PROFILING, __builtin_return_address(0)); @@ -4331,7 +4885,7 @@ pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) /* * Optimization: we know that if all tasks are in the fair class we can * call that function directly, but only if the @prev task wasn't of a - * higher scheduling class, because otherwise those loose the + * higher scheduling class, because otherwise those lose the * opportunity to pull in more work from other CPUs. */ if (likely(prev->sched_class <= &fair_sched_class && @@ -4515,6 +5069,7 @@ static void __sched notrace __schedule(bool preempt) */ ++*switch_count; + migrate_disable_switch(rq, prev); psi_sched_switch(prev, next, !task_on_rq_queued(prev)); trace_sched_switch(preempt, prev, next); @@ -4523,10 +5078,11 @@ static void __sched notrace __schedule(bool preempt) rq = context_switch(rq, prev, next, &rf); } else { rq->clock_update_flags &= ~(RQCF_ACT_SKIP|RQCF_REQ_SKIP); - rq_unlock_irq(rq, &rf); - } - balance_callback(rq); + rq_unpin_lock(rq, &rf); + __balance_callbacks(rq); + raw_spin_unlock_irq(&rq->lock); + } } void __noreturn do_task_dead(void) @@ -4630,7 +5186,7 @@ void __sched schedule_idle(void) } while (need_resched()); } -#ifdef CONFIG_CONTEXT_TRACKING +#if defined(CONFIG_CONTEXT_TRACKING) && !defined(CONFIG_HAVE_CONTEXT_TRACKING_OFFSTACK) asmlinkage __visible void __sched schedule_user(void) { /* @@ -4852,7 +5408,7 @@ void rt_mutex_setprio(struct task_struct *p, struct task_struct *pi_task) * right. rt_mutex_slowunlock()+rt_mutex_postunlock() work together to * ensure a task is de-boosted (pi_task is set to NULL) before the * task is allowed to run again (and can exit). This ensures the pointer - * points to a blocked task -- which guaratees the task is present. + * points to a blocked task -- which guarantees the task is present. 
*/ p->pi_top_task = pi_task; @@ -4907,20 +5463,21 @@ void rt_mutex_setprio(struct task_struct *p, struct task_struct *pi_task) if (!dl_prio(p->normal_prio) || (pi_task && dl_prio(pi_task->prio) && dl_entity_preempt(&pi_task->dl, &p->dl))) { - p->dl.dl_boosted = 1; + p->dl.pi_se = pi_task->dl.pi_se; queue_flag |= ENQUEUE_REPLENISH; - } else - p->dl.dl_boosted = 0; + } else { + p->dl.pi_se = &p->dl; + } p->sched_class = &dl_sched_class; } else if (rt_prio(prio)) { if (dl_prio(oldprio)) - p->dl.dl_boosted = 0; + p->dl.pi_se = &p->dl; if (oldprio < prio) queue_flag |= ENQUEUE_HEAD; p->sched_class = &rt_sched_class; } else { if (dl_prio(oldprio)) - p->dl.dl_boosted = 0; + p->dl.pi_se = &p->dl; if (rt_prio(oldprio)) p->rt.timeout = 0; p->sched_class = &fair_sched_class; @@ -4937,9 +5494,11 @@ void rt_mutex_setprio(struct task_struct *p, struct task_struct *pi_task) out_unlock: /* Avoid rq from going away on us: */ preempt_disable(); - __task_rq_unlock(rq, &rf); - balance_callback(rq); + rq_unpin_lock(rq, &rf); + __balance_callbacks(rq); + raw_spin_unlock(&rq->lock); + preempt_enable(); } #else @@ -4968,7 +5527,7 @@ void set_user_nice(struct task_struct *p, long nice) /* * The RT priorities are set via sched_setscheduler(), but we still * allow the 'normal' nice value to be set - but as expected - * it wont have any effect on scheduling until the task is + * it won't have any effect on scheduling until the task is * SCHED_DEADLINE, SCHED_FIFO or SCHED_RR: */ if (task_has_dl_policy(p) || task_has_rt_policy(p)) { @@ -5213,6 +5772,7 @@ static int __sched_setscheduler(struct task_struct *p, int retval, oldprio, oldpolicy = -1, queued, running; int new_effective_prio, policy = attr->sched_policy; const struct sched_class *prev_class; + struct callback_head *head; struct rq_flags rf; int reset_on_fork; int queue_flags = DEQUEUE_SAVE | DEQUEUE_MOVE | DEQUEUE_NOCLOCK; @@ -5451,6 +6011,7 @@ change: /* Avoid rq from going away on us: */ preempt_disable(); + head = splice_balance_callbacks(rq); task_rq_unlock(rq, p, &rf); if (pi) { @@ -5459,7 +6020,7 @@ change: } /* Run balance callbacks after we've adjusted the PI chain: */ - balance_callback(rq); + balance_callbacks(rq, head); preempt_enable(); return 0; @@ -5954,7 +6515,7 @@ long sched_setaffinity(pid_t pid, const struct cpumask *in_mask) } #endif again: - retval = __set_cpus_allowed_ptr(p, new_mask, true); + retval = __set_cpus_allowed_ptr(p, new_mask, SCA_CHECK); if (!retval) { cpuset_cpus_allowed(p, cpus_allowed); @@ -6076,14 +6637,6 @@ SYSCALL_DEFINE3(sched_getaffinity, pid_t, pid, unsigned int, len, return ret; } -/** - * sys_sched_yield - yield the current processor to other threads. - * - * This function yields the current CPU to other tasks. If there are no - * other threads running on this CPU then this function will return. - * - * Return: 0. - */ static void do_sched_yield(void) { struct rq_flags rf; @@ -6094,17 +6647,21 @@ static void do_sched_yield(void) schedstat_inc(rq->yld_count); current->sched_class->yield_task(rq); - /* - * Since we are going to call schedule() anyway, there's - * no need to preempt or enable interrupts: - */ preempt_disable(); - rq_unlock(rq, &rf); + rq_unlock_irq(rq, &rf); sched_preempt_enable_no_resched(); schedule(); } +/** + * sys_sched_yield - yield the current processor to other threads. + * + * This function yields the current CPU to other tasks. If there are no + * other threads running on this CPU then this function will return. + * + * Return: 0. 
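
In the __sched_setscheduler() hunk above, pending balance work is detached with splice_balance_callbacks() while the rq lock is still held and replayed with balance_callbacks() only after the PI chain has been adjusted. A minimal userspace sketch of that detach-under-lock, run-after-unlock pattern (pthreads stand in for rq->lock; names are hypothetical):

    #include <pthread.h>
    #include <stddef.h>

    struct cb { struct cb *next; void (*func)(void *); void *arg; };

    static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
    static struct cb *pending;      /* guarded by 'lock' */

    /* Detach the whole list while the lock is held. */
    static struct cb *splice_callbacks(void)
    {
        struct cb *head = pending;

        pending = NULL;
        return head;
    }

    void change_attributes(void)
    {
        struct cb *head, *next;

        pthread_mutex_lock(&lock);
        /* ... state changes may queue work onto 'pending' ... */
        head = splice_callbacks();
        pthread_mutex_unlock(&lock);

        /* Run the work without 'lock' held: callbacks are then free
         * to take other locks without risking lock recursion. */
        while (head) {
            next = head->next;
            head->func(head->arg);
            head = next;
        }
    }
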
+ */ SYSCALL_DEFINE0(sched_yield) { do_sched_yield(); @@ -6159,7 +6716,7 @@ EXPORT_SYMBOL(__cond_resched_lock); * * The scheduler is at all times free to pick the calling task as the most * eligible task to run, if removing the yield() call from your code breaks - * it, its already broken. + * it, it's already broken. * * Typical broken usage is: * @@ -6447,6 +7004,7 @@ void sched_show_task(struct task_struct *p) (unsigned long)task_thread_info(p)->flags); print_worker_info(KERN_INFO, p); + print_stop_info(KERN_INFO, p); show_stack(p, NULL, KERN_INFO); put_task_stack(p); } @@ -6532,12 +7090,12 @@ void init_idle(struct task_struct *idle, int cpu) #ifdef CONFIG_SMP /* - * Its possible that init_idle() gets called multiple times on a task, + * It's possible that init_idle() gets called multiple times on a task, * in that case do_set_cpus_allowed() will not do the right thing. * * And since this is boot we can forgo the serialization. */ - set_cpus_allowed_common(idle, cpumask_of(cpu)); + set_cpus_allowed_common(idle, cpumask_of(cpu), 0); #endif /* * We're having a chicken and egg problem, even though we are @@ -6688,119 +7246,126 @@ void idle_task_exit(void) /* finish_cpu(), as ran on the BP, will clean up the active_mm state */ } -/* - * Since this CPU is going 'away' for a while, fold any nr_active delta - * we might have. Assumes we're called after migrate_tasks() so that the - * nr_active count is stable. We need to take the teardown thread which - * is calling this into account, so we hand in adjust = 1 to the load - * calculation. - * - * Also see the comment "Global load-average calculations". - */ -static void calc_load_migrate(struct rq *rq) +static int __balance_push_cpu_stop(void *arg) { - long delta = calc_load_fold_active(rq, 1); - if (delta) - atomic_long_add(delta, &calc_load_tasks); -} + struct task_struct *p = arg; + struct rq *rq = this_rq(); + struct rq_flags rf; + int cpu; -static struct task_struct *__pick_migrate_task(struct rq *rq) -{ - const struct sched_class *class; - struct task_struct *next; + raw_spin_lock_irq(&p->pi_lock); + rq_lock(rq, &rf); - for_each_class(class) { - next = class->pick_next_task(rq); - if (next) { - next->sched_class->put_prev_task(rq, next); - return next; - } + update_rq_clock(rq); + + if (task_rq(p) == rq && task_on_rq_queued(p)) { + cpu = select_fallback_rq(rq->cpu, p); + rq = __migrate_task(rq, &rf, p, cpu); } - /* The idle class should always have a runnable task */ - BUG(); + rq_unlock(rq, &rf); + raw_spin_unlock_irq(&p->pi_lock); + + put_task_struct(p); + + return 0; } +static DEFINE_PER_CPU(struct cpu_stop_work, push_work); + /* - * Migrate all tasks from the rq, sleeping tasks will be migrated by - * try_to_wake_up()->select_task_rq(). - * - * Called with rq->lock held even though we'er in stop_machine() and - * there's no concurrency possible, we hold the required locks anyway - * because of lock validation efforts. + * Ensure we only run per-cpu kthreads once the CPU goes !active. */ -static void migrate_tasks(struct rq *dead_rq, struct rq_flags *rf) +static void balance_push(struct rq *rq) { - struct rq *rq = dead_rq; - struct task_struct *next, *stop = rq->stop; - struct rq_flags orf = *rf; - int dest_cpu; + struct task_struct *push_task = rq->curr; + + lockdep_assert_held(&rq->lock); + SCHED_WARN_ON(rq->cpu != smp_processor_id()); /* - * Fudge the rq selection such that the below task selection loop - * doesn't get stuck on the currently eligible stop task. 
- * - * We're currently inside stop_machine() and the rq is either stuck - * in the stop_machine_cpu_stop() loop, or we're executing this code, - * either way we should never end up calling schedule() until we're - * done here. + * Both the cpu-hotplug and stop task are in this case and are + * required to complete the hotplug process. */ - rq->stop = NULL; + if (is_per_cpu_kthread(push_task) || is_migration_disabled(push_task)) { + /* + * If this is the idle task on the outgoing CPU try to wake + * up the hotplug control thread which might wait for the + * last task to vanish. The rcuwait_active() check is + * accurate here because the waiter is pinned on this CPU + * and can't obviously be running in parallel. + * + * On RT kernels this also has to check whether there are + * pinned and scheduled out tasks on the runqueue. They + * need to leave the migrate disabled section first. + */ + if (!rq->nr_running && !rq_has_pinned_tasks(rq) && + rcuwait_active(&rq->hotplug_wait)) { + raw_spin_unlock(&rq->lock); + rcuwait_wake_up(&rq->hotplug_wait); + raw_spin_lock(&rq->lock); + } + return; + } + get_task_struct(push_task); /* - * put_prev_task() and pick_next_task() sched - * class method both need to have an up-to-date - * value of rq->clock[_task] + * Temporarily drop rq->lock such that we can wake-up the stop task. + * Both preemption and IRQs are still disabled. */ - update_rq_clock(rq); + raw_spin_unlock(&rq->lock); + stop_one_cpu_nowait(rq->cpu, __balance_push_cpu_stop, push_task, + this_cpu_ptr(&push_work)); + /* + * At this point need_resched() is true and we'll take the loop in + * schedule(). The next pick is obviously going to be the stop task + * which is_per_cpu_kthread() and will push this task away. + */ + raw_spin_lock(&rq->lock); +} - for (;;) { - /* - * There's this thread running, bail when that's the only - * remaining thread: - */ - if (rq->nr_running == 1) - break; +static void balance_push_set(int cpu, bool on) +{ + struct rq *rq = cpu_rq(cpu); + struct rq_flags rf; - next = __pick_migrate_task(rq); + rq_lock_irqsave(rq, &rf); + if (on) + rq->balance_flags |= BALANCE_PUSH; + else + rq->balance_flags &= ~BALANCE_PUSH; + rq_unlock_irqrestore(rq, &rf); +} - /* - * Rules for changing task_struct::cpus_mask are holding - * both pi_lock and rq->lock, such that holding either - * stabilizes the mask. - * - * Drop rq->lock is not quite as disastrous as it usually is - * because !cpu_active at this point, which means load-balance - * will not interfere. Also, stop-machine. - */ - rq_unlock(rq, rf); - raw_spin_lock(&next->pi_lock); - rq_relock(rq, rf); +/* + * Invoked from a CPUs hotplug control thread after the CPU has been marked + * inactive. All tasks which are not per CPU kernel threads are either + * pushed off this CPU now via balance_push() or placed on a different CPU + * during wakeup. Wait until the CPU is quiescent. + */ +static void balance_hotplug_wait(void) +{ + struct rq *rq = this_rq(); - /* - * Since we're inside stop-machine, _nothing_ should have - * changed the task, WARN if weird stuff happened, because in - * that case the above rq->lock drop is a fail too. - */ - if (WARN_ON(task_rq(next) != rq || !task_on_rq_queued(next))) { - raw_spin_unlock(&next->pi_lock); - continue; - } + rcuwait_wait_event(&rq->hotplug_wait, + rq->nr_running == 1 && !rq_has_pinned_tasks(rq), + TASK_UNINTERRUPTIBLE); +} - /* Find suitable destination for @next, with force if needed. 
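
balance_hotplug_wait() above parks the hotplug control thread until the outgoing CPU has only that thread left runnable. A condition-variable analog of the rcuwait handshake (a sketch of the idea only, not the kernel mechanism):

    #include <pthread.h>

    static pthread_mutex_t m = PTHREAD_MUTEX_INITIALIZER;
    static pthread_cond_t quiesced = PTHREAD_COND_INITIALIZER;
    static int nr_running = 5;  /* stands in for rq->nr_running */

    /* Called whenever balance_push() manages to evict a task. */
    void task_pushed_away(void)
    {
        pthread_mutex_lock(&m);
        if (--nr_running == 1)  /* only the waiter itself remains */
            pthread_cond_signal(&quiesced);
        pthread_mutex_unlock(&m);
    }

    /* The hotplug control thread's side of the handshake. */
    void hotplug_wait_analog(void)
    {
        pthread_mutex_lock(&m);
        while (nr_running != 1)
            pthread_cond_wait(&quiesced, &m);
        pthread_mutex_unlock(&m);
    }
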
*/ - dest_cpu = select_fallback_rq(dead_rq->cpu, next); - rq = __migrate_task(rq, rf, next, dest_cpu); - if (rq != dead_rq) { - rq_unlock(rq, rf); - rq = dead_rq; - *rf = orf; - rq_relock(rq, rf); - } - raw_spin_unlock(&next->pi_lock); - } +#else + +static inline void balance_push(struct rq *rq) +{ +} + +static inline void balance_push_set(int cpu, bool on) +{ +} - rq->stop = stop; +static inline void balance_hotplug_wait(void) +{ } + #endif /* CONFIG_HOTPLUG_CPU */ void set_rq_online(struct rq *rq) @@ -6886,6 +7451,8 @@ int sched_cpu_activate(unsigned int cpu) struct rq *rq = cpu_rq(cpu); struct rq_flags rf; + balance_push_set(cpu, false); + #ifdef CONFIG_SCHED_SMT /* * When going up, increment the number of cores with SMT present. @@ -6921,6 +7488,8 @@ int sched_cpu_activate(unsigned int cpu) int sched_cpu_deactivate(unsigned int cpu) { + struct rq *rq = cpu_rq(cpu); + struct rq_flags rf; int ret; set_cpu_active(cpu, false); @@ -6933,6 +7502,16 @@ int sched_cpu_deactivate(unsigned int cpu) */ synchronize_rcu(); + balance_push_set(cpu, true); + + rq_lock_irqsave(rq, &rf); + if (rq->rd) { + update_rq_clock(rq); + BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span)); + set_rq_offline(rq); + } + rq_unlock_irqrestore(rq, &rf); + #ifdef CONFIG_SCHED_SMT /* * When going down, decrement the number of cores with SMT present. @@ -6946,6 +7525,7 @@ int sched_cpu_deactivate(unsigned int cpu) ret = cpuset_cpu_inactive(cpu); if (ret) { + balance_push_set(cpu, false); set_cpu_active(cpu, true); return ret; } @@ -6969,6 +7549,41 @@ int sched_cpu_starting(unsigned int cpu) } #ifdef CONFIG_HOTPLUG_CPU + +/* + * Invoked immediately before the stopper thread is invoked to bring the + * CPU down completely. At this point all per CPU kthreads except the + * hotplug thread (current) and the stopper thread (inactive) have been + * either parked or have been unbound from the outgoing CPU. Ensure that + * any of those which might be on the way out are gone. + * + * If after this point a bound task is being woken on this CPU then the + * responsible hotplug callback has failed to do its job. + * sched_cpu_dying() will catch it with the appropriate fireworks. + */ +int sched_cpu_wait_empty(unsigned int cpu) +{ + balance_hotplug_wait(); + return 0; +} + +/* + * Since this CPU is going 'away' for a while, fold any nr_active delta we + * might have. Called from the CPU stopper task after ensuring that the + * stopper is the last running task on the CPU, so nr_active count is + * stable. We need to take the teardown thread which is calling this into + * account, so we hand in adjust = 1 to the load calculation. + * + * Also see the comment "Global load-average calculations". 
+ */ +static void calc_load_migrate(struct rq *rq) +{ + long delta = calc_load_fold_active(rq, 1); + + if (delta) + atomic_long_add(delta, &calc_load_tasks); +} + int sched_cpu_dying(unsigned int cpu) { struct rq *rq = cpu_rq(cpu); @@ -6978,12 +7593,7 @@ int sched_cpu_dying(unsigned int cpu) sched_tick_stop(cpu); rq_lock_irqsave(rq, &rf); - if (rq->rd) { - BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span)); - set_rq_offline(rq); - } - migrate_tasks(rq, &rf); - BUG_ON(rq->nr_running != 1); + BUG_ON(rq->nr_running != 1 || rq_has_pinned_tasks(rq)); rq_unlock_irqrestore(rq, &rf); calc_load_migrate(rq); @@ -7188,7 +7798,10 @@ void __init sched_init(void) rq->last_blocked_load_update_tick = jiffies; atomic_set(&rq->nohz_flags, 0); - rq_csd_init(rq, &rq->nohz_csd, nohz_csd_func); + INIT_CSD(&rq->nohz_csd, nohz_csd_func, rq); +#endif +#ifdef CONFIG_HOTPLUG_CPU + rcuwait_init(&rq->hotplug_wait); #endif #endif /* CONFIG_SMP */ hrtick_rq_init(rq); @@ -7327,6 +7940,39 @@ void __cant_sleep(const char *file, int line, int preempt_offset) add_taint(TAINT_WARN, LOCKDEP_STILL_OK); } EXPORT_SYMBOL_GPL(__cant_sleep); + +#ifdef CONFIG_SMP +void __cant_migrate(const char *file, int line) +{ + static unsigned long prev_jiffy; + + if (irqs_disabled()) + return; + + if (is_migration_disabled(current)) + return; + + if (!IS_ENABLED(CONFIG_PREEMPT_COUNT)) + return; + + if (preempt_count() > 0) + return; + + if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy) + return; + prev_jiffy = jiffies; + + pr_err("BUG: assuming non migratable context at %s:%d\n", file, line); + pr_err("in_atomic(): %d, irqs_disabled(): %d, migration_disabled() %u pid: %d, name: %s\n", + in_atomic(), irqs_disabled(), is_migration_disabled(current), + current->pid, current->comm); + + debug_show_held_locks(current); + dump_stack(); + add_taint(TAINT_WARN, LOCKDEP_STILL_OK); +} +EXPORT_SYMBOL_GPL(__cant_migrate); +#endif #endif #ifdef CONFIG_MAGIC_SYSRQ @@ -7660,7 +8306,7 @@ static int cpu_cgroup_can_attach(struct cgroup_taskset *tset) return -EINVAL; #endif /* - * Serialize against wake_up_new_task() such that if its + * Serialize against wake_up_new_task() such that if it's * running, we're sure to observe its full state. 
*/ raw_spin_lock_irq(&task->pi_lock); diff --git a/kernel/sched/cpudeadline.c b/kernel/sched/cpudeadline.c index 8cb06c8c7eb1..ceb03d76c0cc 100644 --- a/kernel/sched/cpudeadline.c +++ b/kernel/sched/cpudeadline.c @@ -120,7 +120,7 @@ int cpudl_find(struct cpudl *cp, struct task_struct *p, const struct sched_dl_entity *dl_se = &p->dl; if (later_mask && - cpumask_and(later_mask, cp->free_cpus, p->cpus_ptr)) { + cpumask_and(later_mask, cp->free_cpus, &p->cpus_mask)) { unsigned long cap, max_cap = 0; int cpu, max_cpu = -1; @@ -151,7 +151,7 @@ int cpudl_find(struct cpudl *cp, struct task_struct *p, WARN_ON(best_cpu != -1 && !cpu_present(best_cpu)); - if (cpumask_test_cpu(best_cpu, p->cpus_ptr) && + if (cpumask_test_cpu(best_cpu, &p->cpus_mask) && dl_time_before(dl_se->deadline, cp->elements[0].dl)) { if (later_mask) cpumask_set_cpu(best_cpu, later_mask); diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c index e254745a82cb..b0ad37bf95ee 100644 --- a/kernel/sched/cpufreq_schedutil.c +++ b/kernel/sched/cpufreq_schedutil.c @@ -102,7 +102,9 @@ static bool sugov_should_update_freq(struct sugov_policy *sg_policy, u64 time) static bool sugov_update_next_freq(struct sugov_policy *sg_policy, u64 time, unsigned int next_freq) { - if (sg_policy->next_freq == next_freq) + if (sg_policy->need_freq_update) + sg_policy->need_freq_update = cpufreq_driver_test_flags(CPUFREQ_NEED_UPDATE_LIMITS); + else if (sg_policy->next_freq == next_freq) return false; sg_policy->next_freq = next_freq; @@ -164,7 +166,6 @@ static unsigned int get_next_freq(struct sugov_policy *sg_policy, if (freq == sg_policy->cached_raw_freq && !sg_policy->need_freq_update) return sg_policy->next_freq; - sg_policy->need_freq_update = false; sg_policy->cached_raw_freq = freq; return cpufreq_driver_resolve_freq(policy, freq); } @@ -440,7 +441,6 @@ static void sugov_update_single(struct update_util_data *hook, u64 time, struct sugov_policy *sg_policy = sg_cpu->sg_policy; unsigned long util, max; unsigned int next_f; - bool busy; unsigned int cached_freq = sg_policy->cached_raw_freq; sugov_iowait_boost(sg_cpu, time, flags); @@ -451,9 +451,6 @@ static void sugov_update_single(struct update_util_data *hook, u64 time, if (!sugov_should_update_freq(sg_policy, time)) return; - /* Limits may have changed, don't skip frequency update */ - busy = !sg_policy->need_freq_update && sugov_cpu_is_busy(sg_cpu); - util = sugov_get_util(sg_cpu); max = sg_cpu->max; util = sugov_iowait_apply(sg_cpu, time, util, max); @@ -462,7 +459,7 @@ static void sugov_update_single(struct update_util_data *hook, u64 time, * Do not reduce the frequency if the CPU has not been idle * recently, as the reduction is likely to be premature then. 
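
The sugov_update_next_freq() hunk above inverts the old skip logic: while need_freq_update is set an unchanged frequency is no longer skipped, and the flag stays raised afterwards only for drivers that declare CPUFREQ_NEED_UPDATE_LIMITS. Restated as a standalone sketch (toy types, illustrative):

    struct toy_sg_policy {
        unsigned int next_freq;
        int need_freq_update;           /* set when limits change */
        int driver_needs_limit_updates; /* CPUFREQ_NEED_UPDATE_LIMITS */
    };

    /* Returns 1 when the driver should actually be poked. */
    static int toy_update_next_freq(struct toy_sg_policy *p,
                                    unsigned int next_freq)
    {
        if (p->need_freq_update)
            /* Limits changed: never skip this update; keep the flag
             * only for drivers wanting every limits change relayed. */
            p->need_freq_update = p->driver_needs_limit_updates;
        else if (p->next_freq == next_freq)
            return 0;   /* unchanged and no limits change: skip */

        p->next_freq = next_freq;
        return 1;
    }
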
*/ - if (busy && next_f < sg_policy->next_freq) { + if (sugov_cpu_is_busy(sg_cpu) && next_f < sg_policy->next_freq) { next_f = sg_policy->next_freq; /* Restore cached freq as next_freq has changed */ @@ -827,9 +824,10 @@ static int sugov_start(struct cpufreq_policy *policy) sg_policy->next_freq = 0; sg_policy->work_in_progress = false; sg_policy->limits_changed = false; - sg_policy->need_freq_update = false; sg_policy->cached_raw_freq = 0; + sg_policy->need_freq_update = cpufreq_driver_test_flags(CPUFREQ_NEED_UPDATE_LIMITS); + for_each_cpu(cpu, policy->cpus) { struct sugov_cpu *sg_cpu = &per_cpu(sugov_cpu, cpu); @@ -881,7 +879,7 @@ static void sugov_limits(struct cpufreq_policy *policy) struct cpufreq_governor schedutil_gov = { .name = "schedutil", .owner = THIS_MODULE, - .dynamic_switching = true, + .flags = CPUFREQ_GOV_DYNAMIC_SWITCHING, .init = sugov_init, .exit = sugov_exit, .start = sugov_start, @@ -899,16 +897,9 @@ struct cpufreq_governor *cpufreq_default_governor(void) cpufreq_governor_init(schedutil_gov); #ifdef CONFIG_ENERGY_MODEL -extern bool sched_energy_update; -extern struct mutex sched_energy_mutex; - static void rebuild_sd_workfn(struct work_struct *work) { - mutex_lock(&sched_energy_mutex); - sched_energy_update = true; - rebuild_sched_domains(); - sched_energy_update = false; - mutex_unlock(&sched_energy_mutex); + rebuild_sched_domains_energy(); } static DECLARE_WORK(rebuild_sd_work, rebuild_sd_workfn); diff --git a/kernel/sched/cpupri.c b/kernel/sched/cpupri.c index 0033731a0797..ec9be789c7e2 100644 --- a/kernel/sched/cpupri.c +++ b/kernel/sched/cpupri.c @@ -11,7 +11,7 @@ * This code tracks the priority of each CPU so that global migration * decisions are easy to calculate. Each CPU can be in a state as follows: * - * (INVALID), IDLE, NORMAL, RT1, ... RT99 + * (INVALID), NORMAL, RT1, ... RT99, HIGHER * * going from the lowest priority to the highest. CPUs in the INVALID state * are not eligible for routing. The system maintains this state with @@ -19,24 +19,48 @@ * in that class). Therefore a typical application without affinity * restrictions can find a suitable CPU with O(1) complexity (e.g. two bit * searches). For tasks with affinity restrictions, the algorithm has a - * worst case complexity of O(min(102, nr_domcpus)), though the scenario that + * worst case complexity of O(min(101, nr_domcpus)), though the scenario that * yields the worst case search is fairly contrived. */ #include "sched.h" -/* Convert between a 140 based task->prio, and our 102 based cpupri */ +/* + * p->rt_priority p->prio newpri cpupri + * + * -1 -1 (CPUPRI_INVALID) + * + * 99 0 (CPUPRI_NORMAL) + * + * 1 98 98 1 + * ... + * 49 50 50 49 + * 50 49 49 50 + * ... + * 99 0 0 99 + * + * 100 100 (CPUPRI_HIGHER) + */ static int convert_prio(int prio) { int cpupri; - if (prio == CPUPRI_INVALID) - cpupri = CPUPRI_INVALID; - else if (prio == MAX_PRIO) - cpupri = CPUPRI_IDLE; - else if (prio >= MAX_RT_PRIO) - cpupri = CPUPRI_NORMAL; - else - cpupri = MAX_RT_PRIO - prio + 1; + switch (prio) { + case CPUPRI_INVALID: + cpupri = CPUPRI_INVALID; /* -1 */ + break; + + case 0 ... 98: + cpupri = MAX_RT_PRIO-1 - prio; /* 1 ... 
99 */ + break; + + case MAX_RT_PRIO-1: + cpupri = CPUPRI_NORMAL; /* 0 */ + break; + + case MAX_RT_PRIO: + cpupri = CPUPRI_HIGHER; /* 100 */ + break; + } return cpupri; } @@ -73,11 +97,11 @@ static inline int __cpupri_find(struct cpupri *cp, struct task_struct *p, if (skip) return 0; - if (cpumask_any_and(p->cpus_ptr, vec->mask) >= nr_cpu_ids) + if (cpumask_any_and(&p->cpus_mask, vec->mask) >= nr_cpu_ids) return 0; if (lowest_mask) { - cpumask_and(lowest_mask, p->cpus_ptr, vec->mask); + cpumask_and(lowest_mask, &p->cpus_mask, vec->mask); /* * We have to ensure that we have at least one bit @@ -177,7 +201,7 @@ int cpupri_find_fitness(struct cpupri *cp, struct task_struct *p, * cpupri_set - update the CPU priority setting * @cp: The cpupri context * @cpu: The target CPU - * @newpri: The priority (INVALID-RT99) to assign to this CPU + * @newpri: The priority (INVALID,NORMAL,RT1-RT99,HIGHER) to assign to this CPU * * Note: Assumes cpu_rq(cpu)->lock is locked * diff --git a/kernel/sched/cpupri.h b/kernel/sched/cpupri.h index efbb492bb94c..d6cba0020064 100644 --- a/kernel/sched/cpupri.h +++ b/kernel/sched/cpupri.h @@ -1,11 +1,11 @@ /* SPDX-License-Identifier: GPL-2.0 */ -#define CPUPRI_NR_PRIORITIES (MAX_RT_PRIO + 2) +#define CPUPRI_NR_PRIORITIES (MAX_RT_PRIO+1) #define CPUPRI_INVALID -1 -#define CPUPRI_IDLE 0 -#define CPUPRI_NORMAL 1 -/* values 2-101 are RT priorities 0-99 */ +#define CPUPRI_NORMAL 0 +/* values 1-99 are for RT1-RT99 priorities */ +#define CPUPRI_HIGHER 100 struct cpupri_vec { atomic_t count; diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c index 5a55d2300452..5f611658eeab 100644 --- a/kernel/sched/cputime.c +++ b/kernel/sched/cputime.c @@ -44,12 +44,13 @@ static void irqtime_account_delta(struct irqtime *irqtime, u64 delta, } /* - * Called before incrementing preempt_count on {soft,}irq_enter + * Called after incrementing preempt_count on {soft,}irq_enter * and before decrementing preempt_count on {soft,}irq_exit. */ -void irqtime_account_irq(struct task_struct *curr) +void irqtime_account_irq(struct task_struct *curr, unsigned int offset) { struct irqtime *irqtime = this_cpu_ptr(&cpu_irqtime); + unsigned int pc; s64 delta; int cpu; @@ -59,6 +60,7 @@ void irqtime_account_irq(struct task_struct *curr) cpu = smp_processor_id(); delta = sched_clock_cpu(cpu) - irqtime->irq_start_time; irqtime->irq_start_time += delta; + pc = preempt_count() - offset; /* * We do not account for softirq time from ksoftirqd here. @@ -66,12 +68,11 @@ void irqtime_account_irq(struct task_struct *curr) * in that case, so as not to confuse scheduler with a special task * that do not consume any time, but still wants to run. */ - if (hardirq_count()) + if (pc & HARDIRQ_MASK) irqtime_account_delta(irqtime, delta, CPUTIME_IRQ); - else if (in_serving_softirq() && curr != this_cpu_ksoftirqd()) + else if ((pc & SOFTIRQ_OFFSET) && curr != this_cpu_ksoftirqd()) irqtime_account_delta(irqtime, delta, CPUTIME_SOFTIRQ); } -EXPORT_SYMBOL_GPL(irqtime_account_irq); static u64 irqtime_tick_accounted(u64 maxtime) { @@ -418,24 +419,21 @@ void vtime_task_switch(struct task_struct *prev) } # endif -/* - * Archs that account the whole time spent in the idle task - * (outside irq) as idle time can rely on this and just implement - * vtime_account_kernel() and vtime_account_idle(). Archs that - * have other meaning of the idle time (s390 only includes the - * time spent by the CPU when it's in low power mode) must override - * vtime_account(). 
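
irqtime_account_irq() above now runs after preempt_count has been incremented, so it subtracts the caller-supplied offset and classifies the context the just-elapsed time was spent in. A sketch with the usual preempt_count layout (8 preempt, 8 softirq, 4 hardirq bits; values reproduced here for illustration):

    #define SOFTIRQ_OFFSET 0x00000100
    #define HARDIRQ_OFFSET 0x00010000
    #define SOFTIRQ_MASK   0x0000ff00
    #define HARDIRQ_MASK   0x000f0000

    enum ctx { CTX_TASK, CTX_SOFTIRQ, CTX_HARDIRQ };

    /* 'offset' is what the entry path just added (e.g. HARDIRQ_OFFSET). */
    static enum ctx classify_prior_context(unsigned int preempt_count,
                                           unsigned int offset)
    {
        unsigned int pc = preempt_count - offset;

        if (pc & HARDIRQ_MASK)      /* nested hardirq entry */
            return CTX_HARDIRQ;
        if (pc & SOFTIRQ_OFFSET)    /* actually serving a softirq */
            return CTX_SOFTIRQ;
        return CTX_TASK;
    }

Note the softirq test uses SOFTIRQ_OFFSET rather than SOFTIRQ_MASK: only the lowest softirq bit means "serving a softirq"; the higher bits also count softirq-disabled (BH) sections, which must not be billed as softirq time.
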
- */ -#ifndef __ARCH_HAS_VTIME_ACCOUNT -void vtime_account_irq_enter(struct task_struct *tsk) +void vtime_account_irq(struct task_struct *tsk, unsigned int offset) { - if (!in_interrupt() && is_idle_task(tsk)) + unsigned int pc = preempt_count() - offset; + + if (pc & HARDIRQ_OFFSET) { + vtime_account_hardirq(tsk); + } else if (pc & SOFTIRQ_OFFSET) { + vtime_account_softirq(tsk); + } else if (!IS_ENABLED(CONFIG_HAVE_VIRT_CPU_ACCOUNTING_IDLE) && + is_idle_task(tsk)) { vtime_account_idle(tsk); - else + } else { vtime_account_kernel(tsk); + } } -EXPORT_SYMBOL_GPL(vtime_account_irq_enter); -#endif /* __ARCH_HAS_VTIME_ACCOUNT */ void cputime_adjust(struct task_cputime *curr, struct prev_cputime *prev, u64 *ut, u64 *st) diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index f232305dcefe..75686c6d4436 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -43,6 +43,28 @@ static inline int on_dl_rq(struct sched_dl_entity *dl_se) return !RB_EMPTY_NODE(&dl_se->rb_node); } +#ifdef CONFIG_RT_MUTEXES +static inline struct sched_dl_entity *pi_of(struct sched_dl_entity *dl_se) +{ + return dl_se->pi_se; +} + +static inline bool is_dl_boosted(struct sched_dl_entity *dl_se) +{ + return pi_of(dl_se) != dl_se; +} +#else +static inline struct sched_dl_entity *pi_of(struct sched_dl_entity *dl_se) +{ + return dl_se; +} + +static inline bool is_dl_boosted(struct sched_dl_entity *dl_se) +{ + return false; +} +#endif + #ifdef CONFIG_SMP static inline struct dl_bw *dl_bw_of(int i) { @@ -97,6 +119,17 @@ static inline unsigned long dl_bw_capacity(int i) return __dl_bw_capacity(i); } } + +static inline bool dl_bw_visited(int cpu, u64 gen) +{ + struct root_domain *rd = cpu_rq(cpu)->rd; + + if (rd->visit_gen == gen) + return true; + + rd->visit_gen = gen; + return false; +} #else static inline struct dl_bw *dl_bw_of(int i) { @@ -112,6 +145,11 @@ static inline unsigned long dl_bw_capacity(int i) { return SCHED_CAPACITY_SCALE; } + +static inline bool dl_bw_visited(int cpu, u64 gen) +{ + return false; +} #endif static inline @@ -543,7 +581,7 @@ static int push_dl_task(struct rq *rq); static inline bool need_pull_dl_task(struct rq *rq, struct task_struct *prev) { - return dl_task(prev); + return rq->online && dl_task(prev); } static DEFINE_PER_CPU(struct callback_head, dl_push_head); @@ -698,7 +736,7 @@ static inline void setup_new_dl_entity(struct sched_dl_entity *dl_se) struct dl_rq *dl_rq = dl_rq_of_se(dl_se); struct rq *rq = rq_of_dl_rq(dl_rq); - WARN_ON(dl_se->dl_boosted); + WARN_ON(is_dl_boosted(dl_se)); WARN_ON(dl_time_before(rq_clock(rq), dl_se->deadline)); /* @@ -736,21 +774,20 @@ static inline void setup_new_dl_entity(struct sched_dl_entity *dl_se) * could happen are, typically, a entity voluntarily trying to overcome its * runtime, or it just underestimated it during sched_setattr(). */ -static void replenish_dl_entity(struct sched_dl_entity *dl_se, - struct sched_dl_entity *pi_se) +static void replenish_dl_entity(struct sched_dl_entity *dl_se) { struct dl_rq *dl_rq = dl_rq_of_se(dl_se); struct rq *rq = rq_of_dl_rq(dl_rq); - BUG_ON(pi_se->dl_runtime <= 0); + BUG_ON(pi_of(dl_se)->dl_runtime <= 0); /* * This could be the case for a !-dl task that is boosted. * Just go with full inherited parameters. 
*/ if (dl_se->dl_deadline == 0) { - dl_se->deadline = rq_clock(rq) + pi_se->dl_deadline; - dl_se->runtime = pi_se->dl_runtime; + dl_se->deadline = rq_clock(rq) + pi_of(dl_se)->dl_deadline; + dl_se->runtime = pi_of(dl_se)->dl_runtime; } if (dl_se->dl_yielded && dl_se->runtime > 0) @@ -763,8 +800,8 @@ static void replenish_dl_entity(struct sched_dl_entity *dl_se, * arbitrary large. */ while (dl_se->runtime <= 0) { - dl_se->deadline += pi_se->dl_period; - dl_se->runtime += pi_se->dl_runtime; + dl_se->deadline += pi_of(dl_se)->dl_period; + dl_se->runtime += pi_of(dl_se)->dl_runtime; } /* @@ -778,8 +815,8 @@ static void replenish_dl_entity(struct sched_dl_entity *dl_se, */ if (dl_time_before(dl_se->deadline, rq_clock(rq))) { printk_deferred_once("sched: DL replenish lagged too much\n"); - dl_se->deadline = rq_clock(rq) + pi_se->dl_deadline; - dl_se->runtime = pi_se->dl_runtime; + dl_se->deadline = rq_clock(rq) + pi_of(dl_se)->dl_deadline; + dl_se->runtime = pi_of(dl_se)->dl_runtime; } if (dl_se->dl_yielded) @@ -812,8 +849,7 @@ static void replenish_dl_entity(struct sched_dl_entity *dl_se, * task with deadline equal to period this is the same of using * dl_period instead of dl_deadline in the equation above. */ -static bool dl_entity_overflow(struct sched_dl_entity *dl_se, - struct sched_dl_entity *pi_se, u64 t) +static bool dl_entity_overflow(struct sched_dl_entity *dl_se, u64 t) { u64 left, right; @@ -835,9 +871,9 @@ static bool dl_entity_overflow(struct sched_dl_entity *dl_se, * of anything below microseconds resolution is actually fiction * (but still we want to give the user that illusion >;). */ - left = (pi_se->dl_deadline >> DL_SCALE) * (dl_se->runtime >> DL_SCALE); + left = (pi_of(dl_se)->dl_deadline >> DL_SCALE) * (dl_se->runtime >> DL_SCALE); right = ((dl_se->deadline - t) >> DL_SCALE) * - (pi_se->dl_runtime >> DL_SCALE); + (pi_of(dl_se)->dl_runtime >> DL_SCALE); return dl_time_before(right, left); } @@ -922,24 +958,23 @@ static inline bool dl_is_implicit(struct sched_dl_entity *dl_se) * Please refer to the comments update_dl_revised_wakeup() function to find * more about the Revised CBS rule. */ -static void update_dl_entity(struct sched_dl_entity *dl_se, - struct sched_dl_entity *pi_se) +static void update_dl_entity(struct sched_dl_entity *dl_se) { struct dl_rq *dl_rq = dl_rq_of_se(dl_se); struct rq *rq = rq_of_dl_rq(dl_rq); if (dl_time_before(dl_se->deadline, rq_clock(rq)) || - dl_entity_overflow(dl_se, pi_se, rq_clock(rq))) { + dl_entity_overflow(dl_se, rq_clock(rq))) { if (unlikely(!dl_is_implicit(dl_se) && !dl_time_before(dl_se->deadline, rq_clock(rq)) && - !dl_se->dl_boosted)){ + !is_dl_boosted(dl_se))) { update_dl_revised_wakeup(dl_se, rq); return; } - dl_se->deadline = rq_clock(rq) + pi_se->dl_deadline; - dl_se->runtime = pi_se->dl_runtime; + dl_se->deadline = rq_clock(rq) + pi_of(dl_se)->dl_deadline; + dl_se->runtime = pi_of(dl_se)->dl_runtime; } } @@ -1038,7 +1073,7 @@ static enum hrtimer_restart dl_task_timer(struct hrtimer *timer) * The task might have been boosted by someone else and might be in the * boosting/deboosting path, its not throttled. */ - if (dl_se->dl_boosted) + if (is_dl_boosted(dl_se)) goto unlock; /* @@ -1066,7 +1101,7 @@ static enum hrtimer_restart dl_task_timer(struct hrtimer *timer) * but do not enqueue -- wait for our wakeup to do that. 
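
The pi_of()/is_dl_boosted() helpers introduced above replace the old dl_boosted flag plus explicitly passed donor entity: each deadline entity now carries a pi_se pointer that refers either to itself (not boosted) or to the top waiter's parameters. The pattern in isolation (toy types, illustrative):

    struct toy_dl {
        unsigned long long runtime, deadline;        /* current budget */
        unsigned long long dl_runtime, dl_deadline;  /* parameters */
        struct toy_dl *pi_se;  /* donor parameters; self if unboosted */
    };

    static struct toy_dl *pi_of(struct toy_dl *dl_se)
    {
        return dl_se->pi_se;
    }

    static int is_dl_boosted(struct toy_dl *dl_se)
    {
        return pi_of(dl_se) != dl_se;  /* pointer test, no flag */
    }

    /* Replenishment always draws on the donor's parameters, so a
     * boosted entity transparently runs on the top waiter's budget. */
    static void toy_replenish(struct toy_dl *dl_se, unsigned long long now)
    {
        dl_se->deadline = now + pi_of(dl_se)->dl_deadline;
        dl_se->runtime = pi_of(dl_se)->dl_runtime;
    }
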
*/ if (!task_on_rq_queued(p)) { - replenish_dl_entity(dl_se, dl_se); + replenish_dl_entity(dl_se); goto unlock; } @@ -1156,7 +1191,7 @@ static inline void dl_check_constrained_dl(struct sched_dl_entity *dl_se) if (dl_time_before(dl_se->deadline, rq_clock(rq)) && dl_time_before(rq_clock(rq), dl_next_period(dl_se))) { - if (unlikely(dl_se->dl_boosted || !start_dl_timer(p))) + if (unlikely(is_dl_boosted(dl_se) || !start_dl_timer(p))) return; dl_se->dl_throttled = 1; if (dl_se->runtime > 0) @@ -1287,7 +1322,7 @@ throttle: dl_se->dl_overrun = 1; __dequeue_task_dl(rq, curr, 0); - if (unlikely(dl_se->dl_boosted || !start_dl_timer(curr))) + if (unlikely(is_dl_boosted(dl_se) || !start_dl_timer(curr))) enqueue_task_dl(rq, curr, ENQUEUE_REPLENISH); if (!is_leftmost(curr, &rq->dl)) @@ -1378,6 +1413,8 @@ static void inc_dl_deadline(struct dl_rq *dl_rq, u64 deadline) if (dl_rq->earliest_dl.curr == 0 || dl_time_before(deadline, dl_rq->earliest_dl.curr)) { + if (dl_rq->earliest_dl.curr == 0) + cpupri_set(&rq->rd->cpupri, rq->cpu, CPUPRI_HIGHER); dl_rq->earliest_dl.curr = deadline; cpudl_set(&rq->rd->cpudl, rq->cpu, deadline); } @@ -1395,6 +1432,7 @@ static void dec_dl_deadline(struct dl_rq *dl_rq, u64 deadline) dl_rq->earliest_dl.curr = 0; dl_rq->earliest_dl.next = 0; cpudl_clear(&rq->rd->cpudl, rq->cpu); + cpupri_set(&rq->rd->cpupri, rq->cpu, rq->rt.highest_prio.curr); } else { struct rb_node *leftmost = dl_rq->root.rb_leftmost; struct sched_dl_entity *entry; @@ -1481,8 +1519,7 @@ static void __dequeue_dl_entity(struct sched_dl_entity *dl_se) } static void -enqueue_dl_entity(struct sched_dl_entity *dl_se, - struct sched_dl_entity *pi_se, int flags) +enqueue_dl_entity(struct sched_dl_entity *dl_se, int flags) { BUG_ON(on_dl_rq(dl_se)); @@ -1493,9 +1530,9 @@ enqueue_dl_entity(struct sched_dl_entity *dl_se, */ if (flags & ENQUEUE_WAKEUP) { task_contending(dl_se, flags); - update_dl_entity(dl_se, pi_se); + update_dl_entity(dl_se); } else if (flags & ENQUEUE_REPLENISH) { - replenish_dl_entity(dl_se, pi_se); + replenish_dl_entity(dl_se); } else if ((flags & ENQUEUE_RESTORE) && dl_time_before(dl_se->deadline, rq_clock(rq_of_dl_rq(dl_rq_of_se(dl_se))))) { @@ -1512,19 +1549,7 @@ static void dequeue_dl_entity(struct sched_dl_entity *dl_se) static void enqueue_task_dl(struct rq *rq, struct task_struct *p, int flags) { - struct task_struct *pi_task = rt_mutex_get_top_task(p); - struct sched_dl_entity *pi_se = &p->dl; - - /* - * Use the scheduling parameters of the top pi-waiter task if: - * - we have a top pi-waiter which is a SCHED_DEADLINE task AND - * - our dl_boosted is set (i.e. the pi-waiter's (absolute) deadline is - * smaller than our deadline OR we are a !SCHED_DEADLINE task getting - * boosted due to a SCHED_DEADLINE pi-waiter). - * Otherwise we keep our runtime and deadline. - */ - if (pi_task && dl_prio(pi_task->normal_prio) && p->dl.dl_boosted) { - pi_se = &pi_task->dl; + if (is_dl_boosted(&p->dl)) { /* * Because of delays in the detection of the overrun of a * thread's runtime, it might be the case that a thread @@ -1557,7 +1582,7 @@ static void enqueue_task_dl(struct rq *rq, struct task_struct *p, int flags) * the throttle. 
*/ p->dl.dl_throttled = 0; - BUG_ON(!p->dl.dl_boosted || flags != ENQUEUE_REPLENISH); + BUG_ON(!is_dl_boosted(&p->dl) || flags != ENQUEUE_REPLENISH); return; } @@ -1594,7 +1619,7 @@ static void enqueue_task_dl(struct rq *rq, struct task_struct *p, int flags) return; } - enqueue_dl_entity(&p->dl, pi_se, flags); + enqueue_dl_entity(&p->dl, flags); if (!task_current(rq, p) && p->nr_cpus_allowed > 1) enqueue_pushable_dl_task(rq, p); @@ -1664,13 +1689,13 @@ static void yield_task_dl(struct rq *rq) static int find_later_rq(struct task_struct *task); static int -select_task_rq_dl(struct task_struct *p, int cpu, int sd_flag, int flags) +select_task_rq_dl(struct task_struct *p, int cpu, int flags) { struct task_struct *curr; bool select_rq; struct rq *rq; - if (sd_flag != SD_BALANCE_WAKE) + if (!(flags & WF_TTWU)) goto out; rq = cpu_rq(cpu); @@ -1912,7 +1937,7 @@ static void task_fork_dl(struct task_struct *p) static int pick_dl_task(struct rq *rq, struct task_struct *p, int cpu) { if (!task_running(rq, p) && - cpumask_test_cpu(cpu, p->cpus_ptr)) + cpumask_test_cpu(cpu, &p->cpus_mask)) return 1; return 0; } @@ -2002,8 +2027,8 @@ static int find_later_rq(struct task_struct *task) return this_cpu; } - best_cpu = cpumask_first_and(later_mask, - sched_domain_span(sd)); + best_cpu = cpumask_any_and_distribute(later_mask, + sched_domain_span(sd)); /* * Last chance: if a CPU being in both later_mask * and current sd span is valid, that becomes our @@ -2025,7 +2050,7 @@ static int find_later_rq(struct task_struct *task) if (this_cpu != -1) return this_cpu; - cpu = cpumask_any(later_mask); + cpu = cpumask_any_distribute(later_mask); if (cpu < nr_cpu_ids) return cpu; @@ -2062,7 +2087,7 @@ static struct rq *find_lock_later_rq(struct task_struct *task, struct rq *rq) /* Retry if something changed. */ if (double_lock_balance(rq, later_rq)) { if (unlikely(task_rq(task) != rq || - !cpumask_test_cpu(later_rq->cpu, task->cpus_ptr) || + !cpumask_test_cpu(later_rq->cpu, &task->cpus_mask) || task_running(rq, task) || !dl_task(task) || !task_on_rq_queued(task))) { @@ -2129,6 +2154,9 @@ static int push_dl_task(struct rq *rq) return 0; retry: + if (is_migration_disabled(next_task)) + return 0; + if (WARN_ON(next_task == rq->curr)) return 0; @@ -2206,7 +2234,7 @@ static void push_dl_tasks(struct rq *rq) static void pull_dl_task(struct rq *this_rq) { int this_cpu = this_rq->cpu, cpu; - struct task_struct *p; + struct task_struct *p, *push_task; bool resched = false; struct rq *src_rq; u64 dmin = LONG_MAX; @@ -2236,6 +2264,7 @@ static void pull_dl_task(struct rq *this_rq) continue; /* Might drop this_rq->lock */ + push_task = NULL; double_lock_balance(this_rq, src_rq); /* @@ -2267,17 +2296,27 @@ static void pull_dl_task(struct rq *this_rq) src_rq->curr->dl.deadline)) goto skip; - resched = true; - - deactivate_task(src_rq, p, 0); - set_task_cpu(p, this_cpu); - activate_task(this_rq, p, 0); - dmin = p->dl.deadline; + if (is_migration_disabled(p)) { + push_task = get_push_task(src_rq); + } else { + deactivate_task(src_rq, p, 0); + set_task_cpu(p, this_cpu); + activate_task(this_rq, p, 0); + dmin = p->dl.deadline; + resched = true; + } /* Is there any other task even earlier? 
*/ } skip: double_unlock_balance(this_rq, src_rq); + + if (push_task) { + raw_spin_unlock(&this_rq->lock); + stop_one_cpu_nowait(src_rq->cpu, push_cpu_stop, + push_task, &src_rq->push_work); + raw_spin_lock(&this_rq->lock); + } } if (resched) @@ -2301,7 +2340,8 @@ static void task_woken_dl(struct rq *rq, struct task_struct *p) } static void set_cpus_allowed_dl(struct task_struct *p, - const struct cpumask *new_mask) + const struct cpumask *new_mask, + u32 flags) { struct root_domain *src_rd; struct rq *rq; @@ -2330,7 +2370,7 @@ static void set_cpus_allowed_dl(struct task_struct *p, raw_spin_unlock(&src_dl_b->lock); } - set_cpus_allowed_common(p, new_mask); + set_cpus_allowed_common(p, new_mask, flags); } /* Assumes rq->lock is held */ @@ -2503,8 +2543,8 @@ static void prio_changed_dl(struct rq *rq, struct task_struct *p, } } -const struct sched_class dl_sched_class - __section("__dl_sched_class") = { +DEFINE_SCHED_CLASS(dl) = { + .enqueue_task = enqueue_task_dl, .dequeue_task = dequeue_task_dl, .yield_task = yield_task_dl, @@ -2523,6 +2563,7 @@ const struct sched_class dl_sched_class .rq_online = rq_online_dl, .rq_offline = rq_offline_dl, .task_woken = task_woken_dl, + .find_lock_rq = find_lock_later_rq, #endif .task_tick = task_tick_dl, @@ -2535,33 +2576,39 @@ const struct sched_class dl_sched_class .update_curr = update_curr_dl, }; +/* Used for dl_bw check and update, used under sched_rt_handler()::mutex */ +static u64 dl_generation; + int sched_dl_global_validate(void) { u64 runtime = global_rt_runtime(); u64 period = global_rt_period(); u64 new_bw = to_ratio(period, runtime); + u64 gen = ++dl_generation; struct dl_bw *dl_b; - int cpu, ret = 0; + int cpu, cpus, ret = 0; unsigned long flags; /* * Here we want to check the bandwidth not being set to some * value smaller than the currently allocated bandwidth in * any of the root_domains. - * - * FIXME: Cycling on all the CPUs is overdoing, but simpler than - * cycling on root_domains... Discussion on different/better - * solutions is welcome! */ for_each_possible_cpu(cpu) { rcu_read_lock_sched(); + + if (dl_bw_visited(cpu, gen)) + goto next; + dl_b = dl_bw_of(cpu); + cpus = dl_bw_cpus(cpu); raw_spin_lock_irqsave(&dl_b->lock, flags); - if (new_bw < dl_b->total_bw) + if (new_bw * cpus < dl_b->total_bw) ret = -EBUSY; raw_spin_unlock_irqrestore(&dl_b->lock, flags); +next: rcu_read_unlock_sched(); if (ret) @@ -2587,6 +2634,7 @@ static void init_dl_rq_bw_ratio(struct dl_rq *dl_rq) void sched_dl_do_global(void) { u64 new_bw = -1; + u64 gen = ++dl_generation; struct dl_bw *dl_b; int cpu; unsigned long flags; @@ -2597,11 +2645,14 @@ void sched_dl_do_global(void) if (global_rt_runtime() != RUNTIME_INF) new_bw = to_ratio(global_rt_period(), global_rt_runtime()); - /* - * FIXME: As above... 
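
sched_dl_global_validate() above retires the old FIXME ("cycling on all the CPUs is overdoing") with a generation counter: the first CPU seen in each root domain stamps it, and every other CPU of that domain is skipped. The trick in isolation (toy types):

    struct toy_root_domain { unsigned long long visit_gen; };

    static unsigned long long dl_generation;

    /* Returns 1 if this CPU's root domain was already handled. */
    static int toy_dl_bw_visited(struct toy_root_domain *rd,
                                 unsigned long long gen)
    {
        if (rd->visit_gen == gen)
            return 1;
        rd->visit_gen = gen;   /* first visitor stamps the domain */
        return 0;
    }

    /*
     * Caller pattern, mirroring the hunk above:
     *
     *     u64 gen = ++dl_generation;
     *     for_each_possible_cpu(cpu)
     *         if (!dl_bw_visited(cpu, gen))
     *             validate_bandwidth_of(cpu);
     */
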
- */ for_each_possible_cpu(cpu) { rcu_read_lock_sched(); + + if (dl_bw_visited(cpu, gen)) { + rcu_read_unlock_sched(); + continue; + } + dl_b = dl_bw_of(cpu); raw_spin_lock_irqsave(&dl_b->lock, flags); @@ -2787,11 +2838,14 @@ void __dl_clear_params(struct task_struct *p) dl_se->dl_bw = 0; dl_se->dl_density = 0; - dl_se->dl_boosted = 0; dl_se->dl_throttled = 0; dl_se->dl_yielded = 0; dl_se->dl_non_contending = 0; dl_se->dl_overrun = 0; + +#ifdef CONFIG_RT_MUTEXES + dl_se->pi_se = dl_se; +#endif } bool dl_param_changed(struct task_struct *p, const struct sched_attr *attr) diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c index 0655524700d2..2357921580f9 100644 --- a/kernel/sched/debug.c +++ b/kernel/sched/debug.c @@ -251,7 +251,7 @@ static int sd_ctl_doflags(struct ctl_table *table, int write, unsigned long flags = *(unsigned long *)table->data; size_t data_size = 0; size_t len = 0; - char *tmp; + char *tmp, *buf; int idx; if (write) @@ -269,17 +269,17 @@ return 0; } - tmp = kcalloc(data_size + 1, sizeof(*tmp), GFP_KERNEL); - if (!tmp) + buf = kcalloc(data_size + 1, sizeof(*buf), GFP_KERNEL); + if (!buf) return -ENOMEM; for_each_set_bit(idx, &flags, __SD_FLAG_CNT) { char *name = sd_flag_debug[idx].name; - len += snprintf(tmp + len, strlen(name) + 2, "%s ", name); + len += snprintf(buf + len, strlen(name) + 2, "%s ", name); } - tmp += *ppos; + tmp = buf + *ppos; len -= *ppos; if (len > *lenp) @@ -294,7 +294,7 @@ *lenp = len; *ppos += len; - kfree(tmp); + kfree(buf); return 0; } diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 290f9e38378c..04a3ce20da67 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -906,6 +906,15 @@ update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se) if (!schedstat_enabled()) return; + /* + * When sched_schedstat changes from 0 to 1, some sched entities + * may already be in the runqueue; their se->statistics.wait_start + * will be 0, so the computed delta would be wrong. We need to avoid + * this scenario. 
+ */ + if (unlikely(!schedstat_val(se->statistics.wait_start))) + return; + delta = rq_clock(rq_of(cfs_rq)) - schedstat_val(se->statistics.wait_start); if (entity_is_task(se)) { @@ -1550,7 +1559,8 @@ struct task_numa_env { static unsigned long cpu_load(struct rq *rq); static unsigned long cpu_runnable(struct rq *rq); static unsigned long cpu_util(int cpu); -static inline long adjust_numa_imbalance(int imbalance, int nr_running); +static inline long adjust_numa_imbalance(int imbalance, + int dst_running, int dst_weight); static inline enum numa_type numa_classify(unsigned int imbalance_pct, @@ -1930,7 +1940,8 @@ static void task_numa_find_cpu(struct task_numa_env *env, src_running = env->src_stats.nr_running - 1; dst_running = env->dst_stats.nr_running + 1; imbalance = max(0, dst_running - src_running); - imbalance = adjust_numa_imbalance(imbalance, dst_running); + imbalance = adjust_numa_imbalance(imbalance, dst_running, + env->dst_stats.weight); /* Use idle CPU if there is no imbalance */ if (!imbalance) { @@ -4779,25 +4790,37 @@ static bool throttle_cfs_rq(struct cfs_rq *cfs_rq) struct cfs_rq *qcfs_rq = cfs_rq_of(se); /* throttled entity or throttle-on-deactivate */ if (!se->on_rq) - break; + goto done; - if (dequeue) { - dequeue_entity(qcfs_rq, se, DEQUEUE_SLEEP); - } else { - update_load_avg(qcfs_rq, se, 0); - se_update_runnable(se); - } + dequeue_entity(qcfs_rq, se, DEQUEUE_SLEEP); qcfs_rq->h_nr_running -= task_delta; qcfs_rq->idle_h_nr_running -= idle_task_delta; - if (qcfs_rq->load.weight) - dequeue = 0; + if (qcfs_rq->load.weight) { + /* Avoid re-evaluating load for this entity: */ + se = parent_entity(se); + break; + } } - if (!se) - sub_nr_running(rq, task_delta); + for_each_sched_entity(se) { + struct cfs_rq *qcfs_rq = cfs_rq_of(se); + /* throttled entity or throttle-on-deactivate */ + if (!se->on_rq) + goto done; + + update_load_avg(qcfs_rq, se, 0); + se_update_runnable(se); + + qcfs_rq->h_nr_running -= task_delta; + qcfs_rq->idle_h_nr_running -= idle_task_delta; + } + /* At this point se is NULL and we are at root level*/ + sub_nr_running(rq, task_delta); + +done: /* * Note: distribution will already see us throttled via the * throttled-list. rq->lock protects completion. @@ -5105,9 +5128,6 @@ static void do_sched_cfs_slack_timer(struct cfs_bandwidth *cfs_b) return; distribute_cfs_runtime(cfs_b); - - raw_spin_lock_irqsave(&cfs_b->lock, flags); - raw_spin_unlock_irqrestore(&cfs_b->lock, flags); } /* @@ -5477,6 +5497,7 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags) struct cfs_rq *cfs_rq; struct sched_entity *se = &p->se; int idle_h_nr_running = task_has_idle_policy(p); + int task_new = !(flags & ENQUEUE_WAKEUP); /* * The code below (indirectly) updates schedutil which looks at @@ -5549,7 +5570,7 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags) * into account, but that is not straightforward to implement, * and the following generally works well enough in practice. 
*/ - if (flags & ENQUEUE_WAKEUP) + if (!task_new) update_overutilized_status(rq); enqueue_throttle: @@ -5804,6 +5825,9 @@ wake_affine_idle(int this_cpu, int prev_cpu, int sync) if (sync && cpu_rq(this_cpu)->nr_running == 1) return this_cpu; + if (available_idle_cpu(prev_cpu)) + return prev_cpu; + return nr_cpumask_bits; } @@ -6062,10 +6086,11 @@ static int select_idle_core(struct task_struct *p, struct sched_domain *sd, int break; } } - cpumask_andnot(cpus, cpus, cpu_smt_mask(core)); if (idle) return core; + + cpumask_andnot(cpus, cpus, cpu_smt_mask(core)); } /* @@ -6172,21 +6197,21 @@ static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, int t static int select_idle_capacity(struct task_struct *p, struct sched_domain *sd, int target) { - unsigned long best_cap = 0; + unsigned long task_util, best_cap = 0; int cpu, best_cpu = -1; struct cpumask *cpus; - sync_entity_load_avg(&p->se); - cpus = this_cpu_cpumask_var_ptr(select_idle_mask); cpumask_and(cpus, sched_domain_span(sd), p->cpus_ptr); + task_util = uclamp_task_util(p); + for_each_cpu_wrap(cpu, cpus, target) { unsigned long cpu_cap = capacity_of(cpu); if (!available_idle_cpu(cpu) && !sched_idle_cpu(cpu)) continue; - if (task_fits_capacity(p, cpu_cap)) + if (fits_capacity(task_util, cpu_cap)) return cpu; if (cpu_cap > best_cap) { @@ -6198,44 +6223,42 @@ select_idle_capacity(struct task_struct *p, struct sched_domain *sd, int target) return best_cpu; } +static inline bool asym_fits_capacity(int task_util, int cpu) +{ + if (static_branch_unlikely(&sched_asym_cpucapacity)) + return fits_capacity(task_util, capacity_of(cpu)); + + return true; +} + /* * Try and locate an idle core/thread in the LLC cache domain. */ static int select_idle_sibling(struct task_struct *p, int prev, int target) { struct sched_domain *sd; + unsigned long task_util; int i, recent_used_cpu; /* - * For asymmetric CPU capacity systems, our domain of interest is - * sd_asym_cpucapacity rather than sd_llc. + * On asymmetric system, update task utilization because we will check + * that the task fits with cpu's capacity. */ if (static_branch_unlikely(&sched_asym_cpucapacity)) { - sd = rcu_dereference(per_cpu(sd_asym_cpucapacity, target)); - /* - * On an asymmetric CPU capacity system where an exclusive - * cpuset defines a symmetric island (i.e. one unique - * capacity_orig value through the cpuset), the key will be set - * but the CPUs within that cpuset will not have a domain with - * SD_ASYM_CPUCAPACITY. These should follow the usual symmetric - * capacity path. - */ - if (!sd) - goto symmetric; - - i = select_idle_capacity(p, sd, target); - return ((unsigned)i < nr_cpumask_bits) ? 
i : target; + sync_entity_load_avg(&p->se); + task_util = uclamp_task_util(p); } -symmetric: - if (available_idle_cpu(target) || sched_idle_cpu(target)) + if ((available_idle_cpu(target) || sched_idle_cpu(target)) && + asym_fits_capacity(task_util, target)) return target; /* * If the previous CPU is cache affine and idle, don't be stupid: */ if (prev != target && cpus_share_cache(prev, target) && - (available_idle_cpu(prev) || sched_idle_cpu(prev))) + (available_idle_cpu(prev) || sched_idle_cpu(prev)) && + asym_fits_capacity(task_util, prev)) return prev; /* @@ -6258,7 +6281,8 @@ symmetric: recent_used_cpu != target && cpus_share_cache(recent_used_cpu, target) && (available_idle_cpu(recent_used_cpu) || sched_idle_cpu(recent_used_cpu)) && - cpumask_test_cpu(p->recent_used_cpu, p->cpus_ptr)) { + cpumask_test_cpu(p->recent_used_cpu, p->cpus_ptr) && + asym_fits_capacity(task_util, recent_used_cpu)) { /* * Replace recent_used_cpu with prev as it is a potential * candidate for the next wake: @@ -6267,6 +6291,26 @@ symmetric: return recent_used_cpu; } + /* + * For asymmetric CPU capacity systems, our domain of interest is + * sd_asym_cpucapacity rather than sd_llc. + */ + if (static_branch_unlikely(&sched_asym_cpucapacity)) { + sd = rcu_dereference(per_cpu(sd_asym_cpucapacity, target)); + /* + * On an asymmetric CPU capacity system where an exclusive + * cpuset defines a symmetric island (i.e. one unique + * capacity_orig value through the cpuset), the key will be set + * but the CPUs within that cpuset will not have a domain with + * SD_ASYM_CPUCAPACITY. These should follow the usual symmetric + * capacity path. + */ + if (sd) { + i = select_idle_capacity(p, sd, target); + return ((unsigned)i < nr_cpumask_bits) ? i : target; + } + } + sd = rcu_dereference(per_cpu(sd_llc, target)); if (!sd) return target; @@ -6287,7 +6331,7 @@ symmetric: } /** - * Amount of capacity of a CPU that is (estimated to be) used by CFS tasks + * cpu_util - Estimates the amount of capacity of a CPU used by CFS tasks. * @cpu: the CPU to get the utilization of * * The unit of the return value must be the one of capacity so we can compare @@ -6663,7 +6707,7 @@ fail: /* * select_task_rq_fair: Select target runqueue for the waking task in domains - * that have the 'sd_flag' flag set. In practice, this is SD_BALANCE_WAKE, + * that have the relevant SD flag set. In practice, this is SD_BALANCE_WAKE, * SD_BALANCE_FORK, or SD_BALANCE_EXEC. * * Balances load by selecting the idlest CPU in the idlest group, or under @@ -6674,15 +6718,17 @@ fail: * preempt must be disabled. */ static int -select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_flags) +select_task_rq_fair(struct task_struct *p, int prev_cpu, int wake_flags) { + int sync = (wake_flags & WF_SYNC) && !(current->flags & PF_EXITING); struct sched_domain *tmp, *sd = NULL; int cpu = smp_processor_id(); int new_cpu = prev_cpu; int want_affine = 0; - int sync = (wake_flags & WF_SYNC) && !(current->flags & PF_EXITING); + /* SD_flags and WF_flags share the first nibble */ + int sd_flag = wake_flags & 0xF; - if (sd_flag & SD_BALANCE_WAKE) { + if (wake_flags & WF_TTWU) { record_wakee(p); if (sched_energy_enabled()) { @@ -6719,9 +6765,8 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f if (unlikely(sd)) { /* Slow path */ new_cpu = find_idlest_cpu(sd, p, cpu, prev_cpu, sd_flag); - } else if (sd_flag & SD_BALANCE_WAKE) { /* XXX always ? */ + } else if (wake_flags & WF_TTWU) { /* XXX always ? 
*/ /* Fast path */ - new_cpu = select_idle_sibling(p, prev_cpu, new_cpu); if (want_affine) @@ -8738,6 +8783,16 @@ static bool update_pick_idlest(struct sched_group *idlest, } /* + * Allow a NUMA imbalance if busy CPUs is less than 25% of the domain. + * This is an approximation as the number of running tasks may not be + * related to the number of busy CPUs due to sched_setaffinity. + */ +static inline bool allow_numa_imbalance(int dst_running, int dst_weight) +{ + return (dst_running < (dst_weight >> 2)); +} + +/* * find_idlest_group() finds and returns the least busy CPU group within the * domain. * @@ -8755,9 +8810,6 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p, int this_cpu) .group_type = group_overloaded, }; - imbalance = scale_load_down(NICE_0_LOAD) * - (sd->imbalance_pct-100) / 100; - do { int local_group; @@ -8811,6 +8863,11 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p, int this_cpu) switch (local_sgs.group_type) { case group_overloaded: case group_fully_busy: + + /* Calculate allowed imbalance based on load */ + imbalance = scale_load_down(NICE_0_LOAD) * + (sd->imbalance_pct-100) / 100; + /* * When comparing groups across NUMA domains, it's possible for * the local domain to be very lightly loaded relative to the @@ -8867,7 +8924,7 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p, int this_cpu) * a real need of migration, periodic load balance will * take care of it. */ - if (local_sgs.idle_cpus) + if (allow_numa_imbalance(local_sgs.sum_nr_running, sd->span_weight)) return NULL; } @@ -8969,16 +9026,19 @@ next_group: } } -static inline long adjust_numa_imbalance(int imbalance, int nr_running) +#define NUMA_IMBALANCE_MIN 2 + +static inline long adjust_numa_imbalance(int imbalance, + int dst_running, int dst_weight) { - unsigned int imbalance_min; + if (!allow_numa_imbalance(dst_running, dst_weight)) + return imbalance; /* * Allow a small imbalance based on a simple pair of communicating - * tasks that remain local when the source domain is almost idle. + * tasks that remain local when the destination is lightly loaded. */ - imbalance_min = 2; - if (nr_running <= imbalance_min) + if (imbalance <= NUMA_IMBALANCE_MIN) return 0; return imbalance; @@ -9031,7 +9091,8 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s * emptying busiest. */ if (local->group_type == group_has_spare) { - if (busiest->group_type > group_fully_busy) { + if ((busiest->group_type > group_fully_busy) && + !(env->sd->flags & SD_SHARE_PKG_RESOURCES)) { /* * If busiest is overloaded, try to fill spare * capacity. 
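
allow_numa_imbalance() above tolerates an imbalance only while the destination node is mostly idle. Restated standalone with worked numbers (the code mirrors the hunks; the numbers are illustrative): on a 16-CPU node, imbalance is permitted while fewer than 16 >> 2 = 4 CPUs are busy, and even then only an imbalance of at most 2 tasks is forgiven:

    #define NUMA_IMBALANCE_MIN 2

    static int allow_numa_imbalance(int dst_running, int dst_weight)
    {
        /* Busy CPUs under 25% of the domain: imbalance tolerated. */
        return dst_running < (dst_weight >> 2);
    }

    static long toy_adjust_numa_imbalance(long imbalance, int dst_running,
                                          int dst_weight)
    {
        if (!allow_numa_imbalance(dst_running, dst_weight))
            return imbalance;

        /* A pair of communicating tasks may stay local. */
        if (imbalance <= NUMA_IMBALANCE_MIN)
            return 0;

        return imbalance;
    }
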
This might end up creating spare capacity @@ -9080,9 +9141,10 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s } /* Consider allowing a small imbalance between NUMA groups */ - if (env->sd->flags & SD_NUMA) + if (env->sd->flags & SD_NUMA) { env->imbalance = adjust_numa_imbalance(env->imbalance, - busiest->sum_nr_running); + busiest->sum_nr_running, busiest->group_weight); + } return; } @@ -10047,6 +10109,10 @@ static inline int find_new_ilb(void) for_each_cpu_and(ilb, nohz.idle_cpus_mask, housekeeping_cpumask(HK_FLAG_MISC)) { + + if (ilb == smp_processor_id()) + continue; + if (idle_cpu(ilb)) return ilb; } @@ -10484,7 +10550,7 @@ static inline void nohz_newidle_balance(struct rq *this_rq) { } #endif /* CONFIG_NO_HZ_COMMON */ /* - * idle_balance is called by schedule() if this_cpu is about to become + * newidle_balance is called by schedule() if this_cpu is about to become * idle. Attempts to pull tasks from other CPUs. * * Returns: @@ -11158,8 +11224,8 @@ static unsigned int get_rr_interval_fair(struct rq *rq, struct task_struct *task /* * All the scheduling class methods: */ -const struct sched_class fair_sched_class - __section("__fair_sched_class") = { +DEFINE_SCHED_CLASS(fair) = { + .enqueue_task = enqueue_task_fair, .dequeue_task = dequeue_task_fair, .yield_task = yield_task_fair, diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c index 24d0ee26377d..305727ea0677 100644 --- a/kernel/sched/idle.c +++ b/kernel/sched/idle.c @@ -78,7 +78,7 @@ void __weak arch_cpu_idle_dead(void) { } void __weak arch_cpu_idle(void) { cpu_idle_force_poll = 1; - local_irq_enable(); + raw_local_irq_enable(); } /** @@ -94,9 +94,35 @@ void __cpuidle default_idle_call(void) trace_cpu_idle(1, smp_processor_id()); stop_critical_timings(); + + /* + * arch_cpu_idle() is supposed to enable IRQs, however + * we can't do that because of RCU and tracing. + * + * Trace IRQs enable here, then switch off RCU, and have + * arch_cpu_idle() use raw_local_irq_enable(). Note that + * rcu_idle_enter() relies on lockdep IRQ state, so switch that + * last -- this is very similar to the entry code. + */ + trace_hardirqs_on_prepare(); + lockdep_hardirqs_on_prepare(_THIS_IP_); rcu_idle_enter(); + lockdep_hardirqs_on(_THIS_IP_); + arch_cpu_idle(); + + /* + * OK, so IRQs are enabled here, but RCU needs them disabled to + * turn itself back on.. funny thing is that disabling IRQs + * will cause tracing, which needs RCU. Jump through hoops to + * make it 'work'. 
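+ *
+ * Editorial sketch of the resulting bracket, mirroring the statements
+ * around this comment (no additional code is implied):
+ *
+ *	trace_hardirqs_on_prepare();		// tracing while RCU is still on
+ *	lockdep_hardirqs_on_prepare(_THIS_IP_);
+ *	rcu_idle_enter();			// RCU off
+ *	lockdep_hardirqs_on(_THIS_IP_);		// lockdep only, no tracing
+ *	arch_cpu_idle();			// raw_local_irq_enable() inside
+ *	raw_local_irq_disable();
+ *	lockdep_hardirqs_off(_THIS_IP_);
+ *	rcu_idle_exit();			// RCU back on
+ *	lockdep_hardirqs_on(_THIS_IP_);
+ *	raw_local_irq_enable();			// tracing is safe again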
+ */
+ raw_local_irq_disable();
+ lockdep_hardirqs_off(_THIS_IP_);
 rcu_idle_exit();
+ lockdep_hardirqs_on(_THIS_IP_);
+ raw_local_irq_enable();
+
 start_critical_timings();
 trace_cpu_idle(PWR_EVENT_EXIT, smp_processor_id());
}
@@ -338,6 +364,7 @@ void play_idle_precise(u64 duration_ns, u64 latency_ns)
 WARN_ON_ONCE(!(current->flags & PF_KTHREAD));
 WARN_ON_ONCE(!(current->flags & PF_NO_SETAFFINITY));
 WARN_ON_ONCE(!duration_ns);
+ WARN_ON_ONCE(current->mm);
 rcu_sleep_check();
 preempt_disable();
@@ -375,7 +402,7 @@ void cpu_startup_entry(enum cpuhp_state state)
#ifdef CONFIG_SMP
 static int
-select_task_rq_idle(struct task_struct *p, int cpu, int sd_flag, int flags)
+select_task_rq_idle(struct task_struct *p, int cpu, int flags)
{
 return task_cpu(p); /* IDLE tasks are never migrated */
}
@@ -457,8 +484,8 @@ static void update_curr_idle(struct rq *rq)
 /*
 * Simple, special scheduling class for the per-CPU idle tasks:
 */
-const struct sched_class idle_sched_class
- __section("__idle_sched_class") = {
+DEFINE_SCHED_CLASS(idle) = {
+
 /* no enqueue/yield_task for idle tasks */
 /* dequeue is not valid, we print a debug message there: */
diff --git a/kernel/sched/membarrier.c b/kernel/sched/membarrier.c
index e23e74d52db5..08ae45ad9261 100644
--- a/kernel/sched/membarrier.c
+++ b/kernel/sched/membarrier.c
@@ -7,6 +7,134 @@
 #include "sched.h"
 /*
+ * For documentation purposes, here are some membarrier ordering
+ * scenarios to keep in mind:
+ *
+ * A) Userspace thread execution after IPI vs membarrier's memory
+ * barrier before sending the IPI
+ *
+ * Userspace variables:
+ *
+ * int x = 0, y = 0;
+ *
+ * The memory barrier at the start of membarrier() on CPU0 is necessary in
+ * order to enforce the guarantee that any writes occurring on CPU0 before
+ * the membarrier() is executed will be visible to any code executing on
+ * CPU1 after the IPI-induced memory barrier:
+ *
+ * CPU0 CPU1
+ *
+ * x = 1
+ * membarrier():
+ * a: smp_mb()
+ * b: send IPI IPI-induced mb
+ * c: smp_mb()
+ * r2 = y
+ * y = 1
+ * barrier()
+ * r1 = x
+ *
+ * BUG_ON(r1 == 0 && r2 == 0)
+ *
+ * The write to y and load from x by CPU1 are unordered by the hardware,
+ * so it's possible to have "r1 = x" reordered before "y = 1" at any
+ * point after (b). If the memory barrier at (a) is omitted, then "x = 1"
+ * can be reordered after (a) (although not after (c)), so we get r1 == 0
+ * and r2 == 0. This violates the guarantee that membarrier() is
+ * supposed to provide.
+ *
+ * The timing of the memory barrier at (a) has to ensure that it executes
+ * before the IPI-induced memory barrier on CPU1.
+ *
+ * B) Userspace thread execution before IPI vs membarrier's memory
+ * barrier after completing the IPI
+ *
+ * Userspace variables:
+ *
+ * int x = 0, y = 0;
+ *
+ * The memory barrier at the end of membarrier() on CPU0 is necessary in
+ * order to enforce the guarantee that any writes occurring on CPU1 before
+ * the membarrier() is executed will be visible to any code executing on
+ * CPU0 after the membarrier():
+ *
+ * CPU0 CPU1
+ *
+ * x = 1
+ * barrier()
+ * y = 1
+ * r2 = y
+ * membarrier():
+ * a: smp_mb()
+ * b: send IPI IPI-induced mb
+ * c: smp_mb()
+ * r1 = x
+ * BUG_ON(r1 == 0 && r2 == 1)
+ *
+ * The writes to x and y are unordered by the hardware, so it's possible to
+ * have "r2 = 1" even though the write to x doesn't execute until (b). If
+ * the memory barrier at (c) is omitted then "r1 = x" can be reordered
+ * before (b) (although not before (a)), so we get "r1 = 0".
This violates
+ * the guarantee that membarrier() is supposed to provide.
+ *
+ * The timing of the memory barrier at (c) has to ensure that it executes
+ * after the IPI-induced memory barrier on CPU1.
+ *
+ * C) Scheduling userspace thread -> kthread -> userspace thread vs membarrier
+ *
+ * CPU0 CPU1
+ *
+ * membarrier():
+ * a: smp_mb()
+ * d: switch to kthread (includes mb)
+ * b: read rq->curr->mm == NULL
+ * e: switch to user (includes mb)
+ * c: smp_mb()
+ *
+ * Using the scenario from (A), we can show that (a) needs to be paired
+ * with (e). Using the scenario from (B), we can show that (c) needs to
+ * be paired with (d).
+ *
+ * D) exit_mm vs membarrier
+ *
+ * Two thread groups are created, A and B. Thread group B is created by
+ * issuing clone from group A with flag CLONE_VM set, but not CLONE_THREAD.
+ * Let's assume we have a single thread within each thread group (Thread A
+ * and Thread B). Thread A runs on CPU0, Thread B runs on CPU1.
+ *
+ * CPU0 CPU1
+ *
+ * membarrier():
+ * a: smp_mb()
+ * exit_mm():
+ * d: smp_mb()
+ * e: current->mm = NULL
+ * b: read rq->curr->mm == NULL
+ * c: smp_mb()
+ *
+ * Using scenario (B), we can show that (c) needs to be paired with (d).
+ *
+ * E) kthread_{use,unuse}_mm vs membarrier
+ *
+ * CPU0 CPU1
+ *
+ * membarrier():
+ * a: smp_mb()
+ * kthread_unuse_mm()
+ * d: smp_mb()
+ * e: current->mm = NULL
+ * b: read rq->curr->mm == NULL
+ * kthread_use_mm()
+ * f: current->mm = mm
+ * g: smp_mb()
+ * c: smp_mb()
+ *
+ * Using the scenario from (A), we can show that (a) needs to be paired
+ * with (g). Using the scenario from (B), we can show that (c) needs to
+ * be paired with (d).
+ */
+
+/*
 * Bitmask made from an "or" of all commands within enum membarrier_cmd,
 * except MEMBARRIER_CMD_QUERY.
 */
@@ -38,8 +166,33 @@ static void ipi_mb(void *info)
 smp_mb(); /* IPIs should be serializing but paranoid. */
}
+static void ipi_sync_core(void *info)
+{
+ /*
+ * The smp_mb() in membarrier after all the IPIs is supposed to
+ * ensure that memory accesses on remote CPUs that occur before the IPI
+ * become visible to membarrier()'s caller -- see scenario B in
+ * the big comment at the top of this file.
+ *
+ * A sync_core() would provide this guarantee, but
+ * sync_core_before_usermode() might end up being deferred until
+ * after membarrier()'s smp_mb().
+ */
+ smp_mb(); /* IPIs should be serializing but paranoid. */
+
+ sync_core_before_usermode();
+}
+
 static void ipi_rseq(void *info)
{
+ /*
+ * Ensure that all stores done by the calling thread are visible
+ * to the current task before the current task resumes. We could
+ * probably optimize this away on most architectures, but by the
+ * time we've already sent an IPI, the cost of the extra smp_mb()
+ * is negligible.
+ */
+ smp_mb();
 rseq_preempt(current);
}
@@ -76,6 +229,18 @@ void membarrier_exec_mmap(struct mm_struct *mm)
 this_cpu_write(runqueues.membarrier_state, 0);
}
+void membarrier_update_current_mm(struct mm_struct *next_mm)
+{
+ struct rq *rq = this_rq();
+ int membarrier_state = 0;
+
+ if (next_mm)
+ membarrier_state = atomic_read(&next_mm->membarrier_state);
+ if (READ_ONCE(rq->membarrier_state) == membarrier_state)
+ return;
+ WRITE_ONCE(rq->membarrier_state, membarrier_state);
+}
+
 static int membarrier_global_expedited(void)
{
 int cpu;
@@ -114,12 +279,11 @@ static int membarrier_global_expedited(void)
 continue;
 /*
- * Skip the CPU if it runs a kernel thread. The scheduler
- * leaves the prior task mm in place as an optimization when
- * scheduling a kthread.
+ * Skip the CPU if it runs a kernel thread which is not using + * a task mm. */ p = rcu_dereference(cpu_rq(cpu)->curr); - if (p->flags & PF_KTHREAD) + if (!p->mm) continue; __cpumask_set_cpu(cpu, tmpmask); @@ -154,6 +318,7 @@ static int membarrier_private_expedited(int flags, int cpu_id) if (!(atomic_read(&mm->membarrier_state) & MEMBARRIER_STATE_PRIVATE_EXPEDITED_SYNC_CORE_READY)) return -EPERM; + ipi_func = ipi_sync_core; } else if (flags == MEMBARRIER_FLAG_RSEQ) { if (!IS_ENABLED(CONFIG_RSEQ)) return -EINVAL; @@ -168,7 +333,8 @@ static int membarrier_private_expedited(int flags, int cpu_id) return -EPERM; } - if (atomic_read(&mm->mm_users) == 1 || num_online_cpus() == 1) + if (flags != MEMBARRIER_FLAG_SYNC_CORE && + (atomic_read(&mm->mm_users) == 1 || num_online_cpus() == 1)) return 0; /* @@ -187,8 +353,6 @@ static int membarrier_private_expedited(int flags, int cpu_id) if (cpu_id >= nr_cpu_ids || !cpu_online(cpu_id)) goto out; - if (cpu_id == raw_smp_processor_id()) - goto out; rcu_read_lock(); p = rcu_dereference(cpu_rq(cpu_id)->curr); if (!p || p->mm != mm) { @@ -203,16 +367,6 @@ static int membarrier_private_expedited(int flags, int cpu_id) for_each_online_cpu(cpu) { struct task_struct *p; - /* - * Skipping the current CPU is OK even through we can be - * migrated at any point. The current CPU, at the point - * where we read raw_smp_processor_id(), is ensured to - * be in program order with respect to the caller - * thread. Therefore, we can skip this CPU from the - * iteration. - */ - if (cpu == raw_smp_processor_id()) - continue; p = rcu_dereference(cpu_rq(cpu)->curr); if (p && p->mm == mm) __cpumask_set_cpu(cpu, tmpmask); @@ -220,12 +374,38 @@ static int membarrier_private_expedited(int flags, int cpu_id) rcu_read_unlock(); } - preempt_disable(); - if (cpu_id >= 0) + if (cpu_id >= 0) { + /* + * smp_call_function_single() will call ipi_func() if cpu_id + * is the calling CPU. + */ smp_call_function_single(cpu_id, ipi_func, NULL, 1); - else - smp_call_function_many(tmpmask, ipi_func, NULL, 1); - preempt_enable(); + } else { + /* + * For regular membarrier, we can save a few cycles by + * skipping the current cpu -- we're about to do smp_mb() + * below, and if we migrate to a different cpu, this cpu + * and the new cpu will execute a full barrier in the + * scheduler. + * + * For SYNC_CORE, we do need a barrier on the current cpu -- + * otherwise, if we are migrated and replaced by a different + * task in the same mm just before, during, or after + * membarrier, we will end up with some thread in the mm + * running without a core sync. + * + * For RSEQ, don't rseq_preempt() the caller. User code + * is not supposed to issue syscalls at all from inside an + * rseq critical section. 
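+ *
+ * As an editorial illustration of the SYNC_CORE case, a hypothetical
+ * user-space JIT depends on the IPI reaching every CPU that may run
+ * the mm, including the current one (sketch, error handling omitted):
+ *
+ *	memcpy(code, new_insns, len);		// rewrite code
+ *	syscall(__NR_membarrier,
+ *		MEMBARRIER_CMD_PRIVATE_EXPEDITED_SYNC_CORE, 0, 0);
+ *	((void (*)(void))code)();		// now safe to execute
+ *
+ * Skipping the CPU the caller just migrated away from could leave a
+ * sibling thread there running stale instructions, hence
+ * on_each_cpu_mask() for this case.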
+ */ + if (flags != MEMBARRIER_FLAG_SYNC_CORE) { + preempt_disable(); + smp_call_function_many(tmpmask, ipi_func, NULL, true); + preempt_enable(); + } else { + on_each_cpu_mask(tmpmask, ipi_func, NULL, true); + } + } out: if (cpu_id < 0) diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index 49ec096a8aa1..dbe4629cf7ba 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c @@ -89,8 +89,8 @@ void init_rt_rq(struct rt_rq *rt_rq) __set_bit(MAX_RT_PRIO, array->bitmap); #if defined CONFIG_SMP - rt_rq->highest_prio.curr = MAX_RT_PRIO; - rt_rq->highest_prio.next = MAX_RT_PRIO; + rt_rq->highest_prio.curr = MAX_RT_PRIO-1; + rt_rq->highest_prio.next = MAX_RT_PRIO-1; rt_rq->rt_nr_migratory = 0; rt_rq->overloaded = 0; plist_head_init(&rt_rq->pushable_tasks); @@ -161,7 +161,7 @@ void init_tg_rt_entry(struct task_group *tg, struct rt_rq *rt_rq, { struct rq *rq = cpu_rq(cpu); - rt_rq->highest_prio.curr = MAX_RT_PRIO; + rt_rq->highest_prio.curr = MAX_RT_PRIO-1; rt_rq->rt_nr_boosted = 0; rt_rq->rq = rq; rt_rq->tg = tg; @@ -265,7 +265,7 @@ static void pull_rt_task(struct rq *this_rq); static inline bool need_pull_rt_task(struct rq *rq, struct task_struct *prev) { /* Try to pull RT tasks here if we lower this rq's prio */ - return rq->rt.highest_prio.curr > prev->prio; + return rq->online && rq->rt.highest_prio.curr > prev->prio; } static inline int rt_overloaded(struct rq *rq) @@ -393,8 +393,9 @@ static void dequeue_pushable_task(struct rq *rq, struct task_struct *p) p = plist_first_entry(&rq->rt.pushable_tasks, struct task_struct, pushable_tasks); rq->rt.highest_prio.next = p->prio; - } else - rq->rt.highest_prio.next = MAX_RT_PRIO; + } else { + rq->rt.highest_prio.next = MAX_RT_PRIO-1; + } } #else @@ -1147,8 +1148,9 @@ dec_rt_prio(struct rt_rq *rt_rq, int prio) sched_find_first_bit(array->bitmap); } - } else - rt_rq->highest_prio.curr = MAX_RT_PRIO; + } else { + rt_rq->highest_prio.curr = MAX_RT_PRIO-1; + } dec_rt_prio_smp(rt_rq, prio, prev_prio); } @@ -1428,14 +1430,14 @@ static void yield_task_rt(struct rq *rq) static int find_lowest_rq(struct task_struct *task); static int -select_task_rq_rt(struct task_struct *p, int cpu, int sd_flag, int flags) +select_task_rq_rt(struct task_struct *p, int cpu, int flags) { struct task_struct *curr; struct rq *rq; bool test; /* For anything but wake ups, just return the task_cpu */ - if (sd_flag != SD_BALANCE_WAKE && sd_flag != SD_BALANCE_FORK) + if (!(flags & (WF_TTWU | WF_FORK))) goto out; rq = cpu_rq(cpu); @@ -1658,7 +1660,7 @@ static void put_prev_task_rt(struct rq *rq, struct task_struct *p) static int pick_rt_task(struct rq *rq, struct task_struct *p, int cpu) { if (!task_running(rq, p) && - cpumask_test_cpu(cpu, p->cpus_ptr)) + cpumask_test_cpu(cpu, &p->cpus_mask)) return 1; return 0; @@ -1752,8 +1754,8 @@ static int find_lowest_rq(struct task_struct *task) return this_cpu; } - best_cpu = cpumask_first_and(lowest_mask, - sched_domain_span(sd)); + best_cpu = cpumask_any_and_distribute(lowest_mask, + sched_domain_span(sd)); if (best_cpu < nr_cpu_ids) { rcu_read_unlock(); return best_cpu; @@ -1770,7 +1772,7 @@ static int find_lowest_rq(struct task_struct *task) if (this_cpu != -1) return this_cpu; - cpu = cpumask_any(lowest_mask); + cpu = cpumask_any_distribute(lowest_mask); if (cpu < nr_cpu_ids) return cpu; @@ -1811,7 +1813,7 @@ static struct rq *find_lock_lowest_rq(struct task_struct *task, struct rq *rq) * Also make sure that it wasn't scheduled on its rq. 
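 *
 * Editorial note: the test below uses &task->cpus_mask rather than
 * task->cpus_ptr because, while a task runs under migrate_disable(),
 * ->cpus_ptr may temporarily point at a single-CPU mask, whereas
 * ->cpus_mask always holds the user-visible affinity:
 *
 *	if (!cpumask_test_cpu(lowest_rq->cpu, &task->cpus_mask))
 *		;	// @task can never run there; retry or bail out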
*/
 if (unlikely(task_rq(task) != rq ||
- !cpumask_test_cpu(lowest_rq->cpu, task->cpus_ptr) ||
+ !cpumask_test_cpu(lowest_rq->cpu, &task->cpus_mask) ||
 task_running(rq, task) ||
 !rt_task(task) ||
 !task_on_rq_queued(task))) {
@@ -1859,7 +1861,7 @@ static struct task_struct *pick_next_pushable_task(struct rq *rq)
 * running task can migrate over to a CPU that is running a task
 * of lesser priority.
 */
-static int push_rt_task(struct rq *rq)
+static int push_rt_task(struct rq *rq, bool pull)
{
 struct task_struct *next_task;
 struct rq *lowest_rq;
@@ -1873,6 +1875,34 @@ static int push_rt_task(struct rq *rq)
 return 0;
retry:
+ if (is_migration_disabled(next_task)) {
+ struct task_struct *push_task = NULL;
+ int cpu;
+
+ if (!pull || rq->push_busy)
+ return 0;
+
+ cpu = find_lowest_rq(rq->curr);
+ if (cpu == -1 || cpu == rq->cpu)
+ return 0;
+
+ /*
+ * Given that we found a CPU with lower priority than @next_task,
+ * it should be running. However, we cannot migrate it to this
+ * other CPU; instead, attempt to push the task currently running
+ * on this CPU away.
+ */
+ push_task = get_push_task(rq);
+ if (push_task) {
+ raw_spin_unlock(&rq->lock);
+ stop_one_cpu_nowait(rq->cpu, push_cpu_stop,
+ push_task, &rq->push_work);
+ raw_spin_lock(&rq->lock);
+ }
+
+ return 0;
+ }
+
 if (WARN_ON(next_task == rq->curr))
 return 0;
@@ -1927,12 +1957,10 @@ retry:
 deactivate_task(rq, next_task, 0);
 set_task_cpu(next_task, lowest_rq->cpu);
 activate_task(lowest_rq, next_task, 0);
- ret = 1;
- resched_curr(lowest_rq);
+ ret = 1;
 double_unlock_balance(rq, lowest_rq);
-
out:
 put_task_struct(next_task);
@@ -1942,7 +1970,7 @@ out:
 static void push_rt_tasks(struct rq *rq)
{
 /* push_rt_task will return true if it moved an RT task */
- while (push_rt_task(rq))
+ while (push_rt_task(rq, false))
 ;
}
@@ -2095,7 +2123,8 @@ void rto_push_irq_work_func(struct irq_work *work)
 */
 if (has_pushable_tasks(rq)) {
 raw_spin_lock(&rq->lock);
- push_rt_tasks(rq);
+ while (push_rt_task(rq, true))
+ ;
 raw_spin_unlock(&rq->lock);
 }
@@ -2120,7 +2149,7 @@ static void pull_rt_task(struct rq *this_rq)
{
 int this_cpu = this_rq->cpu, cpu;
 bool resched = false;
- struct task_struct *p;
+ struct task_struct *p, *push_task;
 struct rq *src_rq;
 int rt_overload_count = rt_overloaded(this_rq);
@@ -2167,6 +2196,7 @@ static void pull_rt_task(struct rq *this_rq)
 * double_lock_balance, and another CPU could
 * alter this_rq
 */
+ push_task = NULL;
 double_lock_balance(this_rq, src_rq);
 /*
@@ -2194,11 +2224,14 @@ static void pull_rt_task(struct rq *this_rq)
 if (p->prio < src_rq->curr->prio)
 goto skip;
- resched = true;
-
- deactivate_task(src_rq, p, 0);
- set_task_cpu(p, this_cpu);
- activate_task(this_rq, p, 0);
+ if (is_migration_disabled(p)) {
+ push_task = get_push_task(src_rq);
+ } else {
+ deactivate_task(src_rq, p, 0);
+ set_task_cpu(p, this_cpu);
+ activate_task(this_rq, p, 0);
+ resched = true;
+ }
 /*
 * We continue with the search, just in
 * case there's an even higher prio task
@@ -2208,6 +2241,13 @@ static void pull_rt_task(struct rq *this_rq)
 }
skip:
 double_unlock_balance(this_rq, src_rq);
+
+ if (push_task) {
+ raw_spin_unlock(&this_rq->lock);
+ stop_one_cpu_nowait(src_rq->cpu, push_cpu_stop,
+ push_task, &src_rq->push_work);
+ raw_spin_lock(&this_rq->lock);
+ }
 }
 if (resched)
@@ -2429,8 +2469,8 @@ static unsigned int get_rr_interval_rt(struct rq *rq, struct task_struct *task)
 return 0;
}
-const struct sched_class rt_sched_class
- __section("__rt_sched_class") = {
+DEFINE_SCHED_CLASS(rt) = {
+
 .enqueue_task = enqueue_task_rt,
 .dequeue_task =
dequeue_task_rt,
 .yield_task = yield_task_rt,
@@ -2449,6 +2489,7 @@ const struct sched_class rt_sched_class
 .rq_offline = rq_offline_rt,
 .task_woken = task_woken_rt,
 .switched_from = switched_from_rt,
+ .find_lock_rq = find_lock_lowest_rq,
#endif
 .task_tick = task_tick_rt,
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index df80bfcea92e..f5acb6c5ce49 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -67,7 +67,6 @@
 #include <linux/tsacct_kern.h>
 #include <asm/tlb.h>
-#include <asm-generic/vmlinux.lds.h>
 #ifdef CONFIG_PARAVIRT
# include <asm/paravirt.h>
@@ -257,30 +256,6 @@ struct rt_bandwidth {
 void __dl_clear_params(struct task_struct *p);
-/*
- * To keep the bandwidth of -deadline tasks and groups under control
- * we need some place where:
- * - store the maximum -deadline bandwidth of the system (the group);
- * - cache the fraction of that bandwidth that is currently allocated.
- *
- * This is all done in the data structure below. It is similar to the
- * one used for RT-throttling (rt_bandwidth), with the main difference
- * that, since here we are only interested in admission control, we
- * do not decrease any runtime while the group "executes", neither we
- * need a timer to replenish it.
- *
- * With respect to SMP, the bandwidth is given on a per-CPU basis,
- * meaning that:
- * - dl_bw (< 100%) is the bandwidth of the system (group) on each CPU;
- * - dl_total_bw array contains, in the i-eth element, the currently
- * allocated bandwidth on the i-eth CPU.
- * Moreover, groups consume bandwidth on each CPU, while tasks only
- * consume bandwidth on the CPU they're running on.
- * Finally, dl_total_bw_cpu is used to cache the index of dl_total_bw
- * that will be shown the next time the proc or cgroup controls will
- * be red. It on its turn can be changed by writing on its own
- * control.
- */
 struct dl_bandwidth {
 raw_spinlock_t dl_runtime_lock;
 u64 dl_runtime;
@@ -292,6 +267,24 @@ static inline int dl_bandwidth_enabled(void)
 return sysctl_sched_rt_runtime >= 0;
}
+/*
+ * To keep the bandwidth of -deadline tasks under control
+ * we need some place where:
+ * - store the maximum -deadline bandwidth of each cpu;
+ * - cache the fraction of bandwidth that is currently allocated in
+ * each root domain;
+ *
+ * This is all done in the data structure below. It is similar to the
+ * one used for RT-throttling (rt_bandwidth), with the main difference
+ * that, since here we are only interested in admission control, we
+ * do not decrease any runtime while the group "executes", neither we
+ * need a timer to replenish it.
+ *
+ * With respect to SMP, bandwidth is given on a per root domain basis,
+ * meaning that:
+ * - bw (< 100%) is the deadline bandwidth of each CPU;
+ * - total_bw is the currently allocated bandwidth in each root domain;
+ */
 struct dl_bw {
 raw_spinlock_t lock;
 u64 bw;
@@ -801,6 +794,15 @@ struct root_domain {
 struct dl_bw dl_bw;
 struct cpudl cpudl;
+ /*
+ * Indicate whether a root_domain's dl_bw has been checked or
+ * updated. It is a monotonically increasing value.
+ *
+ * Corner cases such as wrap-around are dangerous in principle, but
+ * a u64 is big enough that this should not be a concern in practice.
+ */
+ u64 visit_gen;
+
#ifdef HAVE_RT_PUSH_IPI
 /*
 * For IPI pull requests, loop across the rto_mask.
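 *
 * Editorial sketch of the consumer side (cf. rto_push_irq_work_func()
 * in rt.c above): the IPI handler drains pushable tasks with the
 * 'pull' variant, so migration-disabled tasks can be handled by
 * pushing the currently running task away instead:
 *
 *	raw_spin_lock(&rq->lock);
 *	while (push_rt_task(rq, true))
 *		;
 *	raw_spin_unlock(&rq->lock);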
@@ -973,6 +975,7 @@ struct rq { unsigned long cpu_capacity_orig; struct callback_head *balance_callback; + unsigned char balance_flags; unsigned char nohz_idle_balance; unsigned char idle_balance; @@ -1003,6 +1006,10 @@ struct rq { /* This is used to determine avg_idle's max value */ u64 max_idle_balance_cost; + +#ifdef CONFIG_HOTPLUG_CPU + struct rcuwait hotplug_wait; +#endif #endif /* CONFIG_SMP */ #ifdef CONFIG_IRQ_TIME_ACCOUNTING @@ -1048,6 +1055,12 @@ struct rq { /* Must be inspected within a rcu lock section */ struct cpuidle_state *idle_state; #endif + +#ifdef CONFIG_SMP + unsigned int nr_pinned; +#endif + unsigned int push_busy; + struct cpu_stop_work push_work; }; #ifdef CONFIG_FAIR_GROUP_SCHED @@ -1075,6 +1088,16 @@ static inline int cpu_of(struct rq *rq) #endif } +#define MDF_PUSH 0x01 + +static inline bool is_migration_disabled(struct task_struct *p) +{ +#ifdef CONFIG_SMP + return p->migration_disabled; +#else + return false; +#endif +} #ifdef CONFIG_SCHED_SMT extern void __update_idle_core(struct rq *rq); @@ -1221,6 +1244,9 @@ static inline void rq_pin_lock(struct rq *rq, struct rq_flags *rf) rq->clock_update_flags &= (RQCF_REQ_SKIP|RQCF_ACT_SKIP); rf->clock_update_flags = 0; #endif +#ifdef CONFIG_SMP + SCHED_WARN_ON(rq->balance_callback); +#endif } static inline void rq_unpin_lock(struct rq *rq, struct rq_flags *rf) @@ -1382,6 +1408,9 @@ init_numa_balancing(unsigned long clone_flags, struct task_struct *p) #ifdef CONFIG_SMP +#define BALANCE_WORK 0x01 +#define BALANCE_PUSH 0x02 + static inline void queue_balance_callback(struct rq *rq, struct callback_head *head, @@ -1389,12 +1418,13 @@ queue_balance_callback(struct rq *rq, { lockdep_assert_held(&rq->lock); - if (unlikely(head->next)) + if (unlikely(head->next || (rq->balance_flags & BALANCE_PUSH))) return; head->func = (void (*)(struct callback_head *))func; head->next = rq->balance_callback; rq->balance_callback = head; + rq->balance_flags |= BALANCE_WORK; } #define rcu_dereference_check_sched_domain(p) \ @@ -1714,13 +1744,20 @@ static inline int task_on_rq_migrating(struct task_struct *p) return READ_ONCE(p->on_rq) == TASK_ON_RQ_MIGRATING; } -/* - * wake flags - */ -#define WF_SYNC 0x01 /* Waker goes to sleep after wakeup */ -#define WF_FORK 0x02 /* Child wakeup after fork */ -#define WF_MIGRATED 0x04 /* Internal use, task got migrated */ -#define WF_ON_CPU 0x08 /* Wakee is on_cpu */ +/* Wake flags. 
The first three directly map to some SD flag value */ +#define WF_EXEC 0x02 /* Wakeup after exec; maps to SD_BALANCE_EXEC */ +#define WF_FORK 0x04 /* Wakeup after fork; maps to SD_BALANCE_FORK */ +#define WF_TTWU 0x08 /* Wakeup; maps to SD_BALANCE_WAKE */ + +#define WF_SYNC 0x10 /* Waker goes to sleep after wakeup */ +#define WF_MIGRATED 0x20 /* Internal use, task got migrated */ +#define WF_ON_CPU 0x40 /* Wakee is on_cpu */ + +#ifdef CONFIG_SMP +static_assert(WF_EXEC == SD_BALANCE_EXEC); +static_assert(WF_FORK == SD_BALANCE_FORK); +static_assert(WF_TTWU == SD_BALANCE_WAKE); +#endif /* * To aid in avoiding the subversion of "niceness" due to uneven distribution @@ -1796,16 +1833,19 @@ struct sched_class { #ifdef CONFIG_SMP int (*balance)(struct rq *rq, struct task_struct *prev, struct rq_flags *rf); - int (*select_task_rq)(struct task_struct *p, int task_cpu, int sd_flag, int flags); + int (*select_task_rq)(struct task_struct *p, int task_cpu, int flags); void (*migrate_task_rq)(struct task_struct *p, int new_cpu); void (*task_woken)(struct rq *this_rq, struct task_struct *task); void (*set_cpus_allowed)(struct task_struct *p, - const struct cpumask *newmask); + const struct cpumask *newmask, + u32 flags); void (*rq_online)(struct rq *rq); void (*rq_offline)(struct rq *rq); + + struct rq *(*find_lock_rq)(struct task_struct *p, struct rq *rq); #endif void (*task_tick)(struct rq *rq, struct task_struct *p, int queued); @@ -1833,7 +1873,7 @@ struct sched_class { #ifdef CONFIG_FAIR_GROUP_SCHED void (*task_change_group)(struct task_struct *p, int type); #endif -} __aligned(STRUCT_ALIGNMENT); /* STRUCT_ALIGN(), vmlinux.lds.h */ +}; static inline void put_prev_task(struct rq *rq, struct task_struct *prev) { @@ -1847,6 +1887,20 @@ static inline void set_next_task(struct rq *rq, struct task_struct *next) next->sched_class->set_next_task(rq, next, false); } + +/* + * Helper to define a sched_class instance; each one is placed in a separate + * section which is ordered by the linker script: + * + * include/asm-generic/vmlinux.lds.h + * + * Also enforce alignment on the instance, not the type, to guarantee layout. 
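+ *
+ * For example (editorial), DEFINE_SCHED_CLASS(fair) using the macro
+ * defined just below expands to:
+ *
+ *	const struct sched_class fair_sched_class
+ *		__aligned(__alignof__(struct sched_class))
+ *		__section("__fair_sched_class")
+ *
+ * which lets the linker keep the class instances contiguous and in a
+ * well-defined order.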
+ */
+#define DEFINE_SCHED_CLASS(name) \
+const struct sched_class name##_sched_class \
+ __aligned(__alignof__(struct sched_class)) \
+ __section("__" #name "_sched_class")
+
 /* Defined in include/asm-generic/vmlinux.lds.h */
 extern struct sched_class __begin_sched_classes[];
 extern struct sched_class __end_sched_classes[];
@@ -1889,13 +1943,35 @@ static inline bool sched_fair_runnable(struct rq *rq)
 extern struct task_struct *pick_next_task_fair(struct rq *rq, struct task_struct *prev, struct rq_flags *rf);
 extern struct task_struct *pick_next_task_idle(struct rq *rq);
+#define SCA_CHECK 0x01
+#define SCA_MIGRATE_DISABLE 0x02
+#define SCA_MIGRATE_ENABLE 0x04
+
#ifdef CONFIG_SMP
 extern void update_group_capacity(struct sched_domain *sd, int cpu);
 extern void trigger_load_balance(struct rq *rq);
-extern void set_cpus_allowed_common(struct task_struct *p, const struct cpumask *new_mask);
+extern void set_cpus_allowed_common(struct task_struct *p, const struct cpumask *new_mask, u32 flags);
+
+static inline struct task_struct *get_push_task(struct rq *rq)
+{
+ struct task_struct *p = rq->curr;
+
+ lockdep_assert_held(&rq->lock);
+
+ if (rq->push_busy)
+ return NULL;
+
+ if (p->nr_cpus_allowed == 1)
+ return NULL;
+
+ rq->push_busy = true;
+ return get_task_struct(p);
+}
+
+extern int push_cpu_stop(void *arg);
#endif
diff --git a/kernel/sched/stop_task.c b/kernel/sched/stop_task.c
index ceb5b6b12561..55f39125c0e1 100644
--- a/kernel/sched/stop_task.c
+++ b/kernel/sched/stop_task.c
@@ -11,7 +11,7 @@
#ifdef CONFIG_SMP
 static int
-select_task_rq_stop(struct task_struct *p, int cpu, int sd_flag, int flags)
+select_task_rq_stop(struct task_struct *p, int cpu, int flags)
{
 return task_cpu(p); /* stop tasks never migrate */
}
@@ -109,8 +109,7 @@ static void update_curr_stop(struct rq *rq)
 /*
 * Simple, special scheduling class for the per-CPU stop tasks:
 */
-const struct sched_class stop_sched_class
- __section("__stop_sched_class") = {
+DEFINE_SCHED_CLASS(stop) = {
 .enqueue_task = enqueue_task_stop,
 .dequeue_task = dequeue_task_stop,
diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c
index dd7770226086..5d3675c7a76b 100644
--- a/kernel/sched/topology.c
+++ b/kernel/sched/topology.c
@@ -211,6 +211,15 @@ unsigned int sysctl_sched_energy_aware = 1;
 DEFINE_MUTEX(sched_energy_mutex);
 bool sched_energy_update;
+void rebuild_sched_domains_energy(void)
+{
+ mutex_lock(&sched_energy_mutex);
+ sched_energy_update = true;
+ rebuild_sched_domains();
+ sched_energy_update = false;
+ mutex_unlock(&sched_energy_mutex);
+}
+
#ifdef CONFIG_PROC_SYSCTL
 int sched_energy_aware_handler(struct ctl_table *table, int write,
 void *buffer, size_t *lenp, loff_t *ppos)
@@ -223,13 +232,8 @@ int sched_energy_aware_handler(struct ctl_table *table, int write,
 ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
 if (!ret && write) {
 state = static_branch_unlikely(&sched_energy_present);
- if (state != sysctl_sched_energy_aware) {
- mutex_lock(&sched_energy_mutex);
- sched_energy_update = 1;
- rebuild_sched_domains();
- sched_energy_update = 0;
- mutex_unlock(&sched_energy_mutex);
- }
+ if (state != sysctl_sched_energy_aware)
+ rebuild_sched_domains_energy();
 }
 return ret;
@@ -324,6 +328,7 @@ static void sched_energy_set(bool has_eas)
 * 3. no SMT is detected.
 * 4. the EM complexity is low enough to keep scheduling overheads low;
 * 5. schedutil is driving the frequency of all CPUs of the rd;
+ * 6.
frequency invariance support is present; * * The complexity of the Energy Model is defined as: * @@ -372,6 +377,14 @@ static bool build_perf_domains(const struct cpumask *cpu_map) goto free; } + if (!arch_scale_freq_invariant()) { + if (sched_debug()) { + pr_warn("rd %*pbl: Disabling EAS: frequency-invariant load tracking not yet supported", + cpumask_pr_args(cpu_map)); + } + goto free; + } + for_each_cpu(i, cpu_map) { /* Skip already covered CPUs. */ if (find_pd(pd, i)) @@ -516,6 +529,7 @@ static int init_rootdomain(struct root_domain *rd) init_irq_work(&rd->rto_push_work, rto_push_irq_work_func); #endif + rd->visit_gen = 0; init_dl_bw(&rd->dl_bw); if (cpudl_init(&rd->cpudl) != 0) goto free_rto_mask; @@ -674,6 +688,7 @@ cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu) { struct rq *rq = cpu_rq(cpu); struct sched_domain *tmp; + int numa_distance = 0; /* Remove the sched domains which do not contribute to scheduling. */ for (tmp = sd; tmp; ) { @@ -705,6 +720,38 @@ cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu) sd->child = NULL; } + for (tmp = sd; tmp; tmp = tmp->parent) + numa_distance += !!(tmp->flags & SD_NUMA); + + /* + * FIXME: Diameter >=3 is misrepresented. + * + * Smallest diameter=3 topology is: + * + * node 0 1 2 3 + * 0: 10 20 30 40 + * 1: 20 10 20 30 + * 2: 30 20 10 20 + * 3: 40 30 20 10 + * + * 0 --- 1 --- 2 --- 3 + * + * NUMA-3 0-3 N/A N/A 0-3 + * groups: {0-2},{1-3} {1-3},{0-2} + * + * NUMA-2 0-2 0-3 0-3 1-3 + * groups: {0-1},{1-3} {0-2},{2-3} {1-3},{0-1} {2-3},{0-2} + * + * NUMA-1 0-1 0-2 1-3 2-3 + * groups: {0},{1} {1},{2},{0} {2},{3},{1} {3},{2} + * + * NUMA-0 0 1 2 3 + * + * The NUMA-2 groups for nodes 0 and 3 are obviously buggered, as the + * group span isn't a subset of the domain span. + */ + WARN_ONCE(numa_distance > 2, "Shortest NUMA path spans too many nodes\n"); + sched_domain_debug(sd, cpu); rq_attach_root(rq, rd); diff --git a/kernel/scs.c b/kernel/scs.c index 4ff4a7ba0094..e2a71fc82fa0 100644 --- a/kernel/scs.c +++ b/kernel/scs.c @@ -5,26 +5,49 @@ * Copyright (C) 2019 Google LLC */ +#include <linux/cpuhotplug.h> #include <linux/kasan.h> #include <linux/mm.h> #include <linux/scs.h> -#include <linux/slab.h> +#include <linux/vmalloc.h> #include <linux/vmstat.h> -static struct kmem_cache *scs_cache; - static void __scs_account(void *s, int account) { - struct page *scs_page = virt_to_page(s); + struct page *scs_page = vmalloc_to_page(s); mod_node_page_state(page_pgdat(scs_page), NR_KERNEL_SCS_KB, account * (SCS_SIZE / SZ_1K)); } -static void *scs_alloc(int node) +/* Matches NR_CACHED_STACKS for VMAP_STACK */ +#define NR_CACHED_SCS 2 +static DEFINE_PER_CPU(void *, scs_cache[NR_CACHED_SCS]); + +static void *__scs_alloc(int node) { - void *s = kmem_cache_alloc_node(scs_cache, GFP_SCS, node); + int i; + void *s; + + for (i = 0; i < NR_CACHED_SCS; i++) { + s = this_cpu_xchg(scs_cache[i], NULL); + if (s) { + kasan_unpoison_vmalloc(s, SCS_SIZE); + memset(s, 0, SCS_SIZE); + return s; + } + } + + return __vmalloc_node_range(SCS_SIZE, 1, VMALLOC_START, VMALLOC_END, + GFP_SCS, PAGE_KERNEL, 0, node, + __builtin_return_address(0)); +} +void *scs_alloc(int node) +{ + void *s; + + s = __scs_alloc(node); if (!s) return NULL; @@ -34,21 +57,47 @@ static void *scs_alloc(int node) * Poison the allocation to catch unintentional accesses to * the shadow stack when KASAN is enabled. 
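 *
 * Editorial summary of the surrounding lifecycle, using only the
 * helpers visible in this hunk:
 *
 *	s = __scs_alloc(node);		// per-CPU cache hit, else vmalloc
 *	kasan_poison_vmalloc(s, SCS_SIZE);
 *	__scs_account(s, 1);
 *	...				// used as the shadow call stack
 *	scs_free(s);			// recache via this_cpu_cmpxchg(),
 *					// else vfree_atomic()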
*/ - kasan_poison_object_data(scs_cache, s); + kasan_poison_vmalloc(s, SCS_SIZE); __scs_account(s, 1); return s; } -static void scs_free(void *s) +void scs_free(void *s) { + int i; + __scs_account(s, -1); - kasan_unpoison_object_data(scs_cache, s); - kmem_cache_free(scs_cache, s); + + /* + * We cannot sleep as this can be called in interrupt context, + * so use this_cpu_cmpxchg to update the cache, and vfree_atomic + * to free the stack. + */ + + for (i = 0; i < NR_CACHED_SCS; i++) + if (this_cpu_cmpxchg(scs_cache[i], 0, s) == NULL) + return; + + vfree_atomic(s); +} + +static int scs_cleanup(unsigned int cpu) +{ + int i; + void **cache = per_cpu_ptr(scs_cache, cpu); + + for (i = 0; i < NR_CACHED_SCS; i++) { + vfree(cache[i]); + cache[i] = NULL; + } + + return 0; } void __init scs_init(void) { - scs_cache = kmem_cache_create("scs_cache", SCS_SIZE, 0, 0, NULL); + cpuhp_setup_state(CPUHP_BP_PREPARE_DYN, "scs:scs_cache", NULL, + scs_cleanup); } int scs_prepare(struct task_struct *tsk, int node) diff --git a/kernel/seccomp.c b/kernel/seccomp.c index 8ad7a293255a..15f47fc11d13 100644 --- a/kernel/seccomp.c +++ b/kernel/seccomp.c @@ -38,7 +38,7 @@ #include <linux/filter.h> #include <linux/pid.h> #include <linux/ptrace.h> -#include <linux/security.h> +#include <linux/capability.h> #include <linux/tracehook.h> #include <linux/uaccess.h> #include <linux/anon_inodes.h> @@ -356,14 +356,14 @@ static inline void seccomp_assign_mode(struct task_struct *task, task->seccomp.mode = seccomp_mode; /* - * Make sure TIF_SECCOMP cannot be set before the mode (and + * Make sure SYSCALL_WORK_SECCOMP cannot be set before the mode (and * filter) is set. */ smp_mb__before_atomic(); /* Assume default seccomp processes want spec flaw mitigation. */ if ((flags & SECCOMP_FILTER_FLAG_SPEC_ALLOW) == 0) arch_seccomp_spec_mitigate(task); - set_tsk_thread_flag(task, TIF_SECCOMP); + set_task_syscall_work(task, SECCOMP); } #ifdef CONFIG_SECCOMP_FILTER @@ -558,8 +558,7 @@ static struct seccomp_filter *seccomp_prepare_filter(struct sock_fprog *fprog) * behavior of privileged children. */ if (!task_no_new_privs(current) && - security_capable(current_cred(), current_user_ns(), - CAP_SYS_ADMIN, CAP_OPT_NOAUDIT) != 0) + !ns_capable_noaudit(current_user_ns(), CAP_SYS_ADMIN)) return ERR_PTR(-EACCES); /* Allocate a new seccomp_filter */ @@ -929,7 +928,7 @@ static int __seccomp_filter(int this_syscall, const struct seccomp_data *sd, /* * Make sure that any changes to mode from another thread have - * been seen after TIF_SECCOMP was seen. + * been seen after SYSCALL_WORK_SECCOMP was seen. 
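+ *
+ * (Editorial) This read barrier pairs with the write side in
+ * seccomp_assign_mode() shown earlier:
+ *
+ *	task->seccomp.mode = seccomp_mode;
+ *	smp_mb__before_atomic();
+ *	set_task_syscall_work(task, SECCOMP);
+ *
+ * so a task seen with SYSCALL_WORK_SECCOMP set is guaranteed to also
+ * see the mode and filter.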
*/ rmb(); diff --git a/kernel/signal.c b/kernel/signal.c index a38b3edc6851..c37170655171 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -391,16 +391,17 @@ static bool task_participate_group_stop(struct task_struct *task) void task_join_group_stop(struct task_struct *task) { + unsigned long mask = current->jobctl & JOBCTL_STOP_SIGMASK; + struct signal_struct *sig = current->signal; + + if (sig->group_stop_count) { + sig->group_stop_count++; + mask |= JOBCTL_STOP_CONSUME; + } else if (!(sig->flags & SIGNAL_STOP_STOPPED)) + return; + /* Have the new thread join an on-going signal group stop */ - unsigned long jobctl = current->jobctl; - if (jobctl & JOBCTL_STOP_PENDING) { - struct signal_struct *sig = current->signal; - unsigned long signr = jobctl & JOBCTL_STOP_SIGMASK; - unsigned long gstop = JOBCTL_STOP_PENDING | JOBCTL_STOP_CONSUME; - if (task_set_jobctl_pending(task, signr | gstop)) { - sig->group_stop_count++; - } - } + task_set_jobctl_pending(task, mask | JOBCTL_STOP_PENDING); } /* @@ -983,7 +984,7 @@ static inline bool wants_signal(int sig, struct task_struct *p) if (task_is_stopped_or_traced(p)) return false; - return task_curr(p) || !signal_pending(p); + return task_curr(p) || !task_sigpending(p); } static void complete_signal(int sig, struct task_struct *p, enum pid_type type) @@ -2523,12 +2524,46 @@ static int ptrace_signal(int signr, kernel_siginfo_t *info) return signr; } +static void hide_si_addr_tag_bits(struct ksignal *ksig) +{ + switch (siginfo_layout(ksig->sig, ksig->info.si_code)) { + case SIL_FAULT: + case SIL_FAULT_MCEERR: + case SIL_FAULT_BNDERR: + case SIL_FAULT_PKUERR: + ksig->info.si_addr = arch_untagged_si_addr( + ksig->info.si_addr, ksig->sig, ksig->info.si_code); + break; + case SIL_KILL: + case SIL_TIMER: + case SIL_POLL: + case SIL_CHLD: + case SIL_RT: + case SIL_SYS: + break; + } +} + bool get_signal(struct ksignal *ksig) { struct sighand_struct *sighand = current->sighand; struct signal_struct *signal = current->signal; int signr; + /* + * For non-generic architectures, check for TIF_NOTIFY_SIGNAL so + * that the arch handlers don't all have to do it. If we get here + * without TIF_SIGPENDING, just exit after running signal work. + */ +#ifdef TIF_NOTIFY_SIGNAL + if (!IS_ENABLED(CONFIG_GENERIC_ENTRY)) { + if (test_thread_flag(TIF_NOTIFY_SIGNAL)) + tracehook_notify_signal(); + if (!task_sigpending(current)) + return false; + } +#endif + if (unlikely(uprobe_deny_signal())) return false; @@ -2760,6 +2795,10 @@ relock: spin_unlock_irq(&sighand->siglock); ksig->sig = signr; + + if (!(ksig->ka.sa.sa_flags & SA_EXPOSE_TAGBITS)) + hide_si_addr_tag_bits(ksig); + return ksig->sig > 0; } @@ -2822,7 +2861,7 @@ static void retarget_shared_pending(struct task_struct *tsk, sigset_t *which) /* Remove the signals this thread can handle. */ sigandsets(&retarget, &retarget, &t->blocked); - if (!signal_pending(t)) + if (!task_sigpending(t)) signal_wake_up(t, 0); if (sigisemptyset(&retarget)) @@ -2856,7 +2895,7 @@ void exit_signals(struct task_struct *tsk) cgroup_threadgroup_change_end(tsk); - if (!signal_pending(tsk)) + if (!task_sigpending(tsk)) goto out; unblocked = tsk->blocked; @@ -2900,7 +2939,7 @@ long do_no_restart_syscall(struct restart_block *param) static void __set_task_blocked(struct task_struct *tsk, const sigset_t *newset) { - if (signal_pending(tsk) && !thread_group_empty(tsk)) { + if (task_sigpending(tsk) && !thread_group_empty(tsk)) { sigset_t newblocked; /* A set of now blocked but previously unblocked signals. 
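 *
 * (Editorial note) In set terms: newblocked = newset & ~blocked,
 * i.e. the signals being blocked now that were not blocked before;
 * those may need retargeting to a thread that can still handle them.
 * A sketch of the pair, assuming the usual helper semantics
 * (sigandnsets(d, a, b) computes d = a & ~b):
 *
 *	sigandnsets(&newblocked, newset, &current->blocked);
 *	retarget_shared_pending(tsk, &newblocked);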
*/
 sigandnsets(&newblocked, newset, &current->blocked);
@@ -3984,6 +4023,22 @@ int do_sigaction(int sig, struct k_sigaction *act, struct k_sigaction *oact)
 if (oact)
 *oact = *k;
+ /*
+ * Make sure that we never accidentally claim to support SA_UNSUPPORTED,
+ * e.g. by having an architecture use the bit in its uapi.
+ */
+ BUILD_BUG_ON(UAPI_SA_FLAGS & SA_UNSUPPORTED);
+
+ /*
+ * Clear unknown flag bits in order to allow userspace to detect missing
+ * support for flag bits and to allow the kernel to use non-uapi bits
+ * internally.
+ */
+ if (act)
+ act->sa.sa_flags &= UAPI_SA_FLAGS;
+ if (oact)
+ oact->sa.sa_flags &= UAPI_SA_FLAGS;
+
 sigaction_compat_abi(act, oact);
 if (act) {
diff --git a/kernel/smp.c b/kernel/smp.c
index 4d17501433be..1b6070bf97bb 100644
--- a/kernel/smp.c
+++ b/kernel/smp.c
@@ -27,7 +27,7 @@
 #include "smpboot.h"
 #include "sched/smp.h"
-#define CSD_TYPE(_csd) ((_csd)->flags & CSD_FLAG_TYPE_MASK)
+#define CSD_TYPE(_csd) ((_csd)->node.u_flags & CSD_FLAG_TYPE_MASK)
 struct call_function_data {
 call_single_data_t __percpu *csd;
@@ -130,7 +130,7 @@ static __always_inline int csd_lock_wait_getcpu(call_single_data_t *csd)
 csd_type = CSD_TYPE(csd);
 if (csd_type == CSD_TYPE_ASYNC || csd_type == CSD_TYPE_SYNC)
- return csd->dst; /* Other CSD_TYPE_ values might not have ->dst. */
+ return csd->node.dst; /* Other CSD_TYPE_ values might not have ->dst. */
 return -1;
}
@@ -146,7 +146,7 @@ static __always_inline bool csd_lock_wait_toolong(call_single_data_t *csd, u64 t
 bool firsttime;
 u64 ts2, ts_delta;
 call_single_data_t *cpu_cur_csd;
- unsigned int flags = READ_ONCE(csd->flags);
+ unsigned int flags = READ_ONCE(csd->node.u_flags);
 if (!(flags & CSD_FLAG_LOCK)) {
 if (!unlikely(*bug_id))
@@ -224,14 +224,14 @@ static void csd_lock_record(call_single_data_t *csd)
 static __always_inline void csd_lock_wait(call_single_data_t *csd)
{
- smp_cond_load_acquire(&csd->flags, !(VAL & CSD_FLAG_LOCK));
+ smp_cond_load_acquire(&csd->node.u_flags, !(VAL & CSD_FLAG_LOCK));
}
#endif
 static __always_inline void csd_lock(call_single_data_t *csd)
{
 csd_lock_wait(csd);
- csd->flags |= CSD_FLAG_LOCK;
+ csd->node.u_flags |= CSD_FLAG_LOCK;
 /*
 * prevent CPU from reordering the above assignment
@@ -243,12 +243,12 @@ static __always_inline void csd_lock(call_single_data_t *csd)
 static __always_inline void csd_unlock(call_single_data_t *csd)
{
- WARN_ON(!(csd->flags & CSD_FLAG_LOCK));
+ WARN_ON(!(csd->node.u_flags & CSD_FLAG_LOCK));
 /*
 * ensure we're all done before releasing data:
 */
- smp_store_release(&csd->flags, 0);
+ smp_store_release(&csd->node.u_flags, 0);
}
 static DEFINE_PER_CPU_SHARED_ALIGNED(call_single_data_t, csd_data);
@@ -300,7 +300,7 @@ static int generic_exec_single(int cpu, call_single_data_t *csd)
 return -ENXIO;
 }
- __smp_call_single_queue(cpu, &csd->llist);
+ __smp_call_single_queue(cpu, &csd->node.llist);
 return 0;
}
@@ -353,7 +353,7 @@ static void flush_smp_call_function_queue(bool warn_cpu_offline)
 * We don't have to use the _safe() variant here
 * because we are not invoking the IPI handlers yet.
 */
- llist_for_each_entry(csd, entry, llist) {
+ llist_for_each_entry(csd, entry, node.llist) {
 switch (CSD_TYPE(csd)) {
 case CSD_TYPE_ASYNC:
 case CSD_TYPE_SYNC:
@@ -378,16 +378,16 @@ static void flush_smp_call_function_queue(bool warn_cpu_offline)
 * First; run all SYNC callbacks, people are waiting for us.
 */
 prev = NULL;
- llist_for_each_entry_safe(csd, csd_next, entry, llist) {
+ llist_for_each_entry_safe(csd, csd_next, entry, node.llist) {
 /* Do we wait until *after* callback?
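 *
 * (Editorial) For CSD_TYPE_SYNC, yes: the submitter spins in
 * csd_lock_wait() on CSD_FLAG_LOCK, so the function must run before
 * the descriptor is released:
 *
 *	func(info);
 *	csd_unlock(csd);	// smp_store_release(&csd->node.u_flags, 0)
 *
 * which pairs with the smp_cond_load_acquire() in csd_lock_wait().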
*/ if (CSD_TYPE(csd) == CSD_TYPE_SYNC) { smp_call_func_t func = csd->func; void *info = csd->info; if (prev) { - prev->next = &csd_next->llist; + prev->next = &csd_next->node.llist; } else { - entry = &csd_next->llist; + entry = &csd_next->node.llist; } csd_lock_record(csd); @@ -395,7 +395,7 @@ static void flush_smp_call_function_queue(bool warn_cpu_offline) csd_unlock(csd); csd_lock_record(NULL); } else { - prev = &csd->llist; + prev = &csd->node.llist; } } @@ -406,14 +406,14 @@ static void flush_smp_call_function_queue(bool warn_cpu_offline) * Second; run all !SYNC callbacks. */ prev = NULL; - llist_for_each_entry_safe(csd, csd_next, entry, llist) { + llist_for_each_entry_safe(csd, csd_next, entry, node.llist) { int type = CSD_TYPE(csd); if (type != CSD_TYPE_TTWU) { if (prev) { - prev->next = &csd_next->llist; + prev->next = &csd_next->node.llist; } else { - entry = &csd_next->llist; + entry = &csd_next->node.llist; } if (type == CSD_TYPE_ASYNC) { @@ -429,7 +429,7 @@ static void flush_smp_call_function_queue(bool warn_cpu_offline) } } else { - prev = &csd->llist; + prev = &csd->node.llist; } } @@ -465,7 +465,7 @@ int smp_call_function_single(int cpu, smp_call_func_t func, void *info, { call_single_data_t *csd; call_single_data_t csd_stack = { - .flags = CSD_FLAG_LOCK | CSD_TYPE_SYNC, + .node = { .u_flags = CSD_FLAG_LOCK | CSD_TYPE_SYNC, }, }; int this_cpu; int err; @@ -502,8 +502,8 @@ int smp_call_function_single(int cpu, smp_call_func_t func, void *info, csd->func = func; csd->info = info; #ifdef CONFIG_CSD_LOCK_WAIT_DEBUG - csd->src = smp_processor_id(); - csd->dst = cpu; + csd->node.src = smp_processor_id(); + csd->node.dst = cpu; #endif err = generic_exec_single(cpu, csd); @@ -544,12 +544,12 @@ int smp_call_function_single_async(int cpu, call_single_data_t *csd) preempt_disable(); - if (csd->flags & CSD_FLAG_LOCK) { + if (csd->node.u_flags & CSD_FLAG_LOCK) { err = -EBUSY; goto out; } - csd->flags = CSD_FLAG_LOCK; + csd->node.u_flags = CSD_FLAG_LOCK; smp_wmb(); err = generic_exec_single(cpu, csd); @@ -667,14 +667,14 @@ static void smp_call_function_many_cond(const struct cpumask *mask, csd_lock(csd); if (wait) - csd->flags |= CSD_TYPE_SYNC; + csd->node.u_flags |= CSD_TYPE_SYNC; csd->func = func; csd->info = info; #ifdef CONFIG_CSD_LOCK_WAIT_DEBUG - csd->src = smp_processor_id(); - csd->dst = cpu; + csd->node.src = smp_processor_id(); + csd->node.dst = cpu; #endif - if (llist_add(&csd->llist, &per_cpu(call_single_queue, cpu))) + if (llist_add(&csd->node.llist, &per_cpu(call_single_queue, cpu))) __cpumask_set_cpu(cpu, cfd->cpumask_ipi); } diff --git a/kernel/softirq.c b/kernel/softirq.c index 09229ad82209..d5bfd5e661fc 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c @@ -92,6 +92,13 @@ static bool ksoftirqd_running(unsigned long pending) !__kthread_should_park(tsk); } +#ifdef CONFIG_TRACE_IRQFLAGS +DEFINE_PER_CPU(int, hardirqs_enabled); +DEFINE_PER_CPU(int, hardirq_context); +EXPORT_PER_CPU_SYMBOL_GPL(hardirqs_enabled); +EXPORT_PER_CPU_SYMBOL_GPL(hardirq_context); +#endif + /* * preempt_count and SOFTIRQ_OFFSET usage: * - preempt_count is changed by SOFTIRQ_OFFSET on entering or leaving @@ -102,17 +109,11 @@ static bool ksoftirqd_running(unsigned long pending) * softirq and whether we just have bh disabled. 
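 *
 * (Editorial) The distinction is what the standard helpers read back
 * out of preempt_count(), e.g.:
 *
 *	in_serving_softirq()	// SOFTIRQ_OFFSET set: currently processing a softirq
 *	in_softirq()		// any softirq count: serving, or just bh disabled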
*/
+#ifdef CONFIG_TRACE_IRQFLAGS
 /*
- * This one is for softirq.c-internal use,
- * where hardirqs are disabled legitimately:
+ * This is for softirq.c-internal use, where hardirqs are disabled
+ * legitimately:
 */
-#ifdef CONFIG_TRACE_IRQFLAGS
-
-DEFINE_PER_CPU(int, hardirqs_enabled);
-DEFINE_PER_CPU(int, hardirq_context);
-EXPORT_PER_CPU_SYMBOL_GPL(hardirqs_enabled);
-EXPORT_PER_CPU_SYMBOL_GPL(hardirq_context);
-
 void __local_bh_disable_ip(unsigned long ip, unsigned int cnt)
{
 unsigned long flags;
@@ -203,6 +204,50 @@ void __local_bh_enable_ip(unsigned long ip, unsigned int cnt)
}
EXPORT_SYMBOL(__local_bh_enable_ip);
+static inline void invoke_softirq(void)
+{
+ if (ksoftirqd_running(local_softirq_pending()))
+ return;
+
+ if (!force_irqthreads) {
+#ifdef CONFIG_HAVE_IRQ_EXIT_ON_IRQ_STACK
+ /*
+ * We can safely execute softirq on the current stack if
+ * it is the irq stack, because it should be near empty
+ * at this stage.
+ */
+ __do_softirq();
+#else
+ /*
+ * Otherwise, irq_exit() is called on the task stack that can
+ * be potentially deep already. So call softirq in its own stack
+ * to prevent any overrun.
+ */
+ do_softirq_own_stack();
+#endif
+ } else {
+ wakeup_softirqd();
+ }
+}
+
+asmlinkage __visible void do_softirq(void)
+{
+ __u32 pending;
+ unsigned long flags;
+
+ if (in_interrupt())
+ return;
+
+ local_irq_save(flags);
+
+ pending = local_softirq_pending();
+
+ if (pending && !ksoftirqd_running(pending))
+ do_softirq_own_stack();
+
+ local_irq_restore(flags);
+}
+
 /*
 * We restart softirq processing for at most MAX_SOFTIRQ_RESTART times,
 * but break the loop if need_resched() is set or after 2 ms.
@@ -270,10 +315,10 @@ asmlinkage __visible void __softirq_entry __do_softirq(void)
 current->flags &= ~PF_MEMALLOC;
 pending = local_softirq_pending();
- account_irq_enter_time(current);
 __local_bh_disable_ip(_RET_IP_, SOFTIRQ_OFFSET);
 in_hardirq = lockdep_softirq_start();
+ account_softirq_enter(current);
restart:
 /* Reset the pending bitmask before enabling irqs */
@@ -320,46 +365,24 @@ restart:
 wakeup_softirqd();
 }
+ account_softirq_exit(current);
 lockdep_softirq_end(in_hardirq);
- account_irq_exit_time(current);
 __local_bh_enable(SOFTIRQ_OFFSET);
 WARN_ON_ONCE(in_interrupt());
 current_restore_flags(old_flags, PF_MEMALLOC);
}
-asmlinkage __visible void do_softirq(void)
-{
- __u32 pending;
- unsigned long flags;
-
- if (in_interrupt())
- return;
-
- local_irq_save(flags);
-
- pending = local_softirq_pending();
-
- if (pending && !ksoftirqd_running(pending))
- do_softirq_own_stack();
-
- local_irq_restore(flags);
-}
-
 /**
 * irq_enter_rcu - Enter an interrupt context with RCU watching
 */
void irq_enter_rcu(void)
{
- if (is_idle_task(current) && !in_interrupt()) {
- /*
- * Prevent raise_softirq from needlessly waking up ksoftirqd
- * here, as softirq will be serviced on return from interrupt.
- */
- local_bh_disable();
+ __irq_enter_raw();
+
+ if (is_idle_task(current) && (irq_count() == HARDIRQ_OFFSET))
 tick_irq_enter();
- _local_bh_enable();
- }
- __irq_enter();
+
+ account_hardirq_enter(current);
}
 /**
@@ -371,32 +394,6 @@ void irq_enter(void)
 irq_enter_rcu();
}
-static inline void invoke_softirq(void)
-{
- if (ksoftirqd_running(local_softirq_pending()))
- return;
-
- if (!force_irqthreads) {
-#ifdef CONFIG_HAVE_IRQ_EXIT_ON_IRQ_STACK
- /*
- * We can safely execute softirq on the current stack if
- * it is the irq stack, because it should be near empty
- * at this stage.
- */ - __do_softirq(); -#else - /* - * Otherwise, irq_exit() is called on the task stack that can - * be potentially deep already. So call softirq in its own stack - * to prevent from any overrun. - */ - do_softirq_own_stack(); -#endif - } else { - wakeup_softirqd(); - } -} - static inline void tick_irq_exit(void) { #ifdef CONFIG_NO_HZ_COMMON @@ -417,7 +414,7 @@ static inline void __irq_exit_rcu(void) #else lockdep_assert_irqs_disabled(); #endif - account_irq_exit_time(current); + account_hardirq_exit(current); preempt_count_sub(HARDIRQ_OFFSET); if (!in_interrupt() && local_softirq_pending()) invoke_softirq(); diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c index 865bb0228ab6..971d8acceaec 100644 --- a/kernel/stop_machine.c +++ b/kernel/stop_machine.c @@ -42,11 +42,27 @@ struct cpu_stopper { struct list_head works; /* list of pending works */ struct cpu_stop_work stop_work; /* for stop_cpus */ + unsigned long caller; + cpu_stop_fn_t fn; }; static DEFINE_PER_CPU(struct cpu_stopper, cpu_stopper); static bool stop_machine_initialized = false; +void print_stop_info(const char *log_lvl, struct task_struct *task) +{ + /* + * If @task is a stopper task, it cannot migrate and task_cpu() is + * stable. + */ + struct cpu_stopper *stopper = per_cpu_ptr(&cpu_stopper, task_cpu(task)); + + if (task != stopper->thread) + return; + + printk("%sStopper: %pS <- %pS\n", log_lvl, stopper->fn, (void *)stopper->caller); +} + /* static data for stop_cpus */ static DEFINE_MUTEX(stop_cpus_mutex); static bool stop_cpus_in_progress; @@ -123,7 +139,7 @@ static bool cpu_stop_queue_work(unsigned int cpu, struct cpu_stop_work *work) int stop_one_cpu(unsigned int cpu, cpu_stop_fn_t fn, void *arg) { struct cpu_stop_done done; - struct cpu_stop_work work = { .fn = fn, .arg = arg, .done = &done }; + struct cpu_stop_work work = { .fn = fn, .arg = arg, .done = &done, .caller = _RET_IP_ }; cpu_stop_init_done(&done, 1); if (!cpu_stop_queue_work(cpu, &work)) @@ -178,7 +194,7 @@ static void ack_state(struct multi_stop_data *msdata) set_state(msdata, msdata->state + 1); } -void __weak stop_machine_yield(const struct cpumask *cpumask) +notrace void __weak stop_machine_yield(const struct cpumask *cpumask) { cpu_relax(); } @@ -331,7 +347,8 @@ int stop_two_cpus(unsigned int cpu1, unsigned int cpu2, cpu_stop_fn_t fn, void * work1 = work2 = (struct cpu_stop_work){ .fn = multi_cpu_stop, .arg = &msdata, - .done = &done + .done = &done, + .caller = _RET_IP_, }; cpu_stop_init_done(&done, 2); @@ -367,7 +384,7 @@ int stop_two_cpus(unsigned int cpu1, unsigned int cpu2, cpu_stop_fn_t fn, void * bool stop_one_cpu_nowait(unsigned int cpu, cpu_stop_fn_t fn, void *arg, struct cpu_stop_work *work_buf) { - *work_buf = (struct cpu_stop_work){ .fn = fn, .arg = arg, }; + *work_buf = (struct cpu_stop_work){ .fn = fn, .arg = arg, .caller = _RET_IP_, }; return cpu_stop_queue_work(cpu, work_buf); } @@ -487,6 +504,8 @@ repeat: int ret; /* cpu stop callbacks must not sleep, make in_atomic() == T */ + stopper->caller = work->caller; + stopper->fn = fn; preempt_count_inc(); ret = fn(arg); if (done) { @@ -495,6 +514,8 @@ repeat: cpu_stop_signal_done(done); } preempt_count_dec(); + stopper->fn = NULL; + stopper->caller = 0; WARN_ONCE(preempt_count(), "cpu_stop: %ps(%p) leaked preempt count\n", fn, arg); goto repeat; diff --git a/kernel/sys.c b/kernel/sys.c index a730c03ee607..51f00fe20e4d 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -42,6 +42,7 @@ #include <linux/syscore_ops.h> #include <linux/version.h> #include <linux/ctype.h> +#include 
<linux/syscall_user_dispatch.h> #include <linux/compat.h> #include <linux/syscalls.h> @@ -2530,6 +2531,10 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3, error = (current->flags & PR_IO_FLUSHER) == PR_IO_FLUSHER; break; + case PR_SET_SYSCALL_USER_DISPATCH: + error = set_syscall_user_dispatch(arg2, arg3, arg4, + (char __user *) arg5); + break; default: error = -EINVAL; break; diff --git a/kernel/sysctl.c b/kernel/sysctl.c index afad085960b8..c9fbdd848138 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -2650,6 +2650,17 @@ static struct ctl_table kern_table[] = { .extra2 = SYSCTL_ONE, }, #endif +#if defined(CONFIG_TREE_RCU) + { + .procname = "max_rcu_stall_to_panic", + .data = &sysctl_max_rcu_stall_to_panic, + .maxlen = sizeof(sysctl_max_rcu_stall_to_panic), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = SYSCTL_ONE, + .extra2 = SYSCTL_INT_MAX, + }, +#endif #ifdef CONFIG_STACKLEAK_RUNTIME_DISABLE { .procname = "stack_erasing", diff --git a/kernel/task_work.c b/kernel/task_work.c index 8d6e1217c451..15b087286bea 100644 --- a/kernel/task_work.c +++ b/kernel/task_work.c @@ -5,6 +5,34 @@ static struct callback_head work_exited; /* all we need is ->next == NULL */ +/* + * TWA_SIGNAL signaling - use TIF_NOTIFY_SIGNAL, if available, as it's faster + * than TIF_SIGPENDING as there's no dependency on ->sighand. The latter is + * shared for threads, and can cause contention on sighand->lock. Even for + * the non-threaded case TIF_NOTIFY_SIGNAL is more efficient, as no locking + * or IRQ disabling is involved for notification (or running) purposes. + */ +static void task_work_notify_signal(struct task_struct *task) +{ +#if defined(TIF_NOTIFY_SIGNAL) + set_notify_signal(task); +#else + unsigned long flags; + + /* + * Only grab the sighand lock if we don't already have some + * task_work pending. This pairs with the smp_store_mb() + * in get_signal(), see comment there. + */ + if (!(READ_ONCE(task->jobctl) & JOBCTL_TASK_WORK) && + lock_task_sighand(task, &flags)) { + task->jobctl |= JOBCTL_TASK_WORK; + signal_wake_up(task, 0); + unlock_task_sighand(task, &flags); + } +#endif +} + /** * task_work_add - ask the @task to execute @work->func() * @task: the task which should run the callback @@ -33,7 +61,6 @@ int task_work_add(struct task_struct *task, struct callback_head *work, enum task_work_notify_mode notify) { struct callback_head *head; - unsigned long flags; do { head = READ_ONCE(task->task_works); @@ -49,17 +76,7 @@ int task_work_add(struct task_struct *task, struct callback_head *work, set_notify_resume(task); break; case TWA_SIGNAL: - /* - * Only grab the sighand lock if we don't already have some - * task_work pending. This pairs with the smp_store_mb() - * in get_signal(), see comment there. 
- */ - if (!(READ_ONCE(task->jobctl) & JOBCTL_TASK_WORK) && - lock_task_sighand(task, &flags)) { - task->jobctl |= JOBCTL_TASK_WORK; - signal_wake_up(task, 0); - unlock_task_sighand(task, &flags); - } + task_work_notify_signal(task); break; default: WARN_ON_ONCE(1); diff --git a/kernel/taskstats.c b/kernel/taskstats.c index a2802b6ff4bb..2b4898b4752e 100644 --- a/kernel/taskstats.c +++ b/kernel/taskstats.c @@ -346,7 +346,7 @@ static int parse(struct nlattr *na, struct cpumask *mask) data = kmalloc(len, GFP_KERNEL); if (!data) return -ENOMEM; - nla_strlcpy(data, na, len); + nla_strscpy(data, na, len); ret = cpulist_parse(data, mask); kfree(data); return ret; diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c index 3624b9b5835d..743c852e10f2 100644 --- a/kernel/time/hrtimer.c +++ b/kernel/time/hrtimer.c @@ -425,11 +425,6 @@ static inline void debug_hrtimer_deactivate(struct hrtimer *timer) debug_object_deactivate(timer, &hrtimer_debug_descr); } -static inline void debug_hrtimer_free(struct hrtimer *timer) -{ - debug_object_free(timer, &hrtimer_debug_descr); -} - static void __hrtimer_init(struct hrtimer *timer, clockid_t clock_id, enum hrtimer_mode mode); @@ -1289,7 +1284,7 @@ int hrtimer_cancel(struct hrtimer *timer) EXPORT_SYMBOL_GPL(hrtimer_cancel); /** - * hrtimer_get_remaining - get remaining time for the timer + * __hrtimer_get_remaining - get remaining time for the timer * @timer: the timer to read * @adjust: adjust relative timers when CONFIG_TIME_LOW_RES=y */ diff --git a/kernel/time/itimer.c b/kernel/time/itimer.c index ca4e6d57d68b..00629e658ca1 100644 --- a/kernel/time/itimer.c +++ b/kernel/time/itimer.c @@ -172,10 +172,6 @@ static void set_cpu_itimer(struct task_struct *tsk, unsigned int clock_id, u64 oval, nval, ointerval, ninterval; struct cpu_itimer *it = &tsk->signal->it[clock_id]; - /* - * Use the to_ktime conversion because that clamps the maximum - * value to KTIME_MAX and avoid multiplication overflows. 
- */ nval = timespec64_to_ns(&value->it_value); ninterval = timespec64_to_ns(&value->it_interval); diff --git a/kernel/time/jiffies.c b/kernel/time/jiffies.c index eddcf4970444..a5cffe2a1770 100644 --- a/kernel/time/jiffies.c +++ b/kernel/time/jiffies.c @@ -59,7 +59,8 @@ static struct clocksource clocksource_jiffies = { }; __cacheline_aligned_in_smp DEFINE_RAW_SPINLOCK(jiffies_lock); -__cacheline_aligned_in_smp seqcount_t jiffies_seq; +__cacheline_aligned_in_smp seqcount_raw_spinlock_t jiffies_seq = + SEQCNT_RAW_SPINLOCK_ZERO(jiffies_seq, &jiffies_lock); #if (BITS_PER_LONG < 64) u64 get_jiffies_64(void) diff --git a/kernel/time/namespace.c b/kernel/time/namespace.c index afc65e6be33e..6ca625f5e554 100644 --- a/kernel/time/namespace.c +++ b/kernel/time/namespace.c @@ -92,7 +92,7 @@ static struct time_namespace *clone_time_ns(struct user_namespace *user_ns, if (!ns) goto fail_dec; - kref_init(&ns->kref); + refcount_set(&ns->ns.count, 1); ns->vvar_page = alloc_page(GFP_KERNEL | __GFP_ZERO); if (!ns->vvar_page) @@ -226,11 +226,8 @@ out: mutex_unlock(&offset_lock); } -void free_time_ns(struct kref *kref) +void free_time_ns(struct time_namespace *ns) { - struct time_namespace *ns; - - ns = container_of(kref, struct time_namespace, kref); dec_time_namespaces(ns->ucounts); put_user_ns(ns->user_ns); ns_free_inum(&ns->ns); @@ -308,22 +305,20 @@ static int timens_install(struct nsset *nsset, struct ns_common *new) return 0; } -int timens_on_fork(struct nsproxy *nsproxy, struct task_struct *tsk) +void timens_on_fork(struct nsproxy *nsproxy, struct task_struct *tsk) { struct ns_common *nsc = &nsproxy->time_ns_for_children->ns; struct time_namespace *ns = to_time_ns(nsc); /* create_new_namespaces() already incremented the ref counter */ if (nsproxy->time_ns == nsproxy->time_ns_for_children) - return 0; + return; get_time_ns(ns); put_time_ns(nsproxy->time_ns); nsproxy->time_ns = ns; timens_commit(tsk, ns); - - return 0; } static struct user_namespace *timens_owner(struct ns_common *ns) @@ -464,7 +459,7 @@ const struct proc_ns_operations timens_for_children_operations = { }; struct time_namespace init_time_ns = { - .kref = KREF_INIT(3), + .ns.count = REFCOUNT_INIT(3), .user_ns = &init_user_ns, .ns.inum = PROC_TIME_INIT_INO, .ns.ops = &timens_operations, diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c index 069ca78fb0bf..7404d3831527 100644 --- a/kernel/time/ntp.c +++ b/kernel/time/ntp.c @@ -494,65 +494,74 @@ out: return leap; } +#if defined(CONFIG_GENERIC_CMOS_UPDATE) || defined(CONFIG_RTC_SYSTOHC) static void sync_hw_clock(struct work_struct *work); -static DECLARE_DELAYED_WORK(sync_work, sync_hw_clock); - -static void sched_sync_hw_clock(struct timespec64 now, - unsigned long target_nsec, bool fail) +static DECLARE_WORK(sync_work, sync_hw_clock); +static struct hrtimer sync_hrtimer; +#define SYNC_PERIOD_NS (11UL * 60 * NSEC_PER_SEC) +static enum hrtimer_restart sync_timer_callback(struct hrtimer *timer) { - struct timespec64 next; - - ktime_get_real_ts64(&next); - if (!fail) - next.tv_sec = 659; - else { - /* - * Try again as soon as possible. Delaying long periods - * decreases the accuracy of the work queue timer. Due to this - * the algorithm is very likely to require a short-sleep retry - * after the above long sleep to synchronize ts_nsec. 
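The jiffies_seq conversion above ties the seqcount to jiffies_lock (a seqcount_raw_spinlock_t) so lockdep can verify that every writer holds the lock. The reader side stays the usual retry loop, modeled here in userspace with C11 atomics (illustrative only; a real seqcount needs stricter barrier discipline and annotated data accesses):

/* Model of a seqcount-protected 64bit jiffies read, as on 32bit readers. */
#include <stdatomic.h>
#include <stdint.h>
#include <stdio.h>

static _Atomic unsigned int jiffies_seq;	/* odd while a writer is active */
static uint64_t jiffies_64;			/* the protected value */

static void do_timer_model(unsigned long ticks)
{
	atomic_fetch_add_explicit(&jiffies_seq, 1, memory_order_acq_rel);
	jiffies_64 += ticks;			/* seq is odd: write side */
	atomic_fetch_add_explicit(&jiffies_seq, 1, memory_order_release);
}

static uint64_t get_jiffies_64_model(void)
{
	unsigned int seq;
	uint64_t ret;

	do {
		while ((seq = atomic_load_explicit(&jiffies_seq,
						   memory_order_acquire)) & 1)
			;			/* writer active: spin */
		ret = jiffies_64;
		atomic_thread_fence(memory_order_acquire);
	} while (atomic_load_explicit(&jiffies_seq, memory_order_relaxed) != seq);

	return ret;
}

int main(void)
{
	do_timer_model(3);
	printf("jiffies: %llu\n", (unsigned long long)get_jiffies_64_model());
	return 0;
}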
- */ - next.tv_sec = 0; - } - - /* Compute the needed delay that will get to tv_nsec == target_nsec */ - next.tv_nsec = target_nsec - next.tv_nsec; - if (next.tv_nsec <= 0) - next.tv_nsec += NSEC_PER_SEC; - if (next.tv_nsec >= NSEC_PER_SEC) { - next.tv_sec++; - next.tv_nsec -= NSEC_PER_SEC; - } + queue_work(system_power_efficient_wq, &sync_work); - queue_delayed_work(system_power_efficient_wq, &sync_work, - timespec64_to_jiffies(&next)); + return HRTIMER_NORESTART; } -static void sync_rtc_clock(void) +static void sched_sync_hw_clock(unsigned long offset_nsec, bool retry) { - unsigned long target_nsec; - struct timespec64 adjust, now; - int rc; + ktime_t exp = ktime_set(ktime_get_real_seconds(), 0); - if (!IS_ENABLED(CONFIG_RTC_SYSTOHC)) - return; + if (retry) + exp = ktime_add_ns(exp, 2 * NSEC_PER_SEC - offset_nsec); + else + exp = ktime_add_ns(exp, SYNC_PERIOD_NS - offset_nsec); - ktime_get_real_ts64(&now); + hrtimer_start(&sync_hrtimer, exp, HRTIMER_MODE_ABS); +} - adjust = now; - if (persistent_clock_is_local) - adjust.tv_sec -= (sys_tz.tz_minuteswest * 60); +/* + * Check whether @now is correct versus the required time to update the RTC + * and calculate the value which needs to be written to the RTC so that the + * next seconds increment of the RTC after the write is aligned with the next + * seconds increment of clock REALTIME. + * + * tsched t1 write(t2.tv_sec - 1sec) t2 RTC increments seconds + * + * t2.tv_nsec == 0 + * tsched = t2 - set_offset_nsec + * newval = t2 - NSEC_PER_SEC + * + * ==> newval = tsched + set_offset_nsec - NSEC_PER_SEC + * + * As the execution of this code is not guaranteed to happen exactly at + * tsched this allows it to happen within a fuzzy region: + * + * abs(now - tsched) < FUZZ + * + * If @now is not inside the allowed window the function returns false. + */ +static inline bool rtc_tv_nsec_ok(unsigned long set_offset_nsec, + struct timespec64 *to_set, + const struct timespec64 *now) +{ + /* Allowed error in tv_nsec, arbitrarily set to 5 jiffies in ns. */ + const unsigned long TIME_SET_NSEC_FUZZ = TICK_NSEC * 5; + struct timespec64 delay = {.tv_sec = -1, + .tv_nsec = set_offset_nsec}; - /* - * The current RTC in use will provide the target_nsec it wants to be - * called at, and does rtc_tv_nsec_ok internally.
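The window check in rtc_tv_nsec_ok() above boils down to: compute to_set = now + (set_offset_nsec - 1s) and accept it only when its tv_nsec lands within FUZZ of a whole second. That arithmetic can be exercised in isolation (HZ assumed to be 100 here, so FUZZ is 50ms; a model, not the kernel function):

/* Standalone model of the rtc_tv_nsec_ok() fuzzy-window computation. */
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define NSEC_PER_SEC	1000000000L

struct ts64 { int64_t tv_sec; long tv_nsec; };

static struct ts64 ts_add(struct ts64 a, struct ts64 b)
{
	struct ts64 r = { a.tv_sec + b.tv_sec, a.tv_nsec + b.tv_nsec };

	if (r.tv_nsec >= NSEC_PER_SEC) {
		r.tv_sec++;
		r.tv_nsec -= NSEC_PER_SEC;
	}
	return r;
}

static bool rtc_tv_nsec_ok_model(long set_offset_nsec, struct ts64 *to_set,
				 const struct ts64 *now)
{
	const long FUZZ = 5 * (NSEC_PER_SEC / 100);	/* 5 ticks at HZ=100 */
	struct ts64 delay = { -1, set_offset_nsec };

	*to_set = ts_add(*now, delay);
	if (to_set->tv_nsec < FUZZ) {
		to_set->tv_nsec = 0;
		return true;
	}
	if (to_set->tv_nsec > NSEC_PER_SEC - FUZZ) {
		to_set->tv_sec++;
		to_set->tv_nsec = 0;
		return true;
	}
	return false;	/* outside the window: caller reschedules */
}

int main(void)
{
	struct ts64 now = { 1000, 480000000 }, to_set;

	/* 0.5s set offset, close enough to the scheduled point: accepted */
	printf("ok=%d sec=%lld\n",
	       rtc_tv_nsec_ok_model(NSEC_PER_SEC / 2, &to_set, &now),
	       (long long)to_set.tv_sec);
	return 0;
}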
- */ - rc = rtc_set_ntp_time(adjust, &target_nsec); - if (rc == -ENODEV) - return; + *to_set = timespec64_add(*now, delay); + + if (to_set->tv_nsec < TIME_SET_NSEC_FUZZ) { + to_set->tv_nsec = 0; + return true; + } - sched_sync_hw_clock(now, target_nsec, rc); + if (to_set->tv_nsec > NSEC_PER_SEC - TIME_SET_NSEC_FUZZ) { + to_set->tv_sec++; + to_set->tv_nsec = 0; + return true; + } + return false; } #ifdef CONFIG_GENERIC_CMOS_UPDATE @@ -560,48 +569,47 @@ int __weak update_persistent_clock64(struct timespec64 now64) { return -ENODEV; } +#else +static inline int update_persistent_clock64(struct timespec64 now64) +{ + return -ENODEV; +} #endif -static bool sync_cmos_clock(void) +#ifdef CONFIG_RTC_SYSTOHC +/* Save NTP synchronized time to the RTC */ +static int update_rtc(struct timespec64 *to_set, unsigned long *offset_nsec) { - static bool no_cmos; - struct timespec64 now; - struct timespec64 adjust; - int rc = -EPROTO; - long target_nsec = NSEC_PER_SEC / 2; + struct rtc_device *rtc; + struct rtc_time tm; + int err = -ENODEV; - if (!IS_ENABLED(CONFIG_GENERIC_CMOS_UPDATE)) - return false; + rtc = rtc_class_open(CONFIG_RTC_SYSTOHC_DEVICE); + if (!rtc) + return -ENODEV; - if (no_cmos) - return false; + if (!rtc->ops || !rtc->ops->set_time) + goto out_close; - /* - * Historically update_persistent_clock64() has followed x86 - * semantics, which match the MC146818A/etc RTC. This RTC will store - * 'adjust' and then in .5s it will advance once second. - * - * Architectures are strongly encouraged to use rtclib and not - * implement this legacy API. - */ - ktime_get_real_ts64(&now); - if (rtc_tv_nsec_ok(-1 * target_nsec, &adjust, &now)) { - if (persistent_clock_is_local) - adjust.tv_sec -= (sys_tz.tz_minuteswest * 60); - rc = update_persistent_clock64(adjust); - /* - * The machine does not support update_persistent_clock64 even - * though it defines CONFIG_GENERIC_CMOS_UPDATE. - */ - if (rc == -ENODEV) { - no_cmos = true; - return false; - } + /* First call might not have the correct offset */ + if (*offset_nsec == rtc->set_offset_nsec) { + rtc_time64_to_tm(to_set->tv_sec, &tm); + err = rtc_set_time(rtc, &tm); + } else { + /* Store the update offset and let the caller try again */ + *offset_nsec = rtc->set_offset_nsec; + err = -EAGAIN; } - - sched_sync_hw_clock(now, target_nsec, rc); - return true; +out_close: + rtc_class_close(rtc); + return err; +} +#else +static inline int update_rtc(struct timespec64 *to_set, unsigned long *offset_nsec) +{ + return -ENODEV; } +#endif /* * If we have an externally synchronized Linux clock, then update RTC clock @@ -613,24 +621,64 @@ static bool sync_cmos_clock(void) */ static void sync_hw_clock(struct work_struct *work) { - if (!ntp_synced()) - return; + /* + * The default synchronization offset is 500ms for the deprecated + * update_persistent_clock64() under the assumption that it uses + * the infamous CMOS clock (MC146818). + */ + static unsigned long offset_nsec = NSEC_PER_SEC / 2; + struct timespec64 now, to_set; + int res = -EAGAIN; - if (sync_cmos_clock()) + /* + * Don't update if STA_UNSYNC is set and if ntp_notify_cmos_timer() + * managed to schedule the work between the timer firing and the + * work being able to rearm the timer. Wait for the timer to expire. 
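sched_sync_hw_clock() above arms an absolute CLOCK_REALTIME hrtimer so the next attempt fires set_offset_nsec before a seconds boundary: roughly two seconds out on a retry, eleven minutes out on success. The expiry arithmetic is simple enough to check standalone (names and the printed values are illustrative):

/* Model of the resync expiry computation in sched_sync_hw_clock(). */
#include <stdint.h>
#include <stdio.h>

#define NSEC_PER_SEC	1000000000LL
#define SYNC_PERIOD_NS	(11LL * 60 * NSEC_PER_SEC)	/* 11 minutes */

static int64_t next_sync_expiry(int64_t real_seconds, long offset_nsec, int retry)
{
	int64_t exp = real_seconds * NSEC_PER_SEC;	/* start of current second */

	if (retry)	/* failed attempt: try again in roughly two seconds */
		return exp + 2 * NSEC_PER_SEC - offset_nsec;

	return exp + SYNC_PERIOD_NS - offset_nsec;	/* regular cadence */
}

int main(void)
{
	/* 500ms offset, as used for the legacy CMOS update path */
	printf("retry:   %lld\n", (long long)next_sync_expiry(1000, NSEC_PER_SEC / 2, 1));
	printf("regular: %lld\n", (long long)next_sync_expiry(1000, NSEC_PER_SEC / 2, 0));
	return 0;
}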
+ */ + if (!ntp_synced() || hrtimer_is_queued(&sync_hrtimer)) return; - sync_rtc_clock(); + ktime_get_real_ts64(&now); + /* If @now is not in the allowed window, try again */ + if (!rtc_tv_nsec_ok(offset_nsec, &to_set, &now)) + goto rearm; + + /* Take timezone adjusted RTCs into account */ + if (persistent_clock_is_local) + to_set.tv_sec -= (sys_tz.tz_minuteswest * 60); + + /* Try the legacy RTC first. */ + res = update_persistent_clock64(to_set); + if (res != -ENODEV) + goto rearm; + + /* Try the RTC class */ + res = update_rtc(&to_set, &offset_nsec); + if (res == -ENODEV) + return; +rearm: + sched_sync_hw_clock(offset_nsec, res != 0); } void ntp_notify_cmos_timer(void) { - if (!ntp_synced()) - return; + /* + * If the work is currently executing but has not yet rearmed the + * timer, this queues the work again immediately. No big issue, + * just one pointless work invocation. + */ + if (ntp_synced() && !hrtimer_is_queued(&sync_hrtimer)) + queue_work(system_power_efficient_wq, &sync_work); +} - if (IS_ENABLED(CONFIG_GENERIC_CMOS_UPDATE) || - IS_ENABLED(CONFIG_RTC_SYSTOHC)) - queue_delayed_work(system_power_efficient_wq, &sync_work, 0); +static void __init ntp_init_cmos_sync(void) +{ + hrtimer_init(&sync_hrtimer, CLOCK_REALTIME, HRTIMER_MODE_ABS); + sync_hrtimer.function = sync_timer_callback; } +#else /* !(defined(CONFIG_GENERIC_CMOS_UPDATE) || defined(CONFIG_RTC_SYSTOHC)) */ +static inline void __init ntp_init_cmos_sync(void) { } +#endif /* defined(CONFIG_GENERIC_CMOS_UPDATE) || defined(CONFIG_RTC_SYSTOHC) */ /* * Propagate a new txc->status value into the NTP state: @@ -1044,4 +1092,5 @@ __setup("ntp_tick_adj=", ntp_tick_adj_setup); void __init ntp_init(void) { ntp_clear(); + ntp_init_cmos_sync(); } diff --git a/kernel/time/ntp_internal.h b/kernel/time/ntp_internal.h index 908ecaa65fc3..23d1b74c3065 100644 --- a/kernel/time/ntp_internal.h +++ b/kernel/time/ntp_internal.h @@ -12,4 +12,11 @@ extern int __do_adjtimex(struct __kernel_timex *txc, const struct timespec64 *ts, s32 *time_tai, struct audit_ntp_data *ad); extern void __hardpps(const struct timespec64 *phase_ts, const struct timespec64 *raw_ts); + +#if defined(CONFIG_GENERIC_CMOS_UPDATE) || defined(CONFIG_RTC_SYSTOHC) +extern void ntp_notify_cmos_timer(void); +#else +static inline void ntp_notify_cmos_timer(void) { } +#endif + #endif /* _LINUX_NTP_INTERNAL_H */ diff --git a/kernel/time/sched_clock.c b/kernel/time/sched_clock.c index 0642013dace4..b1b9b12899f5 100644 --- a/kernel/time/sched_clock.c +++ b/kernel/time/sched_clock.c @@ -68,13 +68,13 @@ static inline u64 notrace cyc_to_ns(u64 cyc, u32 mult, u32 shift) return (cyc * mult) >> shift; } -struct clock_read_data *sched_clock_read_begin(unsigned int *seq) +notrace struct clock_read_data *sched_clock_read_begin(unsigned int *seq) { *seq = raw_read_seqcount_latch(&cd.seq); return cd.read_data + (*seq & 1); } -int sched_clock_read_retry(unsigned int seq) +notrace int sched_clock_read_retry(unsigned int seq) { return read_seqcount_latch_retry(&cd.seq, seq); } diff --git a/kernel/time/tick-broadcast.c b/kernel/time/tick-broadcast.c index 36d7464c8962..5a23829372c7 100644 --- a/kernel/time/tick-broadcast.c +++ b/kernel/time/tick-broadcast.c @@ -331,7 +331,7 @@ static void tick_handle_periodic_broadcast(struct clock_event_device *dev) bc_local = tick_do_periodic_broadcast(); if (clockevent_state_oneshot(dev)) { - ktime_t next = ktime_add(dev->next_event, tick_period); + ktime_t next = ktime_add_ns(dev->next_event, TICK_NSEC); clockevents_program_event(dev, next, true); } @@ -877,6 +877,22 @@ static void
tick_broadcast_init_next_event(struct cpumask *mask, } } +static inline ktime_t tick_get_next_period(void) +{ + ktime_t next; + + /* + * Protect against concurrent updates (store/load tearing on + * 32bit). It does not matter if the time is already in the + * past. The broadcast device which is about to be programmed will + * fire in any case. + */ + raw_spin_lock(&jiffies_lock); + next = tick_next_period; + raw_spin_unlock(&jiffies_lock); + return next; +} + /** * tick_broadcast_setup_oneshot - setup the broadcast device */ @@ -905,10 +921,11 @@ static void tick_broadcast_setup_oneshot(struct clock_event_device *bc) tick_broadcast_oneshot_mask, tmpmask); if (was_periodic && !cpumask_empty(tmpmask)) { + ktime_t nextevt = tick_get_next_period(); + clockevents_switch_state(bc, CLOCK_EVT_STATE_ONESHOT); - tick_broadcast_init_next_event(tmpmask, - tick_next_period); - tick_broadcast_set_event(bc, cpu, tick_next_period); + tick_broadcast_init_next_event(tmpmask, nextevt); + tick_broadcast_set_event(bc, cpu, nextevt); } else bc->next_event = KTIME_MAX; } else { diff --git a/kernel/time/tick-common.c b/kernel/time/tick-common.c index 6c9c342dd0e5..a03764df5366 100644 --- a/kernel/time/tick-common.c +++ b/kernel/time/tick-common.c @@ -27,10 +27,11 @@ */ DEFINE_PER_CPU(struct tick_device, tick_cpu_device); /* - * Tick next event: keeps track of the tick time + * Tick next event: keeps track of the tick time. It's updated by the + * CPU which handles the tick and protected by jiffies_lock. There is + * no requirement to hold the jiffies seqcount write side for it. */ ktime_t tick_next_period; -ktime_t tick_period; /* * tick_do_timer_cpu is a timer core internal variable which holds the CPU NR @@ -88,7 +89,7 @@ static void tick_periodic(int cpu) write_seqcount_begin(&jiffies_seq); /* Keep track of the next tick event */ - tick_next_period = ktime_add(tick_next_period, tick_period); + tick_next_period = ktime_add_ns(tick_next_period, TICK_NSEC); do_timer(1); write_seqcount_end(&jiffies_seq); @@ -127,7 +128,7 @@ void tick_handle_periodic(struct clock_event_device *dev) * Setup the next period for devices, which do not have * periodic mode: */ - next = ktime_add(next, tick_period); + next = ktime_add_ns(next, TICK_NSEC); if (!clockevents_program_event(dev, next, false)) return; @@ -173,7 +174,7 @@ void tick_setup_periodic(struct clock_event_device *dev, int broadcast) for (;;) { if (!clockevents_program_event(dev, next, false)) return; - next = ktime_add(next, tick_period); + next = ktime_add_ns(next, TICK_NSEC); } } } @@ -220,7 +221,6 @@ static void tick_setup_device(struct tick_device *td, tick_do_timer_cpu = cpu; tick_next_period = ktime_get(); - tick_period = NSEC_PER_SEC / HZ; #ifdef CONFIG_NO_HZ_FULL /* * The boot CPU may be nohz_full, in which case set diff --git a/kernel/time/tick-internal.h b/kernel/time/tick-internal.h index 7b2496136729..7a981c9e87a4 100644 --- a/kernel/time/tick-internal.h +++ b/kernel/time/tick-internal.h @@ -15,7 +15,6 @@ DECLARE_PER_CPU(struct tick_device, tick_cpu_device); extern ktime_t tick_next_period; -extern ktime_t tick_period; extern int tick_do_timer_cpu __read_mostly; extern void tick_setup_periodic(struct clock_event_device *dev, int broadcast); diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index 81632cd5e3b7..030282994b3e 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -20,6 +20,7 @@ #include <linux/sched/clock.h> #include <linux/sched/stat.h> #include <linux/sched/nohz.h> +#include <linux/sched/loadavg.h> #include
<linux/module.h> #include <linux/irq_work.h> #include <linux/posix-timers.h> @@ -44,7 +45,9 @@ struct tick_sched *tick_get_tick_sched(int cpu) #if defined(CONFIG_NO_HZ_COMMON) || defined(CONFIG_HIGH_RES_TIMERS) /* - * The time, when the last jiffy update happened. Protected by jiffies_lock. + * The time, when the last jiffy update happened. Write access must hold + * jiffies_lock and jiffies_seq. tick_nohz_next_event() needs to get a + * consistent view of jiffies and last_jiffies_update. */ static ktime_t last_jiffies_update; @@ -53,50 +56,97 @@ static ktime_t last_jiffies_update; */ static void tick_do_update_jiffies64(ktime_t now) { - unsigned long ticks = 0; - ktime_t delta; + unsigned long ticks = 1; + ktime_t delta, nextp; /* - * Do a quick check without holding jiffies_lock: - * The READ_ONCE() pairs with two updates done later in this function. + * 64bit can do a quick check without holding jiffies lock and + * without looking at the sequence count. The smp_load_acquire() + * pairs with the update done later in this function. + * + * 32bit cannot do that because the store of tick_next_period + * consists of two 32bit stores and the first store could move it + * to a random point in the future. */ - delta = ktime_sub(now, READ_ONCE(last_jiffies_update)); - if (delta < tick_period) - return; + if (IS_ENABLED(CONFIG_64BIT)) { + if (ktime_before(now, smp_load_acquire(&tick_next_period))) + return; + } else { + unsigned int seq; - /* Reevaluate with jiffies_lock held */ + /* + * Avoid contention on jiffies_lock and protect the quick + * check with the sequence count. + */ + do { + seq = read_seqcount_begin(&jiffies_seq); + nextp = tick_next_period; + } while (read_seqcount_retry(&jiffies_seq, seq)); + + if (ktime_before(now, nextp)) + return; + } + + /* Quick check failed, i.e. update is required. */ raw_spin_lock(&jiffies_lock); + /* + * Reevaluate with the lock held. Another CPU might have done the + * update already. + */ + if (ktime_before(now, tick_next_period)) { + raw_spin_unlock(&jiffies_lock); + return; + } + write_seqcount_begin(&jiffies_seq); - delta = ktime_sub(now, last_jiffies_update); - if (delta >= tick_period) { + delta = ktime_sub(now, tick_next_period); + if (unlikely(delta >= TICK_NSEC)) { + /* Slow path for long idle sleep times */ + s64 incr = TICK_NSEC; - delta = ktime_sub(delta, tick_period); - /* Pairs with the lockless read in this function. */ - WRITE_ONCE(last_jiffies_update, - ktime_add(last_jiffies_update, tick_period)); + ticks += ktime_divns(delta, incr); - /* Slow path for long timeouts */ - if (unlikely(delta >= tick_period)) { - s64 incr = ktime_to_ns(tick_period); + last_jiffies_update = ktime_add_ns(last_jiffies_update, + incr * ticks); + } else { + last_jiffies_update = ktime_add_ns(last_jiffies_update, + TICK_NSEC); + } - ticks = ktime_divns(delta, incr); + /* Advance jiffies to complete the jiffies_seq protected job */ + jiffies_64 += ticks; - /* Pairs with the lockless read in this function. */ - WRITE_ONCE(last_jiffies_update, - ktime_add_ns(last_jiffies_update, - incr * ticks)); - } - do_timer(++ticks); + /* + * Keep the tick_next_period variable up to date. + */ + nextp = ktime_add_ns(last_jiffies_update, TICK_NSEC); - /* Keep the tick_next_period variable up to date */ - tick_next_period = ktime_add(last_jiffies_update, tick_period); + if (IS_ENABLED(CONFIG_64BIT)) { + /* + * Pairs with smp_load_acquire() in the lockless quick + * check above and ensures that the update to jiffies_64 is + * not reordered vs. 
the store to tick_next_period, neither + * by the compiler nor by the CPU. + */ + smp_store_release(&tick_next_period, nextp); } else { - write_seqcount_end(&jiffies_seq); - raw_spin_unlock(&jiffies_lock); - return; + /* + * A plain store is good enough on 32bit as the quick check + * above is protected by the sequence count. + */ + tick_next_period = nextp; } + + /* + * Release the sequence count. calc_global_load() below is not + * protected by it, but jiffies_lock needs to be held to prevent + * concurrent invocations. + */ write_seqcount_end(&jiffies_seq); + + calc_global_load(); + raw_spin_unlock(&jiffies_lock); update_wall_time(); } @@ -243,10 +293,8 @@ static void nohz_full_kick_func(struct irq_work *work) /* Empty, the tick restart happens on tick_nohz_irq_exit() */ } -static DEFINE_PER_CPU(struct irq_work, nohz_full_kick_work) = { - .func = nohz_full_kick_func, - .flags = ATOMIC_INIT(IRQ_WORK_HARD_IRQ), -}; +static DEFINE_PER_CPU(struct irq_work, nohz_full_kick_work) = + IRQ_WORK_INIT_HARD(nohz_full_kick_func); /* * Kick this CPU if it's full dynticks in order to force it to @@ -661,7 +709,7 @@ static void tick_nohz_restart(struct tick_sched *ts, ktime_t now) hrtimer_set_expires(&ts->sched_timer, ts->last_tick); /* Forward the time to expire in the future */ - hrtimer_forward(&ts->sched_timer, now, tick_period); + hrtimer_forward(&ts->sched_timer, now, TICK_NSEC); if (ts->nohz_mode == NOHZ_MODE_HIGHRES) { hrtimer_start_expires(&ts->sched_timer, @@ -1230,7 +1278,7 @@ static void tick_nohz_handler(struct clock_event_device *dev) if (unlikely(ts->tick_stopped)) return; - hrtimer_forward(&ts->sched_timer, now, tick_period); + hrtimer_forward(&ts->sched_timer, now, TICK_NSEC); tick_program_event(hrtimer_get_expires(&ts->sched_timer), 1); } @@ -1267,7 +1315,7 @@ static void tick_nohz_switch_to_nohz(void) next = tick_init_jiffy_update(); hrtimer_set_expires(&ts->sched_timer, next); - hrtimer_forward_now(&ts->sched_timer, tick_period); + hrtimer_forward_now(&ts->sched_timer, TICK_NSEC); tick_program_event(hrtimer_get_expires(&ts->sched_timer), 1); tick_nohz_activate(ts, NOHZ_MODE_LOWRES); } @@ -1333,7 +1381,7 @@ static enum hrtimer_restart tick_sched_timer(struct hrtimer *timer) if (unlikely(ts->tick_stopped)) return HRTIMER_NORESTART; - hrtimer_forward(timer, now, tick_period); + hrtimer_forward(timer, now, TICK_NSEC); return HRTIMER_RESTART; } @@ -1367,13 +1415,13 @@ void tick_setup_sched_timer(void) /* Offset the tick to avert jiffies_lock contention. */ if (sched_skew_tick) { - u64 offset = ktime_to_ns(tick_period) >> 1; + u64 offset = TICK_NSEC >> 1; do_div(offset, num_possible_cpus()); offset *= smp_processor_id(); hrtimer_add_expires_ns(&ts->sched_timer, offset); } - hrtimer_forward(&ts->sched_timer, now, tick_period); + hrtimer_forward(&ts->sched_timer, now, TICK_NSEC); hrtimer_start_expires(&ts->sched_timer, HRTIMER_MODE_ABS_PINNED_HARD); tick_nohz_activate(ts, NOHZ_MODE_HIGHRES); } diff --git a/kernel/time/timeconv.c b/kernel/time/timeconv.c index 589e0a552129..62e3b46717a6 100644 --- a/kernel/time/timeconv.c +++ b/kernel/time/timeconv.c @@ -70,10 +70,10 @@ static const unsigned short __mon_yday[2][13] = { /** * time64_to_tm - converts the calendar time to local broken-down time * - * @totalsecs the number of seconds elapsed since 00:00:00 on January 1, 1970, + * @totalsecs: the number of seconds elapsed since 00:00:00 on January 1, 1970, * Coordinated Universal Time (UTC). - * @offset offset seconds adding to totalsecs. 
- * @result pointer to struct tm variable to receive broken-down time + * @offset: offset seconds adding to totalsecs. + * @result: pointer to struct tm variable to receive broken-down time */ void time64_to_tm(time64_t totalsecs, int offset, struct tm *result) { diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index 6858a31364b6..74503c0151e5 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -407,6 +407,7 @@ static inline u64 timekeeping_cycles_to_ns(const struct tk_read_base *tkr, u64 c /** * update_fast_timekeeper - Update the fast and NMI safe monotonic timekeeper. * @tkr: Timekeeping readout base from which we take the update + * @tkf: Pointer to NMI safe timekeeper * * We want to use this from any context including NMI and tracing / * instrumenting the timekeeping code itself. @@ -436,6 +437,27 @@ static void update_fast_timekeeper(const struct tk_read_base *tkr, memcpy(base + 1, base, sizeof(*base)); } +static __always_inline u64 __ktime_get_fast_ns(struct tk_fast *tkf) +{ + struct tk_read_base *tkr; + unsigned int seq; + u64 now; + + do { + seq = raw_read_seqcount_latch(&tkf->seq); + tkr = tkf->base + (seq & 0x01); + now = ktime_to_ns(tkr->base); + + now += timekeeping_delta_to_ns(tkr, + clocksource_delta( + tk_clock_read(tkr), + tkr->cycle_last, + tkr->mask)); + } while (read_seqcount_latch_retry(&tkf->seq, seq)); + + return now; +} + /** * ktime_get_mono_fast_ns - Fast NMI safe access to clock monotonic * @@ -462,39 +484,24 @@ static void update_fast_timekeeper(const struct tk_read_base *tkr, * * So reader 6 will observe time going backwards versus reader 5. * - * While other CPUs are likely to be able observe that, the only way + * While other CPUs are likely to be able to observe that, the only way * for a CPU local observation is when an NMI hits in the middle of * the update. Timestamps taken from that NMI context might be ahead * of the following timestamps. Callers need to be aware of that and * deal with it. */ -static __always_inline u64 __ktime_get_fast_ns(struct tk_fast *tkf) -{ - struct tk_read_base *tkr; - unsigned int seq; - u64 now; - - do { - seq = raw_read_seqcount_latch(&tkf->seq); - tkr = tkf->base + (seq & 0x01); - now = ktime_to_ns(tkr->base); - - now += timekeeping_delta_to_ns(tkr, - clocksource_delta( - tk_clock_read(tkr), - tkr->cycle_last, - tkr->mask)); - } while (read_seqcount_latch_retry(&tkf->seq, seq)); - - return now; -} - u64 ktime_get_mono_fast_ns(void) { return __ktime_get_fast_ns(&tk_fast_mono); } EXPORT_SYMBOL_GPL(ktime_get_mono_fast_ns); +/** + * ktime_get_raw_fast_ns - Fast NMI safe access to clock monotonic raw + * + * Contrary to ktime_get_mono_fast_ns() this is always correct because the + * conversion factor is not affected by NTP/PTP correction. + */ u64 ktime_get_raw_fast_ns(void) { return __ktime_get_fast_ns(&tk_fast_raw); @@ -521,6 +528,9 @@ EXPORT_SYMBOL_GPL(ktime_get_raw_fast_ns); * (2) On 32-bit systems, the 64-bit boot offset (tk->offs_boot) may be * partially updated. Since the tk->offs_boot update is a rare event, this * should be a rare occurrence which postprocessing should be able to handle. + * + * The caveats vs. timestamp ordering as documented for ktime_get_fast_ns() + * apply as well. */ u64 notrace ktime_get_boot_fast_ns(void) { @@ -530,9 +540,6 @@ u64 notrace ktime_get_boot_fast_ns(void) } EXPORT_SYMBOL_GPL(ktime_get_boot_fast_ns); -/* - * See comment for __ktime_get_fast_ns() vs. 
timestamp ordering - */ static __always_inline u64 __ktime_get_real_fast(struct tk_fast *tkf, u64 *mono) { struct tk_read_base *tkr; @@ -557,6 +564,8 @@ static __always_inline u64 __ktime_get_real_fast(struct tk_fast *tkf, u64 *mono) /** * ktime_get_real_fast_ns: - NMI safe and fast access to clock realtime. + * + * See ktime_get_fast_ns() for documentation of the time stamp ordering. */ u64 ktime_get_real_fast_ns(void) { @@ -654,6 +663,7 @@ static void update_pvclock_gtod(struct timekeeper *tk, bool was_set) /** * pvclock_gtod_register_notifier - register a pvclock timedata update listener + * @nb: Pointer to the notifier block to register */ int pvclock_gtod_register_notifier(struct notifier_block *nb) { @@ -673,6 +683,7 @@ EXPORT_SYMBOL_GPL(pvclock_gtod_register_notifier); /** * pvclock_gtod_unregister_notifier - unregister a pvclock * timedata update listener + * @nb: Pointer to the notifier block to unregister */ int pvclock_gtod_unregister_notifier(struct notifier_block *nb) { @@ -763,6 +774,7 @@ static void timekeeping_update(struct timekeeper *tk, unsigned int action) /** * timekeeping_forward_now - update clock to the current time + * @tk: Pointer to the timekeeper to update * * Forward the current clock to update its state since the last call to * update_wall_time(). This is useful before significant clock changes, @@ -1339,7 +1351,7 @@ EXPORT_SYMBOL(do_settimeofday64); /** * timekeeping_inject_offset - Adds or subtracts from the current time. - * @tv: pointer to the timespec variable containing the offset + * @ts: Pointer to the timespec variable containing the offset * * Adds or subtracts an offset value from the current time. */ @@ -1415,9 +1427,8 @@ void timekeeping_warp_clock(void) } } -/** +/* * __timekeeping_set_tai_offset - Sets the TAI offset from UTC and monotonic - * */ static void __timekeeping_set_tai_offset(struct timekeeper *tk, s32 tai_offset) { @@ -1425,7 +1436,7 @@ static void __timekeeping_set_tai_offset(struct timekeeper *tk, s32 tai_offset) tk->offs_tai = ktime_add(tk->offs_real, ktime_set(tai_offset, 0)); } -/** +/* * change_clocksource - Swaps clocksources if a new one is available * * Accumulates current time interval and initializes new clocksource @@ -1548,6 +1559,7 @@ u64 timekeeping_max_deferment(void) /** * read_persistent_clock64 - Return time from the persistent clock. + * @ts: Pointer to the storage for the readout value * * Weak dummy function for arches that do not yet support it. * Reads the time from the battery backed persistent clock. @@ -1566,8 +1578,9 @@ void __weak read_persistent_clock64(struct timespec64 *ts) * from the boot. * * Weak dummy function for arches that do not yet support it. - * wall_time - current time as returned by persistent clock - * boot_offset - offset that is defined as wall_time - boot_time + * @wall_time: - current time as returned by persistent clock + * @boot_offset: - offset that is defined as wall_time - boot_time + * * The default function calculates offset based on the current value of * local_clock(). 
This way architectures that support sched_clock() but don't * support dedicated boot time clock will provide the best estimate of the @@ -1652,7 +1665,8 @@ static struct timespec64 timekeeping_suspend_time; /** * __timekeeping_inject_sleeptime - Internal function to add sleep interval - * @delta: pointer to a timespec delta value + * @tk: Pointer to the timekeeper to be updated + * @delta: Pointer to the delta value in timespec64 format * * Takes a timespec offset measuring a suspend interval and properly * adds the sleep offset to the timekeeping variables. @@ -2023,13 +2037,12 @@ static void timekeeping_adjust(struct timekeeper *tk, s64 offset) } } -/** +/* * accumulate_nsecs_to_secs - Accumulates nsecs into secs * * Helper function that accumulates the nsecs greater than a second * from the xtime_nsec field to the xtime_secs field. * It also calls into the NTP code to handle leapsecond processing. - * */ static inline unsigned int accumulate_nsecs_to_secs(struct timekeeper *tk) { @@ -2071,7 +2084,7 @@ static inline unsigned int accumulate_nsecs_to_secs(struct timekeeper *tk) return clock_set; } -/** +/* * logarithmic_accumulation - shifted accumulation of cycles * * This functions accumulates a shifted interval of cycles into @@ -2314,7 +2327,7 @@ ktime_t ktime_get_update_offsets_now(unsigned int *cwsseq, ktime_t *offs_real, return base; } -/** +/* * timekeeping_validate_timex - Ensures the timex is ok for use in do_adjtimex */ static int timekeeping_validate_timex(const struct __kernel_timex *txc) diff --git a/kernel/time/timekeeping.h b/kernel/time/timekeeping.h index 099737f6f10c..6c2cbd9ef999 100644 --- a/kernel/time/timekeeping.h +++ b/kernel/time/timekeeping.h @@ -26,7 +26,7 @@ extern void do_timer(unsigned long ticks); extern void update_wall_time(void); extern raw_spinlock_t jiffies_lock; -extern seqcount_t jiffies_seq; +extern seqcount_raw_spinlock_t jiffies_seq; #define CS_NAME_LEN 32 diff --git a/kernel/time/timer.c b/kernel/time/timer.c index de37e33a868d..8dbc008f8942 100644 --- a/kernel/time/timer.c +++ b/kernel/time/timer.c @@ -732,11 +732,6 @@ static inline void debug_timer_deactivate(struct timer_list *timer) debug_object_deactivate(timer, &timer_debug_descr); } -static inline void debug_timer_free(struct timer_list *timer) -{ - debug_object_free(timer, &timer_debug_descr); -} - static inline void debug_timer_assert_init(struct timer_list *timer) { debug_object_assert_init(timer, &timer_debug_descr); @@ -1288,7 +1283,7 @@ static void del_timer_wait_running(struct timer_list *timer) u32 tf; tf = READ_ONCE(timer->flags); - if (!(tf & TIMER_MIGRATING)) { + if (!(tf & (TIMER_MIGRATING | TIMER_IRQSAFE))) { struct timer_base *base = get_timer_base(tf); /* @@ -1372,6 +1367,13 @@ int del_timer_sync(struct timer_list *timer) */ WARN_ON(in_irq() && !(timer->flags & TIMER_IRQSAFE)); + /* + * Must be able to sleep on PREEMPT_RT because of the slowpath in + * del_timer_wait_running(). + */ + if (IS_ENABLED(CONFIG_PREEMPT_RT) && !(timer->flags & TIMER_IRQSAFE)) + lockdep_assert_preemption_enabled(); + do { ret = try_to_del_timer_sync(timer); @@ -1698,29 +1700,6 @@ void timer_clear_idle(void) } #endif -/* - * Called from the timer interrupt handler to charge one tick to the current - * process. user_tick is 1 if the tick is user time, 0 for system. - */ -void update_process_times(int user_tick) -{ - struct task_struct *p = current; - - PRANDOM_ADD_NOISE(jiffies, user_tick, p, 0); - - /* Note: this timer irq context must be accounted for as well. 
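Stepping back to the fast timekeepers touched above: __ktime_get_fast_ns() reads through a seqcount latch, i.e. two copies of the readout base where the writer always updates the copy readers are not using. That is what makes it NMI-safe: a reader can never block on a writer. A compact userspace model (illustrative; the kernel versions add clocksource deltas and stricter barriers):

/* Model of the seqcount-latch scheme behind the NMI-safe fast timekeepers. */
#include <stdatomic.h>
#include <stdint.h>
#include <stdio.h>

struct tk_base { uint64_t base_ns; uint64_t ns_per_cyc; };

static _Atomic unsigned int latch_seq;
static struct tk_base bases[2];

static void latch_update(struct tk_base val)
{
	atomic_fetch_add_explicit(&latch_seq, 1, memory_order_release);
	bases[0] = val;		/* seq odd: readers use bases[1] */
	atomic_fetch_add_explicit(&latch_seq, 1, memory_order_release);
	bases[1] = val;		/* seq even: readers use bases[0] */
}

static uint64_t fast_ns(uint64_t cycles)
{
	unsigned int seq;
	uint64_t now;

	do {
		seq = atomic_load_explicit(&latch_seq, memory_order_acquire);
		const struct tk_base *b = &bases[seq & 1];

		now = b->base_ns + cycles * b->ns_per_cyc;
		atomic_thread_fence(memory_order_acquire);
	} while (atomic_load_explicit(&latch_seq, memory_order_relaxed) != seq);

	return now;
}

int main(void)
{
	latch_update((struct tk_base){ .base_ns = 1000, .ns_per_cyc = 2 });
	printf("%llu\n", (unsigned long long)fast_ns(10));	/* 1020 */
	return 0;
}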
*/ - account_process_tick(p, user_tick); - run_local_timers(); - rcu_sched_clock_irq(user_tick); -#ifdef CONFIG_IRQ_WORK - if (in_irq()) - irq_work_tick(); -#endif - scheduler_tick(); - if (IS_ENABLED(CONFIG_POSIX_TIMERS)) - run_posix_cpu_timers(); -} - /** * __run_timers - run all expired timers (if any) on this CPU. * @base: the timer vector to be processed. @@ -1770,7 +1749,7 @@ static __latent_entropy void run_timer_softirq(struct softirq_action *h) /* * Called by the local, per-CPU timer interrupt on SMP. */ -void run_local_timers(void) +static void run_local_timers(void) { struct timer_base *base = this_cpu_ptr(&timer_bases[BASE_STD]); @@ -1788,6 +1767,29 @@ void run_local_timers(void) } /* + * Called from the timer interrupt handler to charge one tick to the current + * process. user_tick is 1 if the tick is user time, 0 for system. + */ +void update_process_times(int user_tick) +{ + struct task_struct *p = current; + + PRANDOM_ADD_NOISE(jiffies, user_tick, p, 0); + + /* Note: this timer irq context must be accounted for as well. */ + account_process_tick(p, user_tick); + run_local_timers(); + rcu_sched_clock_irq(user_tick); +#ifdef CONFIG_IRQ_WORK + if (in_irq()) + irq_work_tick(); +#endif + scheduler_tick(); + if (IS_ENABLED(CONFIG_POSIX_TIMERS)) + run_posix_cpu_timers(); +} + +/* * Since schedule_timeout()'s timer is defined on the stack, it must store * the target task on the stack as well. */ diff --git a/kernel/time/timer_list.c b/kernel/time/timer_list.c index acb326f5f50a..6939140ab7c5 100644 --- a/kernel/time/timer_list.c +++ b/kernel/time/timer_list.c @@ -42,24 +42,11 @@ static void SEQ_printf(struct seq_file *m, const char *fmt, ...) va_end(args); } -static void print_name_offset(struct seq_file *m, void *sym) -{ - char symname[KSYM_NAME_LEN]; - - if (lookup_symbol_name((unsigned long)sym, symname) < 0) - SEQ_printf(m, "<%pK>", sym); - else - SEQ_printf(m, "%s", symname); -} - static void print_timer(struct seq_file *m, struct hrtimer *taddr, struct hrtimer *timer, int idx, u64 now) { - SEQ_printf(m, " #%d: ", idx); - print_name_offset(m, taddr); - SEQ_printf(m, ", "); - print_name_offset(m, timer->function); + SEQ_printf(m, " #%d: <%pK>, %ps", idx, taddr, timer->function); SEQ_printf(m, ", S:%02x", timer->state); SEQ_printf(m, "\n"); SEQ_printf(m, " # expires at %Lu-%Lu nsecs [in %Ld to %Ld nsecs]\n", @@ -116,9 +103,7 @@ print_base(struct seq_file *m, struct hrtimer_clock_base *base, u64 now) SEQ_printf(m, " .resolution: %u nsecs\n", hrtimer_resolution); - SEQ_printf(m, " .get_time: "); - print_name_offset(m, base->get_time); - SEQ_printf(m, "\n"); + SEQ_printf(m, " .get_time: %ps\n", base->get_time); #ifdef CONFIG_HIGH_RES_TIMERS SEQ_printf(m, " .offset: %Lu nsecs\n", (unsigned long long) ktime_to_ns(base->offset)); @@ -218,42 +203,29 @@ print_tickdevice(struct seq_file *m, struct tick_device *td, int cpu) SEQ_printf(m, " next_event: %Ld nsecs\n", (unsigned long long) ktime_to_ns(dev->next_event)); - SEQ_printf(m, " set_next_event: "); - print_name_offset(m, dev->set_next_event); - SEQ_printf(m, "\n"); + SEQ_printf(m, " set_next_event: %ps\n", dev->set_next_event); - if (dev->set_state_shutdown) { - SEQ_printf(m, " shutdown: "); - print_name_offset(m, dev->set_state_shutdown); - SEQ_printf(m, "\n"); - } + if (dev->set_state_shutdown) + SEQ_printf(m, " shutdown: %ps\n", + dev->set_state_shutdown); - if (dev->set_state_periodic) { - SEQ_printf(m, " periodic: "); - print_name_offset(m, dev->set_state_periodic); - SEQ_printf(m, "\n"); - } + if (dev->set_state_periodic) + 
SEQ_printf(m, " periodic: %ps\n", + dev->set_state_periodic); - if (dev->set_state_oneshot) { - SEQ_printf(m, " oneshot: "); - print_name_offset(m, dev->set_state_oneshot); - SEQ_printf(m, "\n"); - } + if (dev->set_state_oneshot) + SEQ_printf(m, " oneshot: %ps\n", + dev->set_state_oneshot); - if (dev->set_state_oneshot_stopped) { - SEQ_printf(m, " oneshot stopped: "); - print_name_offset(m, dev->set_state_oneshot_stopped); - SEQ_printf(m, "\n"); - } + if (dev->set_state_oneshot_stopped) + SEQ_printf(m, " oneshot stopped: %ps\n", + dev->set_state_oneshot_stopped); - if (dev->tick_resume) { - SEQ_printf(m, " resume: "); - print_name_offset(m, dev->tick_resume); - SEQ_printf(m, "\n"); - } + if (dev->tick_resume) + SEQ_printf(m, " resume: %ps\n", + dev->tick_resume); - SEQ_printf(m, " event_handler: "); - print_name_offset(m, dev->event_handler); + SEQ_printf(m, " event_handler: %ps\n", dev->event_handler); SEQ_printf(m, "\n"); SEQ_printf(m, " retries: %lu\n", dev->retries); SEQ_printf(m, "\n"); diff --git a/kernel/torture.c b/kernel/torture.c index 1061492f14bd..8562ac18d2eb 100644 --- a/kernel/torture.c +++ b/kernel/torture.c @@ -602,18 +602,29 @@ static int stutter_gap; */ bool stutter_wait(const char *title) { - int spt; + ktime_t delay; + unsigned int i = 0; bool ret = false; + int spt; cond_resched_tasks_rcu_qs(); spt = READ_ONCE(stutter_pause_test); for (; spt; spt = READ_ONCE(stutter_pause_test)) { - ret = true; + if (!ret) { + sched_set_normal(current, MAX_NICE); + ret = true; + } if (spt == 1) { schedule_timeout_interruptible(1); } else if (spt == 2) { - while (READ_ONCE(stutter_pause_test)) + while (READ_ONCE(stutter_pause_test)) { + if (!(i++ & 0xffff)) { + set_current_state(TASK_INTERRUPTIBLE); + delay = 10 * NSEC_PER_USEC; + schedule_hrtimeout(&delay, HRTIMER_MODE_REL); + } cond_resched(); + } } else { schedule_timeout_interruptible(round_jiffies_relative(HZ)); } @@ -629,20 +640,27 @@ EXPORT_SYMBOL_GPL(stutter_wait); */ static int torture_stutter(void *arg) { + ktime_t delay; + DEFINE_TORTURE_RANDOM(rand); int wtime; VERBOSE_TOROUT_STRING("torture_stutter task started"); do { if (!torture_must_stop() && stutter > 1) { wtime = stutter; - if (stutter > HZ + 1) { + if (stutter > 2) { WRITE_ONCE(stutter_pause_test, 1); - wtime = stutter - HZ - 1; - schedule_timeout_interruptible(wtime); - wtime = HZ + 1; + wtime = stutter - 3; + delay = ktime_divns(NSEC_PER_SEC * wtime, HZ); + delay += (torture_random(&rand) >> 3) % NSEC_PER_MSEC; + set_current_state(TASK_INTERRUPTIBLE); + schedule_hrtimeout(&delay, HRTIMER_MODE_REL); + wtime = 2; } WRITE_ONCE(stutter_pause_test, 2); - schedule_timeout_interruptible(wtime); + delay = ktime_divns(NSEC_PER_SEC * wtime, HZ); + set_current_state(TASK_INTERRUPTIBLE); + schedule_hrtimeout(&delay, HRTIMER_MODE_REL); } WRITE_ONCE(stutter_pause_test, 0); if (!torture_must_stop()) diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index a4020c0b4508..e1bf5228fb69 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -202,7 +202,7 @@ config DYNAMIC_FTRACE_WITH_REGS config DYNAMIC_FTRACE_WITH_DIRECT_CALLS def_bool y - depends on DYNAMIC_FTRACE + depends on DYNAMIC_FTRACE_WITH_REGS depends on HAVE_DYNAMIC_FTRACE_WITH_DIRECT_CALLS config FUNCTION_PROFILER diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index 4517c8b66518..ebadaa83502c 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -16,6 +16,9 @@ #include <linux/syscalls.h> #include <linux/error-injection.h> #include <linux/btf_ids.h> +#include 
<linux/bpf_lsm.h> + +#include <net/bpf_sk_storage.h> #include <uapi/linux/bpf.h> #include <uapi/linux/btf.h> @@ -181,6 +184,16 @@ bpf_probe_read_user_str_common(void *dst, u32 size, { int ret; + /* + * NB: We rely on strncpy_from_user() not copying junk past the NUL + * terminator into `dst`. + * + * strncpy_from_user() does long-sized strides in the fast path. If the + * strncpy does not mask out the bytes after the NUL in `unsafe_ptr`, + * then there could be junk after the NUL in `dst`. If user takes `dst` + * and keys a hash map with it, then semantically identical strings can + * occupy multiple entries in the map. + */ ret = strncpy_from_user_nofault(dst, unsafe_ptr, size); if (unlikely(ret < 0)) memset(dst, 0, size); @@ -1022,6 +1035,20 @@ const struct bpf_func_proto bpf_get_current_task_proto = { .ret_type = RET_INTEGER, }; +BPF_CALL_0(bpf_get_current_task_btf) +{ + return (unsigned long) current; +} + +BTF_ID_LIST_SINGLE(bpf_get_current_btf_ids, struct, task_struct) + +static const struct bpf_func_proto bpf_get_current_task_btf_proto = { + .func = bpf_get_current_task_btf, + .gpl_only = true, + .ret_type = RET_PTR_TO_BTF_ID, + .ret_btf_id = &bpf_get_current_btf_ids[0], +}; + BPF_CALL_2(bpf_current_task_under_cgroup, struct bpf_map *, map, u32, idx) { struct bpf_array *array = container_of(map, struct bpf_array, map); @@ -1086,7 +1113,7 @@ static int bpf_send_signal_common(u32 sig, enum pid_type type) return -EINVAL; work = this_cpu_ptr(&send_signal_work); - if (atomic_read(&work->irq_work.flags) & IRQ_WORK_BUSY) + if (irq_work_is_busy(&work->irq_work)) return -EBUSY; /* Add the current task, which is the target of sending signal, @@ -1164,7 +1191,11 @@ BTF_SET_END(btf_allowlist_d_path) static bool bpf_d_path_allowed(const struct bpf_prog *prog) { - return btf_id_set_contains(&btf_allowlist_d_path, prog->aux->attach_btf_id); + if (prog->type == BPF_PROG_TYPE_LSM) + return bpf_lsm_is_sleepable_hook(prog->aux->attach_btf_id); + + return btf_id_set_contains(&btf_allowlist_d_path, + prog->aux->attach_btf_id); } BTF_ID_LIST_SINGLE(bpf_d_path_btf_ids, struct, path) @@ -1198,7 +1229,7 @@ static int bpf_btf_printf_prepare(struct btf_ptr *ptr, u32 btf_ptr_size, *btf = bpf_get_btf_vmlinux(); if (IS_ERR_OR_NULL(*btf)) - return PTR_ERR(*btf); + return IS_ERR(*btf) ? PTR_ERR(*btf) : -EINVAL; if (ptr->type_id > 0) *btf_id = ptr->type_id; @@ -1259,12 +1290,16 @@ bpf_tracing_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_ktime_get_ns_proto; case BPF_FUNC_ktime_get_boot_ns: return &bpf_ktime_get_boot_ns_proto; + case BPF_FUNC_ktime_get_coarse_ns: + return &bpf_ktime_get_coarse_ns_proto; case BPF_FUNC_tail_call: return &bpf_tail_call_proto; case BPF_FUNC_get_current_pid_tgid: return &bpf_get_current_pid_tgid_proto; case BPF_FUNC_get_current_task: return &bpf_get_current_task_proto; + case BPF_FUNC_get_current_task_btf: + return &bpf_get_current_task_btf_proto; case BPF_FUNC_get_current_uid_gid: return &bpf_get_current_uid_gid_proto; case BPF_FUNC_get_current_comm: @@ -1327,9 +1362,9 @@ bpf_tracing_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return prog->aux->sleepable ? 
&bpf_copy_from_user_proto : NULL; case BPF_FUNC_snprintf_btf: return &bpf_snprintf_btf_proto; - case BPF_FUNC_bpf_per_cpu_ptr: + case BPF_FUNC_per_cpu_ptr: return &bpf_per_cpu_ptr_proto; - case BPF_FUNC_bpf_this_cpu_ptr: + case BPF_FUNC_this_cpu_ptr: return &bpf_this_cpu_ptr_proto; default: return NULL; @@ -1719,6 +1754,12 @@ tracing_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_skc_to_tcp_request_sock_proto; case BPF_FUNC_skc_to_udp6_sock: return &bpf_skc_to_udp6_sock_proto; + case BPF_FUNC_sk_storage_get: + return &bpf_sk_storage_get_tracing_proto; + case BPF_FUNC_sk_storage_delete: + return &bpf_sk_storage_delete_tracing_proto; + case BPF_FUNC_sock_from_file: + return &bpf_sock_from_file_proto; #endif case BPF_FUNC_seq_printf: return prog->expected_attach_type == BPF_TRACE_ITER ? @@ -2031,10 +2072,12 @@ struct bpf_raw_event_map *bpf_get_raw_tracepoint(const char *name) void bpf_put_raw_tracepoint(struct bpf_raw_event_map *btp) { - struct module *mod = __module_address((unsigned long)btp); + struct module *mod; - if (mod) - module_put(mod); + preempt_disable(); + mod = __module_address((unsigned long)btp); + module_put(mod); + preempt_enable(); } static __always_inline diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 8185f7240095..9c1bba8cc51b 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -1629,6 +1629,8 @@ static bool test_rec_ops_needs_regs(struct dyn_ftrace *rec) static struct ftrace_ops * ftrace_find_tramp_ops_any(struct dyn_ftrace *rec); static struct ftrace_ops * +ftrace_find_tramp_ops_any_other(struct dyn_ftrace *rec, struct ftrace_ops *op_exclude); +static struct ftrace_ops * ftrace_find_tramp_ops_next(struct dyn_ftrace *rec, struct ftrace_ops *ops); static bool __ftrace_hash_rec_update(struct ftrace_ops *ops, @@ -1778,7 +1780,7 @@ static bool __ftrace_hash_rec_update(struct ftrace_ops *ops, * to it. */ if (ftrace_rec_count(rec) == 1 && - ftrace_find_tramp_ops_any(rec)) + ftrace_find_tramp_ops_any_other(rec, ops)) rec->flags |= FTRACE_FL_TRAMP; else rec->flags &= ~FTRACE_FL_TRAMP; @@ -2245,6 +2247,24 @@ ftrace_find_tramp_ops_any(struct dyn_ftrace *rec) } static struct ftrace_ops * +ftrace_find_tramp_ops_any_other(struct dyn_ftrace *rec, struct ftrace_ops *op_exclude) +{ + struct ftrace_ops *op; + unsigned long ip = rec->ip; + + do_for_each_ftrace_op(op, ftrace_ops_list) { + + if (op == op_exclude || !op->trampoline) + continue; + + if (hash_contains_ip(ip, op->func_hash)) + return op; + } while_for_each_ftrace_op(op); + + return NULL; +} + +static struct ftrace_ops * ftrace_find_tramp_ops_next(struct dyn_ftrace *rec, struct ftrace_ops *op) { diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 7f45fd9d5a45..a6268e09160a 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -438,14 +438,16 @@ enum { }; /* * Used for which event context the event is in. - * NMI = 0 - * IRQ = 1 - * SOFTIRQ = 2 - * NORMAL = 3 + * TRANSITION = 0 + * NMI = 1 + * IRQ = 2 + * SOFTIRQ = 3 + * NORMAL = 4 * * See trace_recursive_lock() comment below for more details. */ enum { + RB_CTX_TRANSITION, RB_CTX_NMI, RB_CTX_IRQ, RB_CTX_SOFTIRQ, @@ -3014,10 +3016,10 @@ rb_wakeups(struct trace_buffer *buffer, struct ring_buffer_per_cpu *cpu_buffer) * a bit of overhead in something as critical as function tracing, * we use a bitmask trick. * - * bit 0 = NMI context - * bit 1 = IRQ context - * bit 2 = SoftIRQ context - * bit 3 = normal context. 
+ bit 1 = NMI context + bit 2 = IRQ context + bit 3 = SoftIRQ context + bit 4 = normal context. * * This works because this is the order of contexts that can * preempt other contexts. A SoftIRQ never preempts an IRQ @@ -3040,6 +3042,30 @@ rb_wakeups(struct trace_buffer *buffer, struct ring_buffer_per_cpu *cpu_buffer) * The least significant bit can be cleared this way, and it * just so happens that it is the same bit corresponding to * the current context. + * + * Now the TRANSITION bit breaks the above slightly. The TRANSITION bit + * is set when a recursion is detected at the current context, and if + * the TRANSITION bit is already set, it will fail the recursion. + * This is needed because there's a lag between the changing of + * interrupt context and updating the preempt count. In this case, + * a false positive will be found. To handle this, one extra recursion + * is allowed, and this is done by the TRANSITION bit. If the TRANSITION + * bit is already set, then it is considered a recursion and the function + * ends. Otherwise, the TRANSITION bit is set, and that bit is returned. + * + * On the trace_recursive_unlock(), the TRANSITION bit will be the first + * to be cleared. Even if it wasn't the context that set it. That is, + * if an interrupt comes in while NORMAL bit is set and the ring buffer + * is called before preempt_count() is updated, since the check will + * be on the NORMAL bit, the TRANSITION bit will then be set. If an + * NMI then comes in, it will set the NMI bit, but when the NMI code + * does the trace_recursive_unlock() it will clear the TRANSITION bit + * and leave the NMI bit set. But this is fine, because the interrupt + * code that set the TRANSITION bit will then clear the NMI bit when it + * calls trace_recursive_unlock(). If another NMI comes in, it will + * set the TRANSITION bit and continue. + * + * Note: The TRANSITION bit only handles a single transition between contexts. */ static __always_inline int @@ -3055,8 +3081,16 @@ trace_recursive_lock(struct ring_buffer_per_cpu *cpu_buffer) bit = pc & NMI_MASK ? RB_CTX_NMI : pc & HARDIRQ_MASK ? RB_CTX_IRQ : RB_CTX_SOFTIRQ; - if (unlikely(val & (1 << (bit + cpu_buffer->nest)))) - return 1; + if (unlikely(val & (1 << (bit + cpu_buffer->nest)))) { + /* + * It is possible that this was called by transitioning + * between interrupt contexts, and preempt_count() has not + * been updated yet. In this case, use the TRANSITION bit.
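The TRANSITION mechanism described above is easiest to see in a toy model: one bit per context plus one spare bit that absorbs the single false positive caused by the preempt_count() lag (illustrative userspace code, not the kernel functions):

/* Model of the ring buffer context-bit recursion check with TRANSITION. */
#include <stdio.h>

enum { CTX_TRANSITION, CTX_NMI, CTX_IRQ, CTX_SOFTIRQ, CTX_NORMAL };

static unsigned int current_context;	/* per ring buffer, per CPU in the kernel */

static int recursive_lock_model(int ctx_bit)
{
	unsigned int val = current_context;
	int bit = ctx_bit;

	if (val & (1U << bit)) {
		/*
		 * Same context bit already set: assume we are in the lag
		 * window between a context change and the preempt_count()
		 * update, and burn the single TRANSITION bit instead.
		 */
		bit = CTX_TRANSITION;
		if (val & (1U << bit))
			return 1;	/* real recursion: reject */
	}
	current_context = val | (1U << bit);
	return 0;
}

static void recursive_unlock_model(void)
{
	/* Clear the least significant set bit; TRANSITION goes first. */
	current_context &= current_context - 1;
}

int main(void)
{
	printf("%d\n", recursive_lock_model(CTX_NORMAL));	/* 0: locked */
	printf("%d\n", recursive_lock_model(CTX_NORMAL));	/* 0: transition */
	printf("%d\n", recursive_lock_model(CTX_NORMAL));	/* 1: rejected */
	recursive_unlock_model();				/* drops TRANSITION */
	recursive_unlock_model();				/* drops NORMAL */
	return 0;
}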
+ */ + bit = RB_CTX_TRANSITION; + if (val & (1 << (bit + cpu_buffer->nest))) + return 1; + } val |= (1 << (bit + cpu_buffer->nest)); cpu_buffer->current_context = val; @@ -3071,8 +3105,8 @@ trace_recursive_unlock(struct ring_buffer_per_cpu *cpu_buffer) cpu_buffer->current_context - (1 << cpu_buffer->nest); } -/* The recursive locking above uses 4 bits */ -#define NESTED_BITS 4 +/* The recursive locking above uses 5 bits */ +#define NESTED_BITS 5 /** * ring_buffer_nest_start - Allow to trace while nested @@ -3200,14 +3234,12 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer, /* See if we shot pass the end of this buffer page */ if (unlikely(write > BUF_PAGE_SIZE)) { - if (tail != w) { - /* before and after may now different, fix it up*/ - b_ok = rb_time_read(&cpu_buffer->before_stamp, &info->before); - a_ok = rb_time_read(&cpu_buffer->write_stamp, &info->after); - if (a_ok && b_ok && info->before != info->after) - (void)rb_time_cmpxchg(&cpu_buffer->before_stamp, - info->before, info->after); - } + /* before and after may now be different, fix it up */ + b_ok = rb_time_read(&cpu_buffer->before_stamp, &info->before); + a_ok = rb_time_read(&cpu_buffer->write_stamp, &info->after); + if (a_ok && b_ok && info->before != info->after) + (void)rb_time_cmpxchg(&cpu_buffer->before_stamp, + info->before, info->after); return rb_move_tail(cpu_buffer, tail, info); } @@ -3253,11 +3285,11 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer, ts = rb_time_stamp(cpu_buffer->buffer); barrier(); /*E*/ if (write == (local_read(&tail_page->write) & RB_WRITE_MASK) && - info->after < ts) { + info->after < ts && + rb_time_cmpxchg(&cpu_buffer->write_stamp, + info->after, ts)) { /* Nothing came after this event between C and E */ info->delta = ts - info->after; - (void)rb_time_cmpxchg(&cpu_buffer->write_stamp, - info->after, info->ts); info->ts = ts; } else { /* diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 528971714fc6..06134189e9a7 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -163,7 +163,8 @@ static union trace_eval_map_item *trace_eval_maps; #endif /* CONFIG_TRACE_EVAL_MAP_FILE */ int tracing_set_tracer(struct trace_array *tr, const char *buf); -static void ftrace_trace_userstack(struct trace_buffer *buffer, +static void ftrace_trace_userstack(struct trace_array *tr, + struct trace_buffer *buffer, unsigned long flags, int pc); #define MAX_TRACER_SIZE 100 @@ -2750,7 +2751,7 @@ trace_event_buffer_lock_reserve(struct trace_buffer **current_rb, /* * If tracing is off, but we have triggers enabled * we still need to look at the event data. Use the temp_buffer - * to store the trace event for the tigger to use. It's recusive + * to store the trace event for the trigger to use. It's recursive * safe and will not be recorded anywhere. */ if (!entry && trace_file->flags & EVENT_FILE_FL_TRIGGER_COND) { @@ -2870,7 +2871,7 @@ void trace_buffer_unlock_commit_regs(struct trace_array *tr, * two. They are not that meaningful. */ ftrace_trace_stack(tr, buffer, flags, regs ? 0 : STACK_SKIP, pc, regs); - ftrace_trace_userstack(buffer, flags, pc); + ftrace_trace_userstack(tr, buffer, flags, pc); } /* @@ -2952,7 +2953,7 @@ static void __ftrace_trace_stack(struct trace_buffer *buffer, stackidx = __this_cpu_inc_return(ftrace_stack_reserve) - 1; /* This should never happen.
If it does, yell once and skip */ - if (WARN_ON_ONCE(stackidx > FTRACE_KSTACK_NESTING)) + if (WARN_ON_ONCE(stackidx >= FTRACE_KSTACK_NESTING)) goto out; /* @@ -3056,13 +3057,14 @@ EXPORT_SYMBOL_GPL(trace_dump_stack); static DEFINE_PER_CPU(int, user_stack_count); static void -ftrace_trace_userstack(struct trace_buffer *buffer, unsigned long flags, int pc) +ftrace_trace_userstack(struct trace_array *tr, + struct trace_buffer *buffer, unsigned long flags, int pc) { struct trace_event_call *call = &event_user_stack; struct ring_buffer_event *event; struct userstack_entry *entry; - if (!(global_trace.trace_flags & TRACE_ITER_USERSTACKTRACE)) + if (!(tr->trace_flags & TRACE_ITER_USERSTACKTRACE)) return; /* @@ -3101,7 +3103,8 @@ ftrace_trace_userstack(struct trace_buffer *buffer, unsigned long flags, int pc) preempt_enable(); } #else /* CONFIG_USER_STACKTRACE_SUPPORT */ -static void ftrace_trace_userstack(struct trace_buffer *buffer, +static void ftrace_trace_userstack(struct trace_array *tr, + struct trace_buffer *buffer, unsigned long flags, int pc) { } @@ -3132,7 +3135,7 @@ static char *get_trace_buf(void) /* Interrupts must see nesting incremented before we use the buffer */ barrier(); - return &buffer->buffer[buffer->nesting][0]; + return &buffer->buffer[buffer->nesting - 1][0]; } static void put_trace_buf(void) @@ -3534,7 +3537,7 @@ __find_next_entry(struct trace_iterator *iter, int *ent_cpu, } #define STATIC_TEMP_BUF_SIZE 128 -static char static_temp_buf[STATIC_TEMP_BUF_SIZE]; +static char static_temp_buf[STATIC_TEMP_BUF_SIZE] __aligned(4); /* Find the next real entry, without updating the iterator itself */ struct trace_entry *trace_find_next_entry(struct trace_iterator *iter, diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index f3f5e77123ad..1dadef445cd1 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -637,6 +637,12 @@ enum { * function is called to clear it. */ TRACE_GRAPH_NOTRACE_BIT, + + /* + * When transitioning between context, the preempt_count() may + * not be correct. Allow for a single recursion to cover this case. + */ + TRACE_TRANSITION_BIT, }; #define trace_recursion_set(bit) do { (current)->trace_recursion |= (1<<(bit)); } while (0) @@ -691,14 +697,27 @@ static __always_inline int trace_test_and_set_recursion(int start, int max) return 0; bit = trace_get_context_bit() + start; - if (unlikely(val & (1 << bit))) - return -1; + if (unlikely(val & (1 << bit))) { + /* + * It could be that preempt_count has not been updated during + * a switch between contexts. Allow for a single recursion. + */ + bit = TRACE_TRANSITION_BIT; + if (trace_recursion_test(bit)) + return -1; + trace_recursion_set(bit); + barrier(); + return bit + 1; + } + + /* Normal check passed, clear the transition to allow it again */ + trace_recursion_clear(TRACE_TRANSITION_BIT); val |= 1 << bit; current->trace_recursion = val; barrier(); - return bit; + return bit + 1; } static __always_inline void trace_clear_recursion(int bit) @@ -708,6 +727,7 @@ static __always_inline void trace_clear_recursion(int bit) if (!bit) return; + bit--; bit = 1 << bit; val &= ~bit; diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index 47a71f96e5bc..adf65b502453 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -3428,10 +3428,10 @@ static __init int event_trace_enable(void) * initialize events and perhaps start any events that are on the * command line. 
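The get_trace_buf() fix in the hunk above is the classic increment-then-index off-by-one: nesting is bumped first, so the live slot is nesting - 1. A self-contained model of the per-context scratch buffers (sizes and names assumed for illustration):

/* Model of the per-CPU nested trace buffers, one slot per context level. */
#include <stdio.h>

#define TRACE_BUF_SIZE	64
#define NESTING_LEVELS	4	/* normal, softirq, irq, NMI */

static struct {
	int nesting;
	char buffer[NESTING_LEVELS][TRACE_BUF_SIZE];
} trace_percpu_buffer;		/* one instance per CPU in the kernel */

static char *get_trace_buf_model(void)
{
	if (trace_percpu_buffer.nesting >= NESTING_LEVELS)
		return NULL;	/* nested too deep: drop the event */

	trace_percpu_buffer.nesting++;
	/* nesting was just incremented, so the live slot is nesting - 1 */
	return trace_percpu_buffer.buffer[trace_percpu_buffer.nesting - 1];
}

static void put_trace_buf_model(void)
{
	trace_percpu_buffer.nesting--;
}

int main(void)
{
	char *outer = get_trace_buf_model();
	char *inner = get_trace_buf_model();	/* e.g. taken from an interrupt */

	printf("distinct buffers: %s\n", outer != inner ? "yes" : "no");
	put_trace_buf_model();
	put_trace_buf_model();
	return 0;
}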
Unfortunately, there are some events that will not * start this early, like the system call tracepoints that need - * to set the TIF_SYSCALL_TRACEPOINT flag of pid 1. But event_trace_enable() - * is called before pid 1 starts, and this flag is never set, making - * the syscall tracepoint never get reached, but the event is enabled - * regardless (and not doing anything). + * to set the %SYSCALL_WORK_SYSCALL_TRACEPOINT flag of pid 1. But + * event_trace_enable() is called before pid 1 starts, and this flag + * is never set, making the syscall tracepoint never get reached, but + * the event is enabled regardless (and not doing anything). */ static __init int event_trace_enable_again(void) { diff --git a/kernel/trace/trace_events_synth.c b/kernel/trace/trace_events_synth.c index 3212e2c653b3..881df991742a 100644 --- a/kernel/trace/trace_events_synth.c +++ b/kernel/trace/trace_events_synth.c @@ -584,7 +584,8 @@ static struct synth_field *parse_synth_field(int argc, const char **argv, { struct synth_field *field; const char *prefix = NULL, *field_type = argv[0], *field_name, *array; - int len, ret = 0; + int len, ret = -ENOMEM; + struct seq_buf s; ssize_t size; if (field_type[0] == ';') @@ -616,10 +617,9 @@ static struct synth_field *parse_synth_field(int argc, const char **argv, len--; field->name = kmemdup_nul(field_name, len, GFP_KERNEL); - if (!field->name) { - ret = -ENOMEM; + if (!field->name) goto free; - } + if (!is_good_name(field->name)) { synth_err(SYNTH_ERR_BAD_NAME, errpos(field_name)); ret = -EINVAL; @@ -630,29 +630,29 @@ static struct synth_field *parse_synth_field(int argc, const char **argv, field_type++; len = strlen(field_type) + 1; - if (array) { - int l = strlen(array); + if (array) + len += strlen(array); - if (l && array[l - 1] == ';') - l--; - len += l; - } if (prefix) len += strlen(prefix); field->type = kzalloc(len, GFP_KERNEL); - if (!field->type) { - ret = -ENOMEM; + if (!field->type) goto free; - } + + seq_buf_init(&s, field->type, len); if (prefix) - strcat(field->type, prefix); - strcat(field->type, field_type); + seq_buf_puts(&s, prefix); + seq_buf_puts(&s, field_type); if (array) { - strcat(field->type, array); - if (field->type[len - 1] == ';') - field->type[len - 1] = '\0'; + seq_buf_puts(&s, array); + if (s.buffer[s.len - 1] == ';') + s.len--; } + if (WARN_ON_ONCE(!seq_buf_buffer_left(&s))) + goto free; + + s.buffer[s.len] = '\0'; size = synth_field_size(field->type); if (size < 0) { @@ -663,14 +663,19 @@ static struct synth_field *parse_synth_field(int argc, const char **argv, if (synth_field_is_string(field->type)) { char *type; - type = kzalloc(sizeof("__data_loc ") + strlen(field->type) + 1, GFP_KERNEL); - if (!type) { - ret = -ENOMEM; + len = sizeof("__data_loc ") + strlen(field->type) + 1; + type = kzalloc(len, GFP_KERNEL); + if (!type) goto free; - } - strcat(type, "__data_loc "); - strcat(type, field->type); + seq_buf_init(&s, type, len); + seq_buf_puts(&s, "__data_loc "); + seq_buf_puts(&s, field->type); + + if (WARN_ON_ONCE(!seq_buf_buffer_left(&s))) + goto free; + s.buffer[s.len] = '\0'; + kfree(field->type); field->type = type; diff --git a/kernel/trace/trace_hwlat.c b/kernel/trace/trace_hwlat.c index c9ad5c6fbaad..d071fc271eef 100644 --- a/kernel/trace/trace_hwlat.c +++ b/kernel/trace/trace_hwlat.c @@ -368,7 +368,7 @@ static int start_kthread(struct trace_array *tr) struct task_struct *kthread; int next_cpu; - if (WARN_ON(hwlat_kthread)) + if (hwlat_kthread) return 0; /* Just pick the first CPU on first iteration */ diff --git 
diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c
index b911e9f6d9f5..97c7a7782db7 100644
--- a/kernel/trace/trace_kprobe.c
+++ b/kernel/trace/trace_kprobe.c
@@ -1731,7 +1731,8 @@ NOKPROBE_SYMBOL(kprobe_dispatcher);
 static int
 kretprobe_dispatcher(struct kretprobe_instance *ri, struct pt_regs *regs)
 {
-	struct trace_kprobe *tk = container_of(ri->rp, struct trace_kprobe, rp);
+	struct kretprobe *rp = get_kretprobe(ri);
+	struct trace_kprobe *tk = container_of(rp, struct trace_kprobe, rp);
 
 	raw_cpu_inc(*tk->nhit);
diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c
index b5e3496cf803..4738ad48a667 100644
--- a/kernel/trace/trace_selftest.c
+++ b/kernel/trace/trace_selftest.c
@@ -492,8 +492,13 @@ trace_selftest_function_recursion(void)
 	unregister_ftrace_function(&test_rec_probe);
 
 	ret = -1;
-	if (trace_selftest_recursion_cnt != 1) {
-		pr_cont("*callback not called once (%d)* ",
+	/*
+	 * Recursion allows for transitions between context,
+	 * and may call the callback twice.
+	 */
+	if (trace_selftest_recursion_cnt != 1 &&
+	    trace_selftest_recursion_cnt != 2) {
+		pr_cont("*callback not called once (or twice) (%d)* ",
 			trace_selftest_recursion_cnt);
 		goto out;
 	}
diff --git a/kernel/tracepoint.c b/kernel/tracepoint.c
index 26efd22f0633..7261fa0f5e3c 100644
--- a/kernel/tracepoint.c
+++ b/kernel/tracepoint.c
@@ -50,7 +50,7 @@ static bool ok_to_free_tracepoints;
  */
 struct tp_probes {
 	struct rcu_head rcu;
-	struct tracepoint_func probes[0];
+	struct tracepoint_func probes[];
 };
 
 static inline void *allocate_probes(int count)
@@ -594,7 +594,7 @@ int syscall_regfunc(void)
 	if (!sys_tracepoint_refcount) {
 		read_lock(&tasklist_lock);
 		for_each_process_thread(p, t) {
-			set_tsk_thread_flag(t, TIF_SYSCALL_TRACEPOINT);
+			set_task_syscall_work(t, SYSCALL_TRACEPOINT);
 		}
 		read_unlock(&tasklist_lock);
 	}
@@ -611,7 +611,7 @@ void syscall_unregfunc(void)
 	if (!sys_tracepoint_refcount) {
 		read_lock(&tasklist_lock);
 		for_each_process_thread(p, t) {
-			clear_tsk_thread_flag(t, TIF_SYSCALL_TRACEPOINT);
+			clear_task_syscall_work(t, SYSCALL_TRACEPOINT);
 		}
 		read_unlock(&tasklist_lock);
 	}
diff --git a/kernel/user.c b/kernel/user.c
index b1635d94a1f2..a2478cddf536 100644
--- a/kernel/user.c
+++ b/kernel/user.c
@@ -55,7 +55,7 @@ struct user_namespace init_user_ns = {
 			},
 		},
 	},
-	.count = ATOMIC_INIT(3),
+	.ns.count = REFCOUNT_INIT(3),
 	.owner = GLOBAL_ROOT_UID,
 	.group = GLOBAL_ROOT_GID,
 	.ns.inum = PROC_USER_INIT_INO,
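One detail worth calling out in the hunks above: tracepoint.c converts a GNU zero-length array (probes[0]) into a C99 flexible array member (probes[]), which gives the compiler and fortified memory checks visibility into the real object bounds. A self-contained sketch of how such a structure is sized and allocated follows; the struct names are invented for illustration, and the kernel itself would use struct_size() for overflow-checked arithmetic.

#include <stdio.h>
#include <stdlib.h>

struct func_entry {
	void *func;
	void *data;
};

struct probes {
	size_t            count;
	struct func_entry funcs[];	/* C99 flexible array member */
};

static struct probes *allocate_probes(size_t count)
{
	/*
	 * sizeof(*p) covers only the fixed header; the trailing array is
	 * sized explicitly. In-kernel code would write
	 * struct_size(p, funcs, count) to get overflow checking for free.
	 */
	struct probes *p = malloc(sizeof(*p) + count * sizeof(p->funcs[0]));

	if (p)
		p->count = count;
	return p;
}

int main(void)
{
	struct probes *p = allocate_probes(4);

	if (!p)
		return 1;
	printf("header %zu bytes + %zu entries of %zu bytes each\n",
	       sizeof(*p), p->count, sizeof(p->funcs[0]));
	free(p);
	return 0;
}

Unlike probes[0], the [] form is standard C and lets array-bounds instrumentation distinguish a real overflow from intentional trailing storage.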
diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c
index e703d5d9cbe8..af612945a4d0 100644
--- a/kernel/user_namespace.c
+++ b/kernel/user_namespace.c
@@ -111,7 +111,7 @@ int create_user_ns(struct cred *new)
 		goto fail_free;
 	ns->ns.ops = &userns_operations;
 
-	atomic_set(&ns->count, 1);
+	refcount_set(&ns->ns.count, 1);
 	/* Leave the new->user_ns reference with the new user namespace. */
 	ns->parent = parent_ns;
 	ns->level = parent_ns->level + 1;
@@ -197,7 +197,7 @@ static void free_user_ns(struct work_struct *work)
 		kmem_cache_free(user_ns_cachep, ns);
 		dec_user_namespaces(ucounts);
 		ns = parent;
-	} while (atomic_dec_and_test(&parent->count));
+	} while (refcount_dec_and_test(&parent->ns.count));
 }
 
 void __put_user_ns(struct user_namespace *ns)
diff --git a/kernel/utsname.c b/kernel/utsname.c
index e488d0e2ab45..b1ac3ca870f2 100644
--- a/kernel/utsname.c
+++ b/kernel/utsname.c
@@ -33,7 +33,7 @@ static struct uts_namespace *create_uts_ns(void)
 
 	uts_ns = kmem_cache_alloc(uts_ns_cache, GFP_KERNEL);
 	if (uts_ns)
-		kref_init(&uts_ns->kref);
+		refcount_set(&uts_ns->ns.count, 1);
 	return uts_ns;
 }
 
@@ -103,11 +103,8 @@ struct uts_namespace *copy_utsname(unsigned long flags,
 	return new_ns;
 }
 
-void free_uts_ns(struct kref *kref)
+void free_uts_ns(struct uts_namespace *ns)
 {
-	struct uts_namespace *ns;
-
-	ns = container_of(kref, struct uts_namespace, kref);
 	dec_uts_namespaces(ns->ucounts);
 	put_user_ns(ns->user_ns);
 	ns_free_inum(&ns->ns);
diff --git a/kernel/watchdog.c b/kernel/watchdog.c
index 5abb5b22ad13..71109065bd8e 100644
--- a/kernel/watchdog.c
+++ b/kernel/watchdog.c
@@ -44,8 +44,6 @@ int __read_mostly soft_watchdog_user_enabled = 1;
 int __read_mostly watchdog_thresh = 10;
 static int __read_mostly nmi_watchdog_available;
 
-static struct cpumask watchdog_allowed_mask __read_mostly;
-
 struct cpumask watchdog_cpumask __read_mostly;
 unsigned long *watchdog_cpumask_bits = cpumask_bits(&watchdog_cpumask);
 
@@ -162,6 +160,8 @@ static void lockup_detector_update_enable(void)
 int __read_mostly sysctl_softlockup_all_cpu_backtrace;
 #endif
 
+static struct cpumask watchdog_allowed_mask __read_mostly;
+
 /* Global variables, exported for sysctl */
 unsigned int __read_mostly softlockup_panic =
 			CONFIG_BOOTPARAM_SOFTLOCKUP_PANIC_VALUE;
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 437935e7a199..b5295a0b0536 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -1327,6 +1327,9 @@ static void insert_work(struct pool_workqueue *pwq, struct work_struct *work,
 {
 	struct worker_pool *pool = pwq->pool;
 
+	/* record the work call stack in order to print it in KASAN reports */
+	kasan_record_aux_stack(work);
+
 	/* we own @work, set data and link */
 	set_work_pwq(work, pwq, extra_flags);
 	list_add_tail(&work->entry, head);
@@ -4908,6 +4911,10 @@ static void unbind_workers(int cpu)
 		pool->flags |= POOL_DISASSOCIATED;
 
 		raw_spin_unlock_irq(&pool->lock);
+
+		for_each_pool_worker(worker, pool)
+			WARN_ON_ONCE(set_cpus_allowed_ptr(worker->task, cpu_active_mask) < 0);
+
 		mutex_unlock(&wq_pool_attach_mutex);
 
 		/*
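The user.c, user_namespace.c and utsname.c hunks above converge on one pattern: the lifetime count moves into the common ns struct as a refcount_t, and teardown calls refcount_dec_and_test() directly instead of bouncing through a kref release callback and container_of(). The sketch below models that dec-and-test protocol, including the parent-walk loop of free_user_ns(), using plain C11 atomics; the types and helpers are simplified stand-ins and lack refcount_t's saturation on overflow and underflow.

#include <stdatomic.h>
#include <stdio.h>
#include <stdlib.h>

struct ns_common {
	atomic_uint count;		/* models refcount_t ns.count */
};

struct user_ns {
	struct ns_common ns;
	struct user_ns  *parent;
};

static struct user_ns *ns_alloc(struct user_ns *parent)
{
	struct user_ns *p = calloc(1, sizeof(*p));

	if (!p)
		abort();
	atomic_init(&p->ns.count, 1);	/* models refcount_set(&ns->ns.count, 1) */
	p->parent = parent;
	return p;
}

/* Models refcount_dec_and_test(): true when the count drops to zero. */
static int ns_put(struct ns_common *ns)
{
	return atomic_fetch_sub(&ns->count, 1) == 1;
}

/*
 * Models the do/while walk in free_user_ns(): freeing a child may drop
 * the last reference on its parent, which is then freed in the same pass.
 */
static void ns_free_chain(struct user_ns *ns)
{
	struct user_ns *parent;

	do {
		parent = ns->parent;
		printf("freeing ns %p\n", (void *)ns);
		free(ns);
		ns = parent;
	} while (ns && ns_put(&ns->ns));
}

int main(void)
{
	struct user_ns *root = ns_alloc(NULL);
	struct user_ns *child = ns_alloc(root);	/* holds the only ref on root */

	ns_free_chain(child);	/* frees child, then root in the same walk */
	return 0;
}

Keeping the count in the shared ns struct is what lets free_uts_ns() take the namespace pointer directly: with no kref embedded at an arbitrary offset, the container_of() dance in the release path disappears.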