diff options
Diffstat (limited to 'kernel')
42 files changed, 455 insertions, 265 deletions
diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index 7d40da240891..ed2075884724 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -3470,6 +3470,7 @@ static u8 bpf_ctx_convert_map[] = { [_id] = __ctx_convert##_id, #include <linux/bpf_types.h> #undef BPF_PROG_TYPE + 0, /* avoid empty array */ }; #undef BPF_MAP_TYPE diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c index 9f90d3c92bda..9e43b72eb619 100644 --- a/kernel/bpf/cgroup.c +++ b/kernel/bpf/cgroup.c @@ -35,8 +35,8 @@ void cgroup_bpf_offline(struct cgroup *cgrp) */ static void cgroup_bpf_release(struct work_struct *work) { - struct cgroup *cgrp = container_of(work, struct cgroup, - bpf.release_work); + struct cgroup *p, *cgrp = container_of(work, struct cgroup, + bpf.release_work); enum bpf_cgroup_storage_type stype; struct bpf_prog_array *old_array; unsigned int type; @@ -65,6 +65,9 @@ static void cgroup_bpf_release(struct work_struct *work) mutex_unlock(&cgroup_mutex); + for (p = cgroup_parent(cgrp); p; p = cgroup_parent(p)) + cgroup_bpf_put(p); + percpu_ref_exit(&cgrp->bpf.refcnt); cgroup_put(cgrp); } @@ -199,6 +202,7 @@ int cgroup_bpf_inherit(struct cgroup *cgrp) */ #define NR ARRAY_SIZE(cgrp->bpf.effective) struct bpf_prog_array *arrays[NR] = {}; + struct cgroup *p; int ret, i; ret = percpu_ref_init(&cgrp->bpf.refcnt, cgroup_bpf_release_fn, 0, @@ -206,6 +210,9 @@ int cgroup_bpf_inherit(struct cgroup *cgrp) if (ret) return ret; + for (p = cgroup_parent(cgrp); p; p = cgroup_parent(p)) + cgroup_bpf_get(p); + for (i = 0; i < NR; i++) INIT_LIST_HEAD(&cgrp->bpf.progs[i]); @@ -1341,7 +1348,7 @@ static u32 sysctl_convert_ctx_access(enum bpf_access_type type, *insn++ = BPF_LDX_MEM( BPF_SIZE(si->code), si->dst_reg, si->src_reg, bpf_target_off(struct bpf_sysctl_kern, write, - FIELD_SIZEOF(struct bpf_sysctl_kern, + sizeof_field(struct bpf_sysctl_kern, write), target_size)); break; diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 49e32acad7d8..af6b738cf435 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -2043,23 +2043,28 @@ static void bpf_free_cgroup_storage(struct bpf_prog_aux *aux) for_each_cgroup_storage_type(stype) { if (!aux->cgroup_storage[stype]) continue; - bpf_cgroup_storage_release(aux->prog, - aux->cgroup_storage[stype]); + bpf_cgroup_storage_release(aux, aux->cgroup_storage[stype]); } } -static void bpf_free_used_maps(struct bpf_prog_aux *aux) +void __bpf_free_used_maps(struct bpf_prog_aux *aux, + struct bpf_map **used_maps, u32 len) { struct bpf_map *map; - int i; + u32 i; bpf_free_cgroup_storage(aux); - for (i = 0; i < aux->used_map_cnt; i++) { - map = aux->used_maps[i]; + for (i = 0; i < len; i++) { + map = used_maps[i]; if (map->ops->map_poke_untrack) map->ops->map_poke_untrack(map, aux); bpf_map_put(map); } +} + +static void bpf_free_used_maps(struct bpf_prog_aux *aux) +{ + __bpf_free_used_maps(aux, aux->used_maps, aux->used_map_cnt); kfree(aux->used_maps); } diff --git a/kernel/bpf/local_storage.c b/kernel/bpf/local_storage.c index 2ba750725cb2..33d01866bcc2 100644 --- a/kernel/bpf/local_storage.c +++ b/kernel/bpf/local_storage.c @@ -20,7 +20,7 @@ struct bpf_cgroup_storage_map { struct bpf_map map; spinlock_t lock; - struct bpf_prog *prog; + struct bpf_prog_aux *aux; struct rb_root root; struct list_head list; }; @@ -357,7 +357,7 @@ static int cgroup_storage_check_btf(const struct bpf_map *map, * The first field must be a 64 bit integer at 0 offset. */ m = (struct btf_member *)(key_type + 1); - size = FIELD_SIZEOF(struct bpf_cgroup_storage_key, cgroup_inode_id); + size = sizeof_field(struct bpf_cgroup_storage_key, cgroup_inode_id); if (!btf_member_is_reg_int(btf, key_type, m, 0, size)) return -EINVAL; @@ -366,7 +366,7 @@ static int cgroup_storage_check_btf(const struct bpf_map *map, */ m++; offset = offsetof(struct bpf_cgroup_storage_key, attach_type); - size = FIELD_SIZEOF(struct bpf_cgroup_storage_key, attach_type); + size = sizeof_field(struct bpf_cgroup_storage_key, attach_type); if (!btf_member_is_reg_int(btf, key_type, m, offset, size)) return -EINVAL; @@ -420,7 +420,7 @@ const struct bpf_map_ops cgroup_storage_map_ops = { .map_seq_show_elem = cgroup_storage_seq_show_elem, }; -int bpf_cgroup_storage_assign(struct bpf_prog *prog, struct bpf_map *_map) +int bpf_cgroup_storage_assign(struct bpf_prog_aux *aux, struct bpf_map *_map) { enum bpf_cgroup_storage_type stype = cgroup_storage_type(_map); struct bpf_cgroup_storage_map *map = map_to_storage(_map); @@ -428,14 +428,14 @@ int bpf_cgroup_storage_assign(struct bpf_prog *prog, struct bpf_map *_map) spin_lock_bh(&map->lock); - if (map->prog && map->prog != prog) + if (map->aux && map->aux != aux) goto unlock; - if (prog->aux->cgroup_storage[stype] && - prog->aux->cgroup_storage[stype] != _map) + if (aux->cgroup_storage[stype] && + aux->cgroup_storage[stype] != _map) goto unlock; - map->prog = prog; - prog->aux->cgroup_storage[stype] = _map; + map->aux = aux; + aux->cgroup_storage[stype] = _map; ret = 0; unlock: spin_unlock_bh(&map->lock); @@ -443,16 +443,16 @@ unlock: return ret; } -void bpf_cgroup_storage_release(struct bpf_prog *prog, struct bpf_map *_map) +void bpf_cgroup_storage_release(struct bpf_prog_aux *aux, struct bpf_map *_map) { enum bpf_cgroup_storage_type stype = cgroup_storage_type(_map); struct bpf_cgroup_storage_map *map = map_to_storage(_map); spin_lock_bh(&map->lock); - if (map->prog == prog) { - WARN_ON(prog->aux->cgroup_storage[stype] != _map); - map->prog = NULL; - prog->aux->cgroup_storage[stype] = NULL; + if (map->aux == aux) { + WARN_ON(aux->cgroup_storage[stype] != _map); + map->aux = NULL; + aux->cgroup_storage[stype] = NULL; } spin_unlock_bh(&map->lock); } diff --git a/kernel/bpf/tnum.c b/kernel/bpf/tnum.c index ca52b9642943..d4f335a9a899 100644 --- a/kernel/bpf/tnum.c +++ b/kernel/bpf/tnum.c @@ -44,14 +44,19 @@ struct tnum tnum_rshift(struct tnum a, u8 shift) return TNUM(a.value >> shift, a.mask >> shift); } -struct tnum tnum_arshift(struct tnum a, u8 min_shift) +struct tnum tnum_arshift(struct tnum a, u8 min_shift, u8 insn_bitness) { /* if a.value is negative, arithmetic shifting by minimum shift * will have larger negative offset compared to more shifting. * If a.value is nonnegative, arithmetic shifting by minimum shift * will have larger positive offset compare to more shifting. */ - return TNUM((s64)a.value >> min_shift, (s64)a.mask >> min_shift); + if (insn_bitness == 32) + return TNUM((u32)(((s32)a.value) >> min_shift), + (u32)(((s32)a.mask) >> min_shift)); + else + return TNUM((s64)a.value >> min_shift, + (s64)a.mask >> min_shift); } struct tnum tnum_add(struct tnum a, struct tnum b) diff --git a/kernel/bpf/trampoline.c b/kernel/bpf/trampoline.c index 7e89f1f49d77..23b0d5cfd47e 100644 --- a/kernel/bpf/trampoline.c +++ b/kernel/bpf/trampoline.c @@ -3,6 +3,7 @@ #include <linux/hash.h> #include <linux/bpf.h> #include <linux/filter.h> +#include <linux/ftrace.h> /* btf_vmlinux has ~22k attachable functions. 1k htab is enough. */ #define TRAMPOLINE_HASH_BITS 10 @@ -59,6 +60,60 @@ out: return tr; } +static int is_ftrace_location(void *ip) +{ + long addr; + + addr = ftrace_location((long)ip); + if (!addr) + return 0; + if (WARN_ON_ONCE(addr != (long)ip)) + return -EFAULT; + return 1; +} + +static int unregister_fentry(struct bpf_trampoline *tr, void *old_addr) +{ + void *ip = tr->func.addr; + int ret; + + if (tr->func.ftrace_managed) + ret = unregister_ftrace_direct((long)ip, (long)old_addr); + else + ret = bpf_arch_text_poke(ip, BPF_MOD_CALL, old_addr, NULL); + return ret; +} + +static int modify_fentry(struct bpf_trampoline *tr, void *old_addr, void *new_addr) +{ + void *ip = tr->func.addr; + int ret; + + if (tr->func.ftrace_managed) + ret = modify_ftrace_direct((long)ip, (long)old_addr, (long)new_addr); + else + ret = bpf_arch_text_poke(ip, BPF_MOD_CALL, old_addr, new_addr); + return ret; +} + +/* first time registering */ +static int register_fentry(struct bpf_trampoline *tr, void *new_addr) +{ + void *ip = tr->func.addr; + int ret; + + ret = is_ftrace_location(ip); + if (ret < 0) + return ret; + tr->func.ftrace_managed = ret; + + if (tr->func.ftrace_managed) + ret = register_ftrace_direct((long)ip, (long)new_addr); + else + ret = bpf_arch_text_poke(ip, BPF_MOD_CALL, NULL, new_addr); + return ret; +} + /* Each call __bpf_prog_enter + call bpf_func + call __bpf_prog_exit is ~50 * bytes on x86. Pick a number to fit into PAGE_SIZE / 2 */ @@ -77,8 +132,7 @@ static int bpf_trampoline_update(struct bpf_trampoline *tr) int err; if (fentry_cnt + fexit_cnt == 0) { - err = bpf_arch_text_poke(tr->func.addr, BPF_MOD_CALL, - old_image, NULL); + err = unregister_fentry(tr, old_image); tr->selector = 0; goto out; } @@ -105,12 +159,10 @@ static int bpf_trampoline_update(struct bpf_trampoline *tr) if (tr->selector) /* progs already running at this address */ - err = bpf_arch_text_poke(tr->func.addr, BPF_MOD_CALL, - old_image, new_image); + err = modify_fentry(tr, old_image, new_image); else /* first time registering */ - err = bpf_arch_text_poke(tr->func.addr, BPF_MOD_CALL, NULL, - new_image); + err = register_fentry(tr, new_image); if (err) goto out; tr->selector++; diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 034ef81f935b..7d530ce8719d 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -907,7 +907,8 @@ static const int caller_saved[CALLER_SAVED_REGS] = { BPF_REG_0, BPF_REG_1, BPF_REG_2, BPF_REG_3, BPF_REG_4, BPF_REG_5 }; -static void __mark_reg_not_init(struct bpf_reg_state *reg); +static void __mark_reg_not_init(const struct bpf_verifier_env *env, + struct bpf_reg_state *reg); /* Mark the unknown part of a register (variable offset or scalar value) as * known to have the value @imm. @@ -945,7 +946,7 @@ static void mark_reg_known_zero(struct bpf_verifier_env *env, verbose(env, "mark_reg_known_zero(regs, %u)\n", regno); /* Something bad happened, let's kill all regs */ for (regno = 0; regno < MAX_BPF_REG; regno++) - __mark_reg_not_init(regs + regno); + __mark_reg_not_init(env, regs + regno); return; } __mark_reg_known_zero(regs + regno); @@ -1054,7 +1055,8 @@ static void __mark_reg_unbounded(struct bpf_reg_state *reg) } /* Mark a register as having a completely unknown (scalar) value. */ -static void __mark_reg_unknown(struct bpf_reg_state *reg) +static void __mark_reg_unknown(const struct bpf_verifier_env *env, + struct bpf_reg_state *reg) { /* * Clear type, id, off, and union(map_ptr, range) and @@ -1064,6 +1066,8 @@ static void __mark_reg_unknown(struct bpf_reg_state *reg) reg->type = SCALAR_VALUE; reg->var_off = tnum_unknown; reg->frameno = 0; + reg->precise = env->subprog_cnt > 1 || !env->allow_ptr_leaks ? + true : false; __mark_reg_unbounded(reg); } @@ -1074,19 +1078,16 @@ static void mark_reg_unknown(struct bpf_verifier_env *env, verbose(env, "mark_reg_unknown(regs, %u)\n", regno); /* Something bad happened, let's kill all regs except FP */ for (regno = 0; regno < BPF_REG_FP; regno++) - __mark_reg_not_init(regs + regno); + __mark_reg_not_init(env, regs + regno); return; } - regs += regno; - __mark_reg_unknown(regs); - /* constant backtracking is enabled for root without bpf2bpf calls */ - regs->precise = env->subprog_cnt > 1 || !env->allow_ptr_leaks ? - true : false; + __mark_reg_unknown(env, regs + regno); } -static void __mark_reg_not_init(struct bpf_reg_state *reg) +static void __mark_reg_not_init(const struct bpf_verifier_env *env, + struct bpf_reg_state *reg) { - __mark_reg_unknown(reg); + __mark_reg_unknown(env, reg); reg->type = NOT_INIT; } @@ -1097,10 +1098,10 @@ static void mark_reg_not_init(struct bpf_verifier_env *env, verbose(env, "mark_reg_not_init(regs, %u)\n", regno); /* Something bad happened, let's kill all regs except FP */ for (regno = 0; regno < BPF_REG_FP; regno++) - __mark_reg_not_init(regs + regno); + __mark_reg_not_init(env, regs + regno); return; } - __mark_reg_not_init(regs + regno); + __mark_reg_not_init(env, regs + regno); } #define DEF_NOT_SUBREG (0) @@ -3234,7 +3235,7 @@ static int check_stack_boundary(struct bpf_verifier_env *env, int regno, } if (state->stack[spi].slot_type[0] == STACK_SPILL && state->stack[spi].spilled_ptr.type == SCALAR_VALUE) { - __mark_reg_unknown(&state->stack[spi].spilled_ptr); + __mark_reg_unknown(env, &state->stack[spi].spilled_ptr); for (j = 0; j < BPF_REG_SIZE; j++) state->stack[spi].slot_type[j] = STACK_MISC; goto mark; @@ -3892,7 +3893,7 @@ static void __clear_all_pkt_pointers(struct bpf_verifier_env *env, if (!reg) continue; if (reg_is_pkt_pointer_any(reg)) - __mark_reg_unknown(reg); + __mark_reg_unknown(env, reg); } } @@ -3920,7 +3921,7 @@ static void release_reg_references(struct bpf_verifier_env *env, if (!reg) continue; if (reg->ref_obj_id == ref_obj_id) - __mark_reg_unknown(reg); + __mark_reg_unknown(env, reg); } } @@ -4134,6 +4135,7 @@ record_func_key(struct bpf_verifier_env *env, struct bpf_call_arg_meta *meta, struct bpf_map *map = meta->map_ptr; struct tnum range; u64 val; + int err; if (func_id != BPF_FUNC_tail_call) return 0; @@ -4150,6 +4152,10 @@ record_func_key(struct bpf_verifier_env *env, struct bpf_call_arg_meta *meta, return 0; } + err = mark_chain_precision(env, BPF_REG_3); + if (err) + return err; + val = reg->var_off.value; if (bpf_map_key_unseen(aux)) bpf_map_key_store(aux, val); @@ -4577,7 +4583,7 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env, /* Taint dst register if offset had invalid bounds derived from * e.g. dead branches. */ - __mark_reg_unknown(dst_reg); + __mark_reg_unknown(env, dst_reg); return 0; } @@ -4829,13 +4835,13 @@ static int adjust_scalar_min_max_vals(struct bpf_verifier_env *env, /* Taint dst register if offset had invalid bounds derived from * e.g. dead branches. */ - __mark_reg_unknown(dst_reg); + __mark_reg_unknown(env, dst_reg); return 0; } if (!src_known && opcode != BPF_ADD && opcode != BPF_SUB && opcode != BPF_AND) { - __mark_reg_unknown(dst_reg); + __mark_reg_unknown(env, dst_reg); return 0; } @@ -5043,9 +5049,16 @@ static int adjust_scalar_min_max_vals(struct bpf_verifier_env *env, /* Upon reaching here, src_known is true and * umax_val is equal to umin_val. */ - dst_reg->smin_value >>= umin_val; - dst_reg->smax_value >>= umin_val; - dst_reg->var_off = tnum_arshift(dst_reg->var_off, umin_val); + if (insn_bitness == 32) { + dst_reg->smin_value = (u32)(((s32)dst_reg->smin_value) >> umin_val); + dst_reg->smax_value = (u32)(((s32)dst_reg->smax_value) >> umin_val); + } else { + dst_reg->smin_value >>= umin_val; + dst_reg->smax_value >>= umin_val; + } + + dst_reg->var_off = tnum_arshift(dst_reg->var_off, umin_val, + insn_bitness); /* blow away the dst_reg umin_value/umax_value and rely on * dst_reg var_off to refine the result. @@ -6258,6 +6271,7 @@ static bool may_access_skb(enum bpf_prog_type type) static int check_ld_abs(struct bpf_verifier_env *env, struct bpf_insn *insn) { struct bpf_reg_state *regs = cur_regs(env); + static const int ctx_reg = BPF_REG_6; u8 mode = BPF_MODE(insn->code); int i, err; @@ -6291,7 +6305,7 @@ static int check_ld_abs(struct bpf_verifier_env *env, struct bpf_insn *insn) } /* check whether implicit source operand (register R6) is readable */ - err = check_reg_arg(env, BPF_REG_6, SRC_OP); + err = check_reg_arg(env, ctx_reg, SRC_OP); if (err) return err; @@ -6310,7 +6324,7 @@ static int check_ld_abs(struct bpf_verifier_env *env, struct bpf_insn *insn) return -EINVAL; } - if (regs[BPF_REG_6].type != PTR_TO_CTX) { + if (regs[ctx_reg].type != PTR_TO_CTX) { verbose(env, "at the time of BPF_LD_ABS|IND R6 != pointer to skb\n"); return -EINVAL; @@ -6323,6 +6337,10 @@ static int check_ld_abs(struct bpf_verifier_env *env, struct bpf_insn *insn) return err; } + err = check_ctx_reg(env, ®s[ctx_reg], ctx_reg); + if (err < 0) + return err; + /* reset caller saved regs to unreadable */ for (i = 0; i < CALLER_SAVED_REGS; i++) { mark_reg_not_init(env, regs, caller_saved[i]); @@ -6977,7 +6995,7 @@ static void clean_func_state(struct bpf_verifier_env *env, /* since the register is unused, clear its state * to make further comparison simpler */ - __mark_reg_not_init(&st->regs[i]); + __mark_reg_not_init(env, &st->regs[i]); } for (i = 0; i < st->allocated_stack / BPF_REG_SIZE; i++) { @@ -6985,7 +7003,7 @@ static void clean_func_state(struct bpf_verifier_env *env, /* liveness must not touch this stack slot anymore */ st->stack[i].spilled_ptr.live |= REG_LIVE_DONE; if (!(live & REG_LIVE_READ)) { - __mark_reg_not_init(&st->stack[i].spilled_ptr); + __mark_reg_not_init(env, &st->stack[i].spilled_ptr); for (j = 0; j < BPF_REG_SIZE; j++) st->stack[i].slot_type[j] = STACK_INVALID; } @@ -8268,7 +8286,7 @@ static int replace_map_fd_with_map_ptr(struct bpf_verifier_env *env) env->used_maps[env->used_map_cnt++] = map; if (bpf_map_is_cgroup_storage(map) && - bpf_cgroup_storage_assign(env->prog, map)) { + bpf_cgroup_storage_assign(env->prog->aux, map)) { verbose(env, "only one cgroup storage of each type is allowed\n"); fdput(f); return -EBUSY; @@ -8298,18 +8316,8 @@ next_insn: /* drop refcnt of maps used by the rejected program */ static void release_maps(struct bpf_verifier_env *env) { - enum bpf_cgroup_storage_type stype; - int i; - - for_each_cgroup_storage_type(stype) { - if (!env->prog->aux->cgroup_storage[stype]) - continue; - bpf_cgroup_storage_release(env->prog, - env->prog->aux->cgroup_storage[stype]); - } - - for (i = 0; i < env->used_map_cnt; i++) - bpf_map_put(env->used_maps[i]); + __bpf_free_used_maps(env->prog->aux, env->used_maps, + env->used_map_cnt); } /* convert pseudo BPF_LD_IMM64 into generic BPF_LD_IMM64 */ @@ -9282,7 +9290,8 @@ static int fixup_bpf_calls(struct bpf_verifier_env *env) insn->code = BPF_JMP | BPF_TAIL_CALL; aux = &env->insn_aux_data[i + delta]; - if (prog->jit_requested && !expect_blinding && + if (env->allow_ptr_leaks && !expect_blinding && + prog->jit_requested && !bpf_map_key_poisoned(aux) && !bpf_map_ptr_poisoned(aux) && !bpf_map_ptr_unpriv(aux)) { diff --git a/kernel/cpu.c b/kernel/cpu.c index a59cc980adad..4dc279ed3b2d 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -1909,6 +1909,78 @@ void __cpuhp_remove_state(enum cpuhp_state state, bool invoke) } EXPORT_SYMBOL(__cpuhp_remove_state); +#ifdef CONFIG_HOTPLUG_SMT +static void cpuhp_offline_cpu_device(unsigned int cpu) +{ + struct device *dev = get_cpu_device(cpu); + + dev->offline = true; + /* Tell user space about the state change */ + kobject_uevent(&dev->kobj, KOBJ_OFFLINE); +} + +static void cpuhp_online_cpu_device(unsigned int cpu) +{ + struct device *dev = get_cpu_device(cpu); + + dev->offline = false; + /* Tell user space about the state change */ + kobject_uevent(&dev->kobj, KOBJ_ONLINE); +} + +int cpuhp_smt_disable(enum cpuhp_smt_control ctrlval) +{ + int cpu, ret = 0; + + cpu_maps_update_begin(); + for_each_online_cpu(cpu) { + if (topology_is_primary_thread(cpu)) + continue; + ret = cpu_down_maps_locked(cpu, CPUHP_OFFLINE); + if (ret) + break; + /* + * As this needs to hold the cpu maps lock it's impossible + * to call device_offline() because that ends up calling + * cpu_down() which takes cpu maps lock. cpu maps lock + * needs to be held as this might race against in kernel + * abusers of the hotplug machinery (thermal management). + * + * So nothing would update device:offline state. That would + * leave the sysfs entry stale and prevent onlining after + * smt control has been changed to 'off' again. This is + * called under the sysfs hotplug lock, so it is properly + * serialized against the regular offline usage. + */ + cpuhp_offline_cpu_device(cpu); + } + if (!ret) + cpu_smt_control = ctrlval; + cpu_maps_update_done(); + return ret; +} + +int cpuhp_smt_enable(void) +{ + int cpu, ret = 0; + + cpu_maps_update_begin(); + cpu_smt_control = CPU_SMT_ENABLED; + for_each_present_cpu(cpu) { + /* Skip online CPUs and CPUs on offline nodes */ + if (cpu_online(cpu) || !node_online(cpu_to_node(cpu))) + continue; + ret = _cpu_up(cpu, 0, CPUHP_ONLINE); + if (ret) + break; + /* See comment in cpuhp_smt_disable() */ + cpuhp_online_cpu_device(cpu); + } + cpu_maps_update_done(); + return ret; +} +#endif + #if defined(CONFIG_SYSFS) && defined(CONFIG_HOTPLUG_CPU) static ssize_t show_cpuhp_state(struct device *dev, struct device_attribute *attr, char *buf) @@ -2063,77 +2135,6 @@ static const struct attribute_group cpuhp_cpu_root_attr_group = { #ifdef CONFIG_HOTPLUG_SMT -static void cpuhp_offline_cpu_device(unsigned int cpu) -{ - struct device *dev = get_cpu_device(cpu); - - dev->offline = true; - /* Tell user space about the state change */ - kobject_uevent(&dev->kobj, KOBJ_OFFLINE); -} - -static void cpuhp_online_cpu_device(unsigned int cpu) -{ - struct device *dev = get_cpu_device(cpu); - - dev->offline = false; - /* Tell user space about the state change */ - kobject_uevent(&dev->kobj, KOBJ_ONLINE); -} - -int cpuhp_smt_disable(enum cpuhp_smt_control ctrlval) -{ - int cpu, ret = 0; - - cpu_maps_update_begin(); - for_each_online_cpu(cpu) { - if (topology_is_primary_thread(cpu)) - continue; - ret = cpu_down_maps_locked(cpu, CPUHP_OFFLINE); - if (ret) - break; - /* - * As this needs to hold the cpu maps lock it's impossible - * to call device_offline() because that ends up calling - * cpu_down() which takes cpu maps lock. cpu maps lock - * needs to be held as this might race against in kernel - * abusers of the hotplug machinery (thermal management). - * - * So nothing would update device:offline state. That would - * leave the sysfs entry stale and prevent onlining after - * smt control has been changed to 'off' again. This is - * called under the sysfs hotplug lock, so it is properly - * serialized against the regular offline usage. - */ - cpuhp_offline_cpu_device(cpu); - } - if (!ret) - cpu_smt_control = ctrlval; - cpu_maps_update_done(); - return ret; -} - -int cpuhp_smt_enable(void) -{ - int cpu, ret = 0; - - cpu_maps_update_begin(); - cpu_smt_control = CPU_SMT_ENABLED; - for_each_present_cpu(cpu) { - /* Skip online CPUs and CPUs on offline nodes */ - if (cpu_online(cpu) || !node_online(cpu_to_node(cpu))) - continue; - ret = _cpu_up(cpu, 0, CPUHP_ONLINE); - if (ret) - break; - /* See comment in cpuhp_smt_disable() */ - cpuhp_online_cpu_device(cpu); - } - cpu_maps_update_done(); - return ret; -} - - static ssize_t __store_smt_control(struct device *dev, struct device_attribute *attr, const char *buf, size_t count) diff --git a/kernel/cred.c b/kernel/cred.c index c0a4c12d38b2..809a985b1793 100644 --- a/kernel/cred.c +++ b/kernel/cred.c @@ -175,8 +175,8 @@ void exit_creds(struct task_struct *tsk) put_cred(cred); #ifdef CONFIG_KEYS_REQUEST_CACHE - key_put(current->cached_requested_key); - current->cached_requested_key = NULL; + key_put(tsk->cached_requested_key); + tsk->cached_requested_key = NULL; #endif } @@ -223,7 +223,7 @@ struct cred *cred_alloc_blank(void) new->magic = CRED_MAGIC; #endif - if (security_cred_alloc_blank(new, GFP_KERNEL) < 0) + if (security_cred_alloc_blank(new, GFP_KERNEL_ACCOUNT) < 0) goto error; return new; @@ -282,7 +282,7 @@ struct cred *prepare_creds(void) new->security = NULL; #endif - if (security_prepare_creds(new, old, GFP_KERNEL) < 0) + if (security_prepare_creds(new, old, GFP_KERNEL_ACCOUNT) < 0) goto error; validate_creds(new); return new; @@ -715,7 +715,7 @@ struct cred *prepare_kernel_cred(struct task_struct *daemon) #ifdef CONFIG_SECURITY new->security = NULL; #endif - if (security_prepare_creds(new, old, GFP_KERNEL) < 0) + if (security_prepare_creds(new, old, GFP_KERNEL_ACCOUNT) < 0) goto error; put_cred(old); diff --git a/kernel/events/core.c b/kernel/events/core.c index 4ff86d57f9e5..2173c23c25b4 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -10523,7 +10523,7 @@ again: goto unlock; } - list_for_each_entry_rcu(pmu, &pmus, entry) { + list_for_each_entry_rcu(pmu, &pmus, entry, lockdep_is_held(&pmus_srcu)) { ret = perf_try_init_event(pmu, event); if (!ret) goto unlock; @@ -11465,8 +11465,10 @@ SYSCALL_DEFINE5(perf_event_open, } } - if (perf_need_aux_event(event) && !perf_get_aux_event(event, group_leader)) + if (perf_need_aux_event(event) && !perf_get_aux_event(event, group_leader)) { + err = -EINVAL; goto err_locked; + } /* * Must be under the same ctx::mutex as perf_install_in_context(), diff --git a/kernel/exit.c b/kernel/exit.c index bcbd59888e67..2833ffb0c211 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -517,10 +517,6 @@ static struct task_struct *find_child_reaper(struct task_struct *father, } write_unlock_irq(&tasklist_lock); - if (unlikely(pid_ns == &init_pid_ns)) { - panic("Attempted to kill init! exitcode=0x%08x\n", - father->signal->group_exit_code ?: father->exit_code); - } list_for_each_entry_safe(p, n, dead, ptrace_entry) { list_del_init(&p->ptrace_entry); @@ -766,6 +762,14 @@ void __noreturn do_exit(long code) acct_update_integrals(tsk); group_dead = atomic_dec_and_test(&tsk->signal->live); if (group_dead) { + /* + * If the last thread of global init has exited, panic + * immediately to get a useable coredump. + */ + if (unlikely(is_global_init(tsk))) + panic("Attempted to kill init! exitcode=0x%08x\n", + tsk->signal->group_exit_code ?: (int)code); + #ifdef CONFIG_POSIX_TIMERS hrtimer_cancel(&tsk->signal->real_timer); exit_itimers(tsk->signal); diff --git a/kernel/fork.c b/kernel/fork.c index 2508a4f238a3..080809560072 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -2578,6 +2578,16 @@ SYSCALL_DEFINE5(clone, unsigned long, clone_flags, unsigned long, newsp, #endif #ifdef __ARCH_WANT_SYS_CLONE3 + +/* + * copy_thread implementations handle CLONE_SETTLS by reading the TLS value from + * the registers containing the syscall arguments for clone. This doesn't work + * with clone3 since the TLS value is passed in clone_args instead. + */ +#ifndef CONFIG_HAVE_COPY_THREAD_TLS +#error clone3 requires copy_thread_tls support in arch +#endif + noinline static int copy_clone_args_from_user(struct kernel_clone_args *kargs, struct clone_args __user *uargs, size_t usize) diff --git a/kernel/futex.c b/kernel/futex.c index 03c518e9747e..0cf84c8664f2 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -1178,6 +1178,7 @@ out_error: /** * wait_for_owner_exiting - Block until the owner has exited + * @ret: owner's current futex lock status * @exiting: Pointer to the exiting task * * Caller must hold a refcount on @exiting. diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c index 32282e7112d3..32406ef0d6a2 100644 --- a/kernel/locking/lockdep.c +++ b/kernel/locking/lockdep.c @@ -482,7 +482,7 @@ static struct lock_trace *save_trace(void) struct lock_trace *trace, *t2; struct hlist_head *hash_head; u32 hash; - unsigned int max_entries; + int max_entries; BUILD_BUG_ON_NOT_POWER_OF_2(STACK_TRACE_HASH_SIZE); BUILD_BUG_ON(LOCK_TRACE_SIZE_IN_LONGS >= MAX_STACK_TRACE_ENTRIES); @@ -490,10 +490,8 @@ static struct lock_trace *save_trace(void) trace = (struct lock_trace *)(stack_trace + nr_stack_trace_entries); max_entries = MAX_STACK_TRACE_ENTRIES - nr_stack_trace_entries - LOCK_TRACE_SIZE_IN_LONGS; - trace->nr_entries = stack_trace_save(trace->entries, max_entries, 3); - if (nr_stack_trace_entries >= MAX_STACK_TRACE_ENTRIES - - LOCK_TRACE_SIZE_IN_LONGS - 1) { + if (max_entries <= 0) { if (!debug_locks_off_graph_unlock()) return NULL; @@ -502,6 +500,7 @@ static struct lock_trace *save_trace(void) return NULL; } + trace->nr_entries = stack_trace_save(trace->entries, max_entries, 3); hash = jhash(trace->entries, trace->nr_entries * sizeof(trace->entries[0]), 0); diff --git a/kernel/locking/mutex.c b/kernel/locking/mutex.c index 54cc5f9286e9..5352ce50a97e 100644 --- a/kernel/locking/mutex.c +++ b/kernel/locking/mutex.c @@ -733,9 +733,6 @@ static noinline void __sched __mutex_unlock_slowpath(struct mutex *lock, unsigne */ void __sched mutex_unlock(struct mutex *lock) { -#ifdef CONFIG_DEBUG_MUTEXES - WARN_ON(in_interrupt()); -#endif #ifndef CONFIG_DEBUG_LOCK_ALLOC if (__mutex_unlock_fast(lock)) return; @@ -1416,7 +1413,6 @@ int __sched mutex_trylock(struct mutex *lock) #ifdef CONFIG_DEBUG_MUTEXES DEBUG_LOCKS_WARN_ON(lock->magic != lock); - WARN_ON(in_interrupt()); #endif locked = __mutex_trylock(lock); diff --git a/kernel/locking/rwsem.c b/kernel/locking/rwsem.c index 44e68761f432..0d9b6be9ecc8 100644 --- a/kernel/locking/rwsem.c +++ b/kernel/locking/rwsem.c @@ -1226,8 +1226,8 @@ wait: * In this case, we attempt to acquire the lock again * without sleeping. */ - if ((wstate == WRITER_HANDOFF) && - (rwsem_spin_on_owner(sem, 0) == OWNER_NULL)) + if (wstate == WRITER_HANDOFF && + rwsem_spin_on_owner(sem, RWSEM_NONSPINNABLE) == OWNER_NULL) goto trylock_again; /* Block until there are no active lockers. */ diff --git a/kernel/locking/spinlock_debug.c b/kernel/locking/spinlock_debug.c index 399669f7eba8..472dd462a40c 100644 --- a/kernel/locking/spinlock_debug.c +++ b/kernel/locking/spinlock_debug.c @@ -51,19 +51,19 @@ EXPORT_SYMBOL(__rwlock_init); static void spin_dump(raw_spinlock_t *lock, const char *msg) { - struct task_struct *owner = NULL; + struct task_struct *owner = READ_ONCE(lock->owner); - if (lock->owner && lock->owner != SPINLOCK_OWNER_INIT) - owner = lock->owner; + if (owner == SPINLOCK_OWNER_INIT) + owner = NULL; printk(KERN_EMERG "BUG: spinlock %s on CPU#%d, %s/%d\n", msg, raw_smp_processor_id(), current->comm, task_pid_nr(current)); printk(KERN_EMERG " lock: %pS, .magic: %08x, .owner: %s/%d, " ".owner_cpu: %d\n", - lock, lock->magic, + lock, READ_ONCE(lock->magic), owner ? owner->comm : "<none>", owner ? task_pid_nr(owner) : -1, - lock->owner_cpu); + READ_ONCE(lock->owner_cpu)); dump_stack(); } @@ -80,16 +80,16 @@ static void spin_bug(raw_spinlock_t *lock, const char *msg) static inline void debug_spin_lock_before(raw_spinlock_t *lock) { - SPIN_BUG_ON(lock->magic != SPINLOCK_MAGIC, lock, "bad magic"); - SPIN_BUG_ON(lock->owner == current, lock, "recursion"); - SPIN_BUG_ON(lock->owner_cpu == raw_smp_processor_id(), + SPIN_BUG_ON(READ_ONCE(lock->magic) != SPINLOCK_MAGIC, lock, "bad magic"); + SPIN_BUG_ON(READ_ONCE(lock->owner) == current, lock, "recursion"); + SPIN_BUG_ON(READ_ONCE(lock->owner_cpu) == raw_smp_processor_id(), lock, "cpu recursion"); } static inline void debug_spin_lock_after(raw_spinlock_t *lock) { - lock->owner_cpu = raw_smp_processor_id(); - lock->owner = current; + WRITE_ONCE(lock->owner_cpu, raw_smp_processor_id()); + WRITE_ONCE(lock->owner, current); } static inline void debug_spin_unlock(raw_spinlock_t *lock) @@ -99,8 +99,8 @@ static inline void debug_spin_unlock(raw_spinlock_t *lock) SPIN_BUG_ON(lock->owner != current, lock, "wrong owner"); SPIN_BUG_ON(lock->owner_cpu != raw_smp_processor_id(), lock, "wrong CPU"); - lock->owner = SPINLOCK_OWNER_INIT; - lock->owner_cpu = -1; + WRITE_ONCE(lock->owner, SPINLOCK_OWNER_INIT); + WRITE_ONCE(lock->owner_cpu, -1); } /* @@ -187,8 +187,8 @@ static inline void debug_write_lock_before(rwlock_t *lock) static inline void debug_write_lock_after(rwlock_t *lock) { - lock->owner_cpu = raw_smp_processor_id(); - lock->owner = current; + WRITE_ONCE(lock->owner_cpu, raw_smp_processor_id()); + WRITE_ONCE(lock->owner, current); } static inline void debug_write_unlock(rwlock_t *lock) @@ -197,8 +197,8 @@ static inline void debug_write_unlock(rwlock_t *lock) RWLOCK_BUG_ON(lock->owner != current, lock, "wrong owner"); RWLOCK_BUG_ON(lock->owner_cpu != raw_smp_processor_id(), lock, "wrong CPU"); - lock->owner = SPINLOCK_OWNER_INIT; - lock->owner_cpu = -1; + WRITE_ONCE(lock->owner, SPINLOCK_OWNER_INIT); + WRITE_ONCE(lock->owner_cpu, -1); } void do_raw_write_lock(rwlock_t *lock) diff --git a/kernel/module.c b/kernel/module.c index 3a486f826224..b56f3224b161 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -3730,6 +3730,7 @@ static int complete_formation(struct module *mod, struct load_info *info) module_enable_ro(mod, false); module_enable_nx(mod); + module_enable_x(mod); /* Mark state as coming so strong_try_module_get() ignores us, * but kallsyms etc. can see us. */ @@ -3752,11 +3753,6 @@ static int prepare_coming_module(struct module *mod) if (err) return err; - /* Make module executable after ftrace is enabled */ - mutex_lock(&module_mutex); - module_enable_x(mod); - mutex_unlock(&module_mutex); - blocking_notifier_call_chain(&module_notify_list, MODULE_STATE_COMING, mod); return 0; diff --git a/kernel/ptrace.c b/kernel/ptrace.c index cb9ddcc08119..43d6179508d6 100644 --- a/kernel/ptrace.c +++ b/kernel/ptrace.c @@ -264,12 +264,17 @@ static int ptrace_check_attach(struct task_struct *child, bool ignore_state) return ret; } -static int ptrace_has_cap(struct user_namespace *ns, unsigned int mode) +static bool ptrace_has_cap(const struct cred *cred, struct user_namespace *ns, + unsigned int mode) { + int ret; + if (mode & PTRACE_MODE_NOAUDIT) - return has_ns_capability_noaudit(current, ns, CAP_SYS_PTRACE); + ret = security_capable(cred, ns, CAP_SYS_PTRACE, CAP_OPT_NOAUDIT); else - return has_ns_capability(current, ns, CAP_SYS_PTRACE); + ret = security_capable(cred, ns, CAP_SYS_PTRACE, CAP_OPT_NONE); + + return ret == 0; } /* Returns 0 on success, -errno on denial. */ @@ -321,7 +326,7 @@ static int __ptrace_may_access(struct task_struct *task, unsigned int mode) gid_eq(caller_gid, tcred->sgid) && gid_eq(caller_gid, tcred->gid)) goto ok; - if (ptrace_has_cap(tcred->user_ns, mode)) + if (ptrace_has_cap(cred, tcred->user_ns, mode)) goto ok; rcu_read_unlock(); return -EPERM; @@ -340,7 +345,7 @@ ok: mm = task->mm; if (mm && ((get_dumpable(mm) != SUID_DUMP_USER) && - !ptrace_has_cap(mm->user_ns, mode))) + !ptrace_has_cap(cred, mm->user_ns, mode))) return -EPERM; return security_ptrace_access_check(task, mode); diff --git a/kernel/rseq.c b/kernel/rseq.c index 27c48eb7de40..a4f86a9d6937 100644 --- a/kernel/rseq.c +++ b/kernel/rseq.c @@ -310,6 +310,8 @@ SYSCALL_DEFINE4(rseq, struct rseq __user *, rseq, u32, rseq_len, int ret; if (flags & RSEQ_FLAG_UNREGISTER) { + if (flags & ~RSEQ_FLAG_UNREGISTER) + return -EINVAL; /* Unregister rseq for current thread. */ if (current->rseq != rseq || !current->rseq) return -EINVAL; diff --git a/kernel/sched/cpufreq.c b/kernel/sched/cpufreq.c index b5dcd1d83c7f..7c2fe50fd76d 100644 --- a/kernel/sched/cpufreq.c +++ b/kernel/sched/cpufreq.c @@ -5,6 +5,8 @@ * Copyright (C) 2016, Intel Corporation * Author: Rafael J. Wysocki <rafael.j.wysocki@intel.com> */ +#include <linux/cpufreq.h> + #include "sched.h" DEFINE_PER_CPU(struct update_util_data __rcu *, cpufreq_update_util_data); @@ -57,3 +59,19 @@ void cpufreq_remove_update_util_hook(int cpu) rcu_assign_pointer(per_cpu(cpufreq_update_util_data, cpu), NULL); } EXPORT_SYMBOL_GPL(cpufreq_remove_update_util_hook); + +/** + * cpufreq_this_cpu_can_update - Check if cpufreq policy can be updated. + * @policy: cpufreq policy to check. + * + * Return 'true' if: + * - the local and remote CPUs share @policy, + * - dvfs_possible_from_any_cpu is set in @policy and the local CPU is not going + * offline (in which case it is not expected to run cpufreq updates any more). + */ +bool cpufreq_this_cpu_can_update(struct cpufreq_policy *policy) +{ + return cpumask_test_cpu(smp_processor_id(), policy->cpus) || + (policy->dvfs_possible_from_any_cpu && + rcu_dereference_sched(*this_cpu_ptr(&cpufreq_update_util_data))); +} diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c index 322ca8860f54..9b8916fd00a2 100644 --- a/kernel/sched/cpufreq_schedutil.c +++ b/kernel/sched/cpufreq_schedutil.c @@ -82,12 +82,10 @@ static bool sugov_should_update_freq(struct sugov_policy *sg_policy, u64 time) * by the hardware, as calculating the frequency is pointless if * we cannot in fact act on it. * - * For the slow switching platforms, the kthread is always scheduled on - * the right set of CPUs and any CPU can find the next frequency and - * schedule the kthread. + * This is needed on the slow switching platforms too to prevent CPUs + * going offline from leaving stale IRQ work items behind. */ - if (sg_policy->policy->fast_switch_enabled && - !cpufreq_this_cpu_can_update(sg_policy->policy)) + if (!cpufreq_this_cpu_can_update(sg_policy->policy)) return false; if (unlikely(sg_policy->limits_changed)) { diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 08a233e97a01..ba749f579714 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -7328,7 +7328,14 @@ static int detach_tasks(struct lb_env *env) load < 16 && !env->sd->nr_balance_failed) goto next; - if (load/2 > env->imbalance) + /* + * Make sure that we don't migrate too much load. + * Nevertheless, let relax the constraint if + * scheduler fails to find a good waiting task to + * migrate. + */ + if (load/2 > env->imbalance && + env->sd->nr_balance_failed <= env->sd->cache_nice_tries) goto next; env->imbalance -= load; @@ -8417,6 +8424,10 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p, if (!idlest) return NULL; + /* The local group has been skipped because of CPU affinity */ + if (!local) + return idlest; + /* * If the local group is idler than the selected idlest group * don't try and push the task. diff --git a/kernel/sched/psi.c b/kernel/sched/psi.c index 517e3719027e..ce8f6748678a 100644 --- a/kernel/sched/psi.c +++ b/kernel/sched/psi.c @@ -185,7 +185,8 @@ static void group_init(struct psi_group *group) for_each_possible_cpu(cpu) seqcount_init(&per_cpu_ptr(group->pcpu, cpu)->seq); - group->avg_next_update = sched_clock() + psi_period; + group->avg_last_update = sched_clock(); + group->avg_next_update = group->avg_last_update + psi_period; INIT_DELAYED_WORK(&group->avgs_work, psi_avgs_work); mutex_init(&group->avgs_lock); /* Init trigger-related members */ @@ -481,7 +482,7 @@ static u64 window_update(struct psi_window *win, u64 now, u64 value) u32 remaining; remaining = win->size - elapsed; - growth += div_u64(win->prev_growth * remaining, win->size); + growth += div64_u64(win->prev_growth * remaining, win->size); } return growth; diff --git a/kernel/seccomp.c b/kernel/seccomp.c index 12d2227e5786..b6ea3dcb57bf 100644 --- a/kernel/seccomp.c +++ b/kernel/seccomp.c @@ -1026,6 +1026,13 @@ static long seccomp_notify_recv(struct seccomp_filter *filter, struct seccomp_notif unotif; ssize_t ret; + /* Verify that we're not given garbage to keep struct extensible. */ + ret = check_zeroed_user(buf, sizeof(unotif)); + if (ret < 0) + return ret; + if (!ret) + return -EINVAL; + memset(&unotif, 0, sizeof(unotif)); ret = down_interruptible(&filter->notif->request); diff --git a/kernel/taskstats.c b/kernel/taskstats.c index 13a0f2e6ebc2..e2ac0e37c4ae 100644 --- a/kernel/taskstats.c +++ b/kernel/taskstats.c @@ -554,25 +554,33 @@ static int taskstats_user_cmd(struct sk_buff *skb, struct genl_info *info) static struct taskstats *taskstats_tgid_alloc(struct task_struct *tsk) { struct signal_struct *sig = tsk->signal; - struct taskstats *stats; + struct taskstats *stats_new, *stats; - if (sig->stats || thread_group_empty(tsk)) - goto ret; + /* Pairs with smp_store_release() below. */ + stats = smp_load_acquire(&sig->stats); + if (stats || thread_group_empty(tsk)) + return stats; /* No problem if kmem_cache_zalloc() fails */ - stats = kmem_cache_zalloc(taskstats_cache, GFP_KERNEL); + stats_new = kmem_cache_zalloc(taskstats_cache, GFP_KERNEL); spin_lock_irq(&tsk->sighand->siglock); - if (!sig->stats) { - sig->stats = stats; - stats = NULL; + stats = sig->stats; + if (!stats) { + /* + * Pairs with smp_store_release() above and order the + * kmem_cache_zalloc(). + */ + smp_store_release(&sig->stats, stats_new); + stats = stats_new; + stats_new = NULL; } spin_unlock_irq(&tsk->sighand->siglock); - if (stats) - kmem_cache_free(taskstats_cache, stats); -ret: - return sig->stats; + if (stats_new) + kmem_cache_free(taskstats_cache, stats_new); + + return stats; } /* Send pid data out on exit */ diff --git a/kernel/time/posix-clock.c b/kernel/time/posix-clock.c index ec960bb939fd..200fb2d3be99 100644 --- a/kernel/time/posix-clock.c +++ b/kernel/time/posix-clock.c @@ -14,8 +14,6 @@ #include "posix-timers.h" -static void delete_clock(struct kref *kref); - /* * Returns NULL if the posix_clock instance attached to 'fp' is old and stale. */ @@ -125,7 +123,7 @@ static int posix_clock_open(struct inode *inode, struct file *fp) err = 0; if (!err) { - kref_get(&clk->kref); + get_device(clk->dev); fp->private_data = clk; } out: @@ -141,7 +139,7 @@ static int posix_clock_release(struct inode *inode, struct file *fp) if (clk->ops.release) err = clk->ops.release(clk); - kref_put(&clk->kref, delete_clock); + put_device(clk->dev); fp->private_data = NULL; @@ -161,38 +159,35 @@ static const struct file_operations posix_clock_file_operations = { #endif }; -int posix_clock_register(struct posix_clock *clk, dev_t devid) +int posix_clock_register(struct posix_clock *clk, struct device *dev) { int err; - kref_init(&clk->kref); init_rwsem(&clk->rwsem); cdev_init(&clk->cdev, &posix_clock_file_operations); + err = cdev_device_add(&clk->cdev, dev); + if (err) { + pr_err("%s unable to add device %d:%d\n", + dev_name(dev), MAJOR(dev->devt), MINOR(dev->devt)); + return err; + } clk->cdev.owner = clk->ops.owner; - err = cdev_add(&clk->cdev, devid, 1); + clk->dev = dev; - return err; + return 0; } EXPORT_SYMBOL_GPL(posix_clock_register); -static void delete_clock(struct kref *kref) -{ - struct posix_clock *clk = container_of(kref, struct posix_clock, kref); - - if (clk->release) - clk->release(clk); -} - void posix_clock_unregister(struct posix_clock *clk) { - cdev_del(&clk->cdev); + cdev_device_del(&clk->cdev, clk->dev); down_write(&clk->rwsem); clk->zombie = true; up_write(&clk->rwsem); - kref_put(&clk->kref, delete_clock); + put_device(clk->dev); } EXPORT_SYMBOL_GPL(posix_clock_unregister); diff --git a/kernel/time/posix-stubs.c b/kernel/time/posix-stubs.c index 67df65f887ac..20c65a7d4e3a 100644 --- a/kernel/time/posix-stubs.c +++ b/kernel/time/posix-stubs.c @@ -151,6 +151,9 @@ SYSCALL_DEFINE4(clock_nanosleep, const clockid_t, which_clock, int, flags, #ifdef CONFIG_COMPAT COMPAT_SYS_NI(timer_create); +#endif + +#if defined(CONFIG_COMPAT) || defined(CONFIG_ALPHA) COMPAT_SYS_NI(getitimer); COMPAT_SYS_NI(setitimer); #endif diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index 8b192e67aabc..a792d21cac64 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -58,8 +58,9 @@ static void tick_do_update_jiffies64(ktime_t now) /* * Do a quick check without holding jiffies_lock: + * The READ_ONCE() pairs with two updates done later in this function. */ - delta = ktime_sub(now, last_jiffies_update); + delta = ktime_sub(now, READ_ONCE(last_jiffies_update)); if (delta < tick_period) return; @@ -70,8 +71,9 @@ static void tick_do_update_jiffies64(ktime_t now) if (delta >= tick_period) { delta = ktime_sub(delta, tick_period); - last_jiffies_update = ktime_add(last_jiffies_update, - tick_period); + /* Pairs with the lockless read in this function. */ + WRITE_ONCE(last_jiffies_update, + ktime_add(last_jiffies_update, tick_period)); /* Slow path for long timeouts */ if (unlikely(delta >= tick_period)) { @@ -79,8 +81,10 @@ static void tick_do_update_jiffies64(ktime_t now) ticks = ktime_divns(delta, incr); - last_jiffies_update = ktime_add_ns(last_jiffies_update, - incr * ticks); + /* Pairs with the lockless read in this function. */ + WRITE_ONCE(last_jiffies_update, + ktime_add_ns(last_jiffies_update, + incr * ticks)); } do_timer(++ticks); diff --git a/kernel/trace/fgraph.c b/kernel/trace/fgraph.c index 67e0c462b059..1af321dec0f1 100644 --- a/kernel/trace/fgraph.c +++ b/kernel/trace/fgraph.c @@ -96,11 +96,34 @@ ftrace_push_return_trace(unsigned long ret, unsigned long func, return 0; } +/* + * Not all archs define MCOUNT_INSN_SIZE which is used to look for direct + * functions. But those archs currently don't support direct functions + * anyway, and ftrace_find_rec_direct() is just a stub for them. + * Define MCOUNT_INSN_SIZE to keep those archs compiling. + */ +#ifndef MCOUNT_INSN_SIZE +/* Make sure this only works without direct calls */ +# ifdef CONFIG_DYNAMIC_FTRACE_WITH_DIRECT_CALLS +# error MCOUNT_INSN_SIZE not defined with direct calls enabled +# endif +# define MCOUNT_INSN_SIZE 0 +#endif + int function_graph_enter(unsigned long ret, unsigned long func, unsigned long frame_pointer, unsigned long *retp) { struct ftrace_graph_ent trace; + /* + * Skip graph tracing if the return location is served by direct trampoline, + * since call sequence and return addresses is unpredicatable anymore. + * Ex: BPF trampoline may call original function and may skip frame + * depending on type of BPF programs attached. + */ + if (ftrace_direct_func_count && + ftrace_find_rec_direct(ret - MCOUNT_INSN_SIZE)) + return -EBUSY; trace.func = func; trace.depth = ++current->curr_ret_depth; diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 74439ab5c2b6..9bf1f2cd515e 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -526,8 +526,7 @@ static int function_stat_show(struct seq_file *m, void *v) } #ifdef CONFIG_FUNCTION_GRAPH_TRACER - avg = rec->time; - do_div(avg, rec->counter); + avg = div64_ul(rec->time, rec->counter); if (tracing_thresh && (avg < tracing_thresh)) goto out; #endif @@ -553,7 +552,8 @@ static int function_stat_show(struct seq_file *m, void *v) * Divide only 1000 for ns^2 -> us^2 conversion. * trace_print_graph_duration will divide 1000 again. */ - do_div(stddev, rec->counter * (rec->counter - 1) * 1000); + stddev = div64_ul(stddev, + rec->counter * (rec->counter - 1) * 1000); } trace_seq_init(&s); @@ -2364,7 +2364,7 @@ int ftrace_direct_func_count; * Search the direct_functions hash to see if the given instruction pointer * has a direct caller attached to it. */ -static unsigned long find_rec_direct(unsigned long ip) +unsigned long ftrace_find_rec_direct(unsigned long ip) { struct ftrace_func_entry *entry; @@ -2380,7 +2380,7 @@ static void call_direct_funcs(unsigned long ip, unsigned long pip, { unsigned long addr; - addr = find_rec_direct(ip); + addr = ftrace_find_rec_direct(ip); if (!addr) return; @@ -2393,11 +2393,6 @@ struct ftrace_ops direct_ops = { | FTRACE_OPS_FL_DIRECT | FTRACE_OPS_FL_SAVE_REGS | FTRACE_OPS_FL_PERMANENT, }; -#else -static inline unsigned long find_rec_direct(unsigned long ip) -{ - return 0; -} #endif /* CONFIG_DYNAMIC_FTRACE_WITH_DIRECT_CALLS */ /** @@ -2417,7 +2412,7 @@ unsigned long ftrace_get_addr_new(struct dyn_ftrace *rec) if ((rec->flags & FTRACE_FL_DIRECT) && (ftrace_rec_count(rec) == 1)) { - addr = find_rec_direct(rec->ip); + addr = ftrace_find_rec_direct(rec->ip); if (addr) return addr; WARN_ON_ONCE(1); @@ -2458,7 +2453,7 @@ unsigned long ftrace_get_addr_curr(struct dyn_ftrace *rec) /* Direct calls take precedence over trampolines */ if (rec->flags & FTRACE_FL_DIRECT_EN) { - addr = find_rec_direct(rec->ip); + addr = ftrace_find_rec_direct(rec->ip); if (addr) return addr; WARN_ON_ONCE(1); @@ -3604,7 +3599,7 @@ static int t_show(struct seq_file *m, void *v) if (rec->flags & FTRACE_FL_DIRECT) { unsigned long direct; - direct = find_rec_direct(rec->ip); + direct = ftrace_find_rec_direct(rec->ip); if (direct) seq_printf(m, "\n\tdirect-->%pS", (void *)direct); } @@ -5008,7 +5003,7 @@ int register_ftrace_direct(unsigned long ip, unsigned long addr) mutex_lock(&direct_mutex); /* See if there's a direct function at @ip already */ - if (find_rec_direct(ip)) + if (ftrace_find_rec_direct(ip)) goto out_unlock; ret = -ENODEV; @@ -5027,7 +5022,7 @@ int register_ftrace_direct(unsigned long ip, unsigned long addr) if (ip != rec->ip) { ip = rec->ip; /* Need to check this ip for a direct. */ - if (find_rec_direct(ip)) + if (ftrace_find_rec_direct(ip)) goto out_unlock; } diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 4bf050fcfe3b..3f655371eaf6 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -5070,7 +5070,7 @@ static __init int test_ringbuffer(void) int ret = 0; if (security_locked_down(LOCKDOWN_TRACEFS)) { - pr_warning("Lockdown is enabled, skipping ring buffer tests\n"); + pr_warn("Lockdown is enabled, skipping ring buffer tests\n"); return 0; } diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 23459d53d576..ddb7e7f5fe8d 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -1889,7 +1889,7 @@ int __init register_tracer(struct tracer *type) } if (security_locked_down(LOCKDOWN_TRACEFS)) { - pr_warning("Can not register tracer %s due to lockdown\n", + pr_warn("Can not register tracer %s due to lockdown\n", type->name); return -EPERM; } @@ -4685,6 +4685,10 @@ int trace_keep_overwrite(struct tracer *tracer, u32 mask, int set) int set_tracer_flag(struct trace_array *tr, unsigned int mask, int enabled) { + if ((mask == TRACE_ITER_RECORD_TGID) || + (mask == TRACE_ITER_RECORD_CMD)) + lockdep_assert_held(&event_mutex); + /* do nothing if flag is already set */ if (!!(tr->trace_flags & mask) == !!enabled) return 0; @@ -4752,6 +4756,7 @@ static int trace_set_options(struct trace_array *tr, char *option) cmp += len; + mutex_lock(&event_mutex); mutex_lock(&trace_types_lock); ret = match_string(trace_options, -1, cmp); @@ -4762,6 +4767,7 @@ static int trace_set_options(struct trace_array *tr, char *option) ret = set_tracer_flag(tr, 1 << ret, !neg); mutex_unlock(&trace_types_lock); + mutex_unlock(&event_mutex); /* * If the first trailing whitespace is replaced with '\0' by strstrip, @@ -8076,9 +8082,11 @@ trace_options_core_write(struct file *filp, const char __user *ubuf, size_t cnt, if (val != 0 && val != 1) return -EINVAL; + mutex_lock(&event_mutex); mutex_lock(&trace_types_lock); ret = set_tracer_flag(tr, 1 << index, val); mutex_unlock(&trace_types_lock); + mutex_unlock(&event_mutex); if (ret < 0) return ret; @@ -8796,7 +8804,7 @@ struct dentry *tracing_init_dentry(void) struct trace_array *tr = &global_trace; if (security_locked_down(LOCKDOWN_TRACEFS)) { - pr_warning("Tracing disabled due to lockdown\n"); + pr_warn("Tracing disabled due to lockdown\n"); return ERR_PTR(-EPERM); } @@ -9244,7 +9252,7 @@ __init static int tracer_alloc_buffers(void) if (security_locked_down(LOCKDOWN_TRACEFS)) { - pr_warning("Tracing disabled due to lockdown\n"); + pr_warn("Tracing disabled due to lockdown\n"); return -EPERM; } diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index c6de3cebc127..a5b614cc3887 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -320,7 +320,8 @@ void trace_event_enable_cmd_record(bool enable) struct trace_event_file *file; struct trace_array *tr; - mutex_lock(&event_mutex); + lockdep_assert_held(&event_mutex); + do_for_each_event_file(tr, file) { if (!(file->flags & EVENT_FILE_FL_ENABLED)) @@ -334,7 +335,6 @@ void trace_event_enable_cmd_record(bool enable) clear_bit(EVENT_FILE_FL_RECORDED_CMD_BIT, &file->flags); } } while_for_each_event_file(); - mutex_unlock(&event_mutex); } void trace_event_enable_tgid_record(bool enable) @@ -342,7 +342,8 @@ void trace_event_enable_tgid_record(bool enable) struct trace_event_file *file; struct trace_array *tr; - mutex_lock(&event_mutex); + lockdep_assert_held(&event_mutex); + do_for_each_event_file(tr, file) { if (!(file->flags & EVENT_FILE_FL_ENABLED)) continue; @@ -356,7 +357,6 @@ void trace_event_enable_tgid_record(bool enable) &file->flags); } } while_for_each_event_file(); - mutex_unlock(&event_mutex); } static int __ftrace_event_enable_disable(struct trace_event_file *file, diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c index c9a74f82b14a..bf44f6bbd0c3 100644 --- a/kernel/trace/trace_events_filter.c +++ b/kernel/trace/trace_events_filter.c @@ -1662,7 +1662,7 @@ static int process_system_preds(struct trace_subsystem_dir *dir, parse_error(pe, FILT_ERR_BAD_SUBSYS_FILTER, 0); return -EINVAL; fail_mem: - kfree(filter); + __free_filter(filter); /* If any call succeeded, we still need to sync */ if (!fail) tracepoint_synchronize_unregister(); diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c index f49d1a36d3ae..f62de5f43e79 100644 --- a/kernel/trace/trace_events_hist.c +++ b/kernel/trace/trace_events_hist.c @@ -911,7 +911,26 @@ static notrace void trace_event_raw_event_synth(void *__data, strscpy(str_field, str_val, STR_VAR_LEN_MAX); n_u64 += STR_VAR_LEN_MAX / sizeof(u64); } else { - entry->fields[n_u64] = var_ref_vals[var_ref_idx + i]; + struct synth_field *field = event->fields[i]; + u64 val = var_ref_vals[var_ref_idx + i]; + + switch (field->size) { + case 1: + *(u8 *)&entry->fields[n_u64] = (u8)val; + break; + + case 2: + *(u16 *)&entry->fields[n_u64] = (u16)val; + break; + + case 4: + *(u32 *)&entry->fields[n_u64] = (u32)val; + break; + + default: + entry->fields[n_u64] = val; + break; + } n_u64++; } } diff --git a/kernel/trace/trace_events_inject.c b/kernel/trace/trace_events_inject.c index d43710718ee5..22bcf7c51d1e 100644 --- a/kernel/trace/trace_events_inject.c +++ b/kernel/trace/trace_events_inject.c @@ -17,12 +17,10 @@ static int trace_inject_entry(struct trace_event_file *file, void *rec, int len) { struct trace_event_buffer fbuffer; - struct ring_buffer *buffer; int written = 0; void *entry; rcu_read_lock_sched(); - buffer = file->tr->trace_buffer.buffer; entry = trace_event_buffer_reserve(&fbuffer, file, len); if (entry) { memcpy(entry, rec, len); @@ -197,7 +195,7 @@ static int parse_entry(char *str, struct trace_event_call *call, void **pentry) unsigned long irq_flags; void *entry = NULL; int entry_size; - u64 val; + u64 val = 0; int len; entry = trace_alloc_entry(call, &entry_size); diff --git a/kernel/trace/trace_sched_wakeup.c b/kernel/trace/trace_sched_wakeup.c index 5e43b9664eca..617e297f46dc 100644 --- a/kernel/trace/trace_sched_wakeup.c +++ b/kernel/trace/trace_sched_wakeup.c @@ -630,7 +630,7 @@ static void start_wakeup_tracer(struct trace_array *tr) if (ret) { pr_info("wakeup trace: Couldn't activate tracepoint" " probe to kernel_sched_migrate_task\n"); - return; + goto fail_deprobe_sched_switch; } wakeup_reset(tr); @@ -648,6 +648,8 @@ static void start_wakeup_tracer(struct trace_array *tr) printk(KERN_ERR "failed to start wakeup tracer\n"); return; +fail_deprobe_sched_switch: + unregister_trace_sched_switch(probe_wakeup_sched_switch, NULL); fail_deprobe_wake_new: unregister_trace_sched_wakeup_new(probe_wakeup, NULL); fail_deprobe: diff --git a/kernel/trace/trace_seq.c b/kernel/trace/trace_seq.c index 344e4c1aa09c..87de6edafd14 100644 --- a/kernel/trace/trace_seq.c +++ b/kernel/trace/trace_seq.c @@ -381,7 +381,7 @@ int trace_seq_hex_dump(struct trace_seq *s, const char *prefix_str, int prefix_type, int rowsize, int groupsize, const void *buf, size_t len, bool ascii) { - unsigned int save_len = s->seq.len; + unsigned int save_len = s->seq.len; if (s->full) return 0; diff --git a/kernel/trace/trace_stack.c b/kernel/trace/trace_stack.c index 4df9a209f7ca..c557f42a9397 100644 --- a/kernel/trace/trace_stack.c +++ b/kernel/trace/trace_stack.c @@ -283,6 +283,11 @@ static void check_stack(unsigned long ip, unsigned long *stack) local_irq_restore(flags); } +/* Some archs may not define MCOUNT_INSN_SIZE */ +#ifndef MCOUNT_INSN_SIZE +# define MCOUNT_INSN_SIZE 0 +#endif + static void stack_trace_call(unsigned long ip, unsigned long parent_ip, struct ftrace_ops *op, struct pt_regs *pt_regs) diff --git a/kernel/trace/tracing_map.c b/kernel/trace/tracing_map.c index 9a1c22310323..9e31bfc818ff 100644 --- a/kernel/trace/tracing_map.c +++ b/kernel/trace/tracing_map.c @@ -148,8 +148,8 @@ static int tracing_map_cmp_atomic64(void *val_a, void *val_b) #define DEFINE_TRACING_MAP_CMP_FN(type) \ static int tracing_map_cmp_##type(void *val_a, void *val_b) \ { \ - type a = *(type *)val_a; \ - type b = *(type *)val_b; \ + type a = (type)(*(u64 *)val_a); \ + type b = (type)(*(u64 *)val_b); \ \ return (a > b) ? 1 : ((a < b) ? -1 : 0); \ } diff --git a/kernel/workqueue.c b/kernel/workqueue.c index bc88fd939f4e..cfc923558e04 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -4374,8 +4374,8 @@ void destroy_workqueue(struct workqueue_struct *wq) for_each_pwq(pwq, wq) { spin_lock_irq(&pwq->pool->lock); if (WARN_ON(pwq_busy(pwq))) { - pr_warning("%s: %s has the following busy pwq\n", - __func__, wq->name); + pr_warn("%s: %s has the following busy pwq\n", + __func__, wq->name); show_pwq(pwq); spin_unlock_irq(&pwq->pool->lock); mutex_unlock(&wq->mutex); |