Diffstat (limited to 'kernel')
-rw-r--r--   kernel/bpf/core.c                  |  2
-rw-r--r--   kernel/bpf/verifier.c              | 11
-rw-r--r--   kernel/dma/swiotlb.c               | 31
-rw-r--r--   kernel/entry/common.c              |  5
-rw-r--r--   kernel/events/core.c               | 14
-rw-r--r--   kernel/fork.c                      |  8
-rw-r--r--   kernel/kcsan/Makefile              |  2
-rw-r--r--   kernel/sched/core.c                |  3
-rw-r--r--   kernel/sched/fair.c                | 55
-rw-r--r--   kernel/trace/ftrace.c              | 15
-rw-r--r--   kernel/trace/ring_buffer.c         | 13
-rw-r--r--   kernel/trace/trace.c               | 28
-rw-r--r--   kernel/trace/trace_events_synth.c  | 19
-rw-r--r--   kernel/trace/trace_osnoise.c       |  4
14 files changed, 144 insertions, 66 deletions
diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index b297e9f60ca1..e2d256c82072 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -972,7 +972,7 @@ static int __init bpf_jit_charge_init(void)
 {
 	/* Only used as heuristic here to derive limit. */
 	bpf_jit_limit_max = bpf_jit_alloc_exec_limit();
-	bpf_jit_limit = min_t(u64, round_up(bpf_jit_limit_max >> 2,
+	bpf_jit_limit = min_t(u64, round_up(bpf_jit_limit_max >> 1,
 					    PAGE_SIZE), LONG_MAX);
 	return 0;
 }
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 272563a0b770..d517d13878cf 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -3826,6 +3826,8 @@ static int check_stack_read_fixed_off(struct bpf_verifier_env *env,
 			continue;
 		if (type == STACK_MISC)
 			continue;
+		if (type == STACK_INVALID && env->allow_uninit_stack)
+			continue;
 		verbose(env, "invalid read from stack off %d+%d size %d\n",
 			off, i, size);
 		return -EACCES;
@@ -3863,6 +3865,8 @@ static int check_stack_read_fixed_off(struct bpf_verifier_env *env,
 			continue;
 		if (type == STACK_ZERO)
 			continue;
+		if (type == STACK_INVALID && env->allow_uninit_stack)
+			continue;
 		verbose(env, "invalid read from stack off %d+%d size %d\n",
 			off, i, size);
 		return -EACCES;
@@ -5754,7 +5758,8 @@ static int check_stack_range_initialized(
 		stype = &state->stack[spi].slot_type[slot % BPF_REG_SIZE];
 		if (*stype == STACK_MISC)
 			goto mark;
-		if (*stype == STACK_ZERO) {
+		if ((*stype == STACK_ZERO) ||
+		    (*stype == STACK_INVALID && env->allow_uninit_stack)) {
 			if (clobber) {
 				/* helper can write anything into the stack */
 				*stype = STACK_MISC;
@@ -13936,6 +13941,10 @@ static bool stacksafe(struct bpf_verifier_env *env, struct bpf_func_state *old,
 		if (old->stack[spi].slot_type[i % BPF_REG_SIZE] == STACK_INVALID)
 			continue;
 
+		if (env->allow_uninit_stack &&
+		    old->stack[spi].slot_type[i % BPF_REG_SIZE] == STACK_MISC)
+			continue;
+
 		/* explored stack has more populated slots than current stack
 		 * and these slots were used
 		 */
diff --git a/kernel/dma/swiotlb.c b/kernel/dma/swiotlb.c
index 03e3251cd9d2..dac42a2ad588 100644
--- a/kernel/dma/swiotlb.c
+++ b/kernel/dma/swiotlb.c
@@ -623,10 +623,10 @@ static int swiotlb_do_find_slots(struct device *dev, int area_index,
 		phys_to_dma_unencrypted(dev, mem->start) & boundary_mask;
 	unsigned long max_slots = get_max_slots(boundary_mask);
 	unsigned int iotlb_align_mask =
-		dma_get_min_align_mask(dev) & ~(IO_TLB_SIZE - 1);
+		dma_get_min_align_mask(dev) | alloc_align_mask;
 	unsigned int nslots = nr_slots(alloc_size), stride;
-	unsigned int index, wrap, count = 0, i;
 	unsigned int offset = swiotlb_align_offset(dev, orig_addr);
+	unsigned int index, slots_checked, count = 0, i;
 	unsigned long flags;
 	unsigned int slot_base;
 	unsigned int slot_index;
@@ -635,29 +635,34 @@ static int swiotlb_do_find_slots(struct device *dev, int area_index,
 	BUG_ON(area_index >= mem->nareas);
 
 	/*
+	 * For allocations of PAGE_SIZE or larger only look for page aligned
+	 * allocations.
+	 */
+	if (alloc_size >= PAGE_SIZE)
+		iotlb_align_mask |= ~PAGE_MASK;
+	iotlb_align_mask &= ~(IO_TLB_SIZE - 1);
+
+	/*
 	 * For mappings with an alignment requirement don't bother looping to
-	 * unaligned slots once we found an aligned one. For allocations of
-	 * PAGE_SIZE or larger only look for page aligned allocations.
+	 * unaligned slots once we found an aligned one.
 	 */
 	stride = (iotlb_align_mask >> IO_TLB_SHIFT) + 1;
-	if (alloc_size >= PAGE_SIZE)
-		stride = max(stride, stride << (PAGE_SHIFT - IO_TLB_SHIFT));
-	stride = max(stride, (alloc_align_mask >> IO_TLB_SHIFT) + 1);
 
 	spin_lock_irqsave(&area->lock, flags);
 	if (unlikely(nslots > mem->area_nslabs - area->used))
 		goto not_found;
 
 	slot_base = area_index * mem->area_nslabs;
-	index = wrap = wrap_area_index(mem, ALIGN(area->index, stride));
+	index = area->index;
 
-	do {
+	for (slots_checked = 0; slots_checked < mem->area_nslabs; ) {
 		slot_index = slot_base + index;
 
 		if (orig_addr &&
 		    (slot_addr(tbl_dma_addr, slot_index) & iotlb_align_mask) !=
 			    (orig_addr & iotlb_align_mask)) {
 			index = wrap_area_index(mem, index + 1);
+			slots_checked++;
 			continue;
 		}
@@ -673,7 +678,8 @@ static int swiotlb_do_find_slots(struct device *dev, int area_index,
 			goto found;
 		}
 		index = wrap_area_index(mem, index + stride);
-	} while (index != wrap);
+		slots_checked += stride;
+	}
 
 not_found:
 	spin_unlock_irqrestore(&area->lock, flags);
@@ -693,10 +699,7 @@ found:
 	/*
 	 * Update the indices to avoid searching in the next round.
 	 */
-	if (index + nslots < mem->area_nslabs)
-		area->index = index + nslots;
-	else
-		area->index = 0;
+	area->index = wrap_area_index(mem, index + nslots);
 	area->used += nslots;
 	spin_unlock_irqrestore(&area->lock, flags);
 	return slot_index;
diff --git a/kernel/entry/common.c b/kernel/entry/common.c
index 846add8394c4..be61332c66b5 100644
--- a/kernel/entry/common.c
+++ b/kernel/entry/common.c
@@ -21,7 +21,7 @@ static __always_inline void __enter_from_user_mode(struct pt_regs *regs)
 	arch_enter_from_user_mode(regs);
 	lockdep_hardirqs_off(CALLER_ADDR0);
 
-	CT_WARN_ON(ct_state() != CONTEXT_USER);
+	CT_WARN_ON(__ct_state() != CONTEXT_USER);
 	user_exit_irqoff();
 
 	instrumentation_begin();
@@ -192,13 +192,14 @@ static unsigned long exit_to_user_mode_loop(struct pt_regs *regs,
 
 static void exit_to_user_mode_prepare(struct pt_regs *regs)
 {
-	unsigned long ti_work = read_thread_flags();
+	unsigned long ti_work;
 
 	lockdep_assert_irqs_disabled();
 
 	/* Flush pending rcuog wakeup before the last need_resched() check */
 	tick_nohz_user_enter_prepare();
 
+	ti_work = read_thread_flags();
 	if (unlikely(ti_work & EXIT_TO_USER_MODE_WORK))
 		ti_work = exit_to_user_mode_loop(regs, ti_work);
 
diff --git a/kernel/events/core.c b/kernel/events/core.c
index fb3e436bcd4a..435815d3be3f 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -12173,7 +12173,7 @@ perf_event_set_output(struct perf_event *event, struct perf_event *output_event)
 	/*
 	 * If its not a per-cpu rb, it must be the same task.
 	 */
-	if (output_event->cpu == -1 && output_event->ctx != event->ctx)
+	if (output_event->cpu == -1 && output_event->hw.target != event->hw.target)
 		goto out;
 
 	/*
@@ -12893,12 +12893,14 @@ void perf_pmu_migrate_context(struct pmu *pmu, int src_cpu, int dst_cpu)
 	__perf_pmu_remove(src_ctx, src_cpu, pmu, &src_ctx->pinned_groups, &events);
 	__perf_pmu_remove(src_ctx, src_cpu, pmu, &src_ctx->flexible_groups, &events);
 
-	/*
-	 * Wait for the events to quiesce before re-instating them.
-	 */
-	synchronize_rcu();
+	if (!list_empty(&events)) {
+		/*
+		 * Wait for the events to quiesce before re-instating them.
+		 */
+		synchronize_rcu();
 
-	__perf_pmu_install(dst_ctx, dst_cpu, pmu, &events);
+		__perf_pmu_install(dst_ctx, dst_cpu, pmu, &events);
+	}
 
 	mutex_unlock(&dst_ctx->mutex);
 	mutex_unlock(&src_ctx->mutex);
diff --git a/kernel/fork.c b/kernel/fork.c
index 9f14e4084fc0..e7d10ad98a69 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -618,6 +618,7 @@ static __latent_entropy int dup_mmap(struct mm_struct *mm,
 	if (retval)
 		goto out;
 
+	mt_clear_in_rcu(vmi.mas.tree);
 	for_each_vma(old_vmi, mpnt) {
 		struct file *file;
 
@@ -701,6 +702,8 @@ static __latent_entropy int dup_mmap(struct mm_struct *mm,
 		retval = arch_dup_mmap(oldmm, mm);
 loop_out:
 	vma_iter_free(&vmi);
+	if (!retval)
+		mt_set_in_rcu(vmi.mas.tree);
 out:
 	mmap_write_unlock(mm);
 	flush_tlb_mm(oldmm);
@@ -756,11 +759,6 @@ static void check_mm(struct mm_struct *mm)
 	for (i = 0; i < NR_MM_COUNTERS; i++) {
 		long x = percpu_counter_sum(&mm->rss_stat[i]);
 
-		if (likely(!x))
-			continue;
-
-		/* Making sure this is not due to race with CPU offlining. */
-		x = percpu_counter_sum_all(&mm->rss_stat[i]);
 		if (unlikely(x))
 			pr_alert("BUG: Bad rss-counter state mm:%p type:%s val:%ld\n",
 				 mm, resident_page_types[i], x);
diff --git a/kernel/kcsan/Makefile b/kernel/kcsan/Makefile
index 8cf70f068d92..a45f3dfc8d14 100644
--- a/kernel/kcsan/Makefile
+++ b/kernel/kcsan/Makefile
@@ -16,6 +16,6 @@ obj-y := core.o debugfs.o report.o
 KCSAN_INSTRUMENT_BARRIERS_selftest.o := y
 obj-$(CONFIG_KCSAN_SELFTEST) += selftest.o
 
-CFLAGS_kcsan_test.o := $(CFLAGS_KCSAN) -g -fno-omit-frame-pointer
+CFLAGS_kcsan_test.o := $(CFLAGS_KCSAN) -fno-omit-frame-pointer
 CFLAGS_kcsan_test.o += $(DISABLE_STRUCTLEAK_PLUGIN)
 obj-$(CONFIG_KCSAN_KUNIT_TEST) += kcsan_test.o
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 488655f2319f..0d18c3969f90 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -2084,6 +2084,9 @@ static inline void dequeue_task(struct rq *rq, struct task_struct *p, int flags)
 
 void activate_task(struct rq *rq, struct task_struct *p, int flags)
 {
+	if (task_on_rq_migrating(p))
+		flags |= ENQUEUE_MIGRATED;
+
 	enqueue_task(rq, p, flags);
 
 	p->on_rq = TASK_ON_RQ_QUEUED;
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 7a1b1f855b96..6986ea31c984 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -4648,11 +4648,33 @@ static void check_spread(struct cfs_rq *cfs_rq, struct sched_entity *se)
 #endif
 }
 
+static inline bool entity_is_long_sleeper(struct sched_entity *se)
+{
+	struct cfs_rq *cfs_rq;
+	u64 sleep_time;
+
+	if (se->exec_start == 0)
+		return false;
+
+	cfs_rq = cfs_rq_of(se);
+
+	sleep_time = rq_clock_task(rq_of(cfs_rq));
+
+	/* Happen while migrating because of clock task divergence */
+	if (sleep_time <= se->exec_start)
+		return false;
+
+	sleep_time -= se->exec_start;
+	if (sleep_time > ((1ULL << 63) / scale_load_down(NICE_0_LOAD)))
+		return true;
+
+	return false;
+}
+
 static void
 place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial)
 {
 	u64 vruntime = cfs_rq->min_vruntime;
-	u64 sleep_time;
 
 	/*
 	 * The 'current' period is already promised to the current tasks,
@@ -4684,13 +4706,24 @@ place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial)
 
 	/*
 	 * Pull vruntime of the entity being placed to the base level of
-	 * cfs_rq, to prevent boosting it if placed backwards. If the entity
-	 * slept for a long time, don't even try to compare its vruntime with
-	 * the base as it may be too far off and the comparison may get
-	 * inversed due to s64 overflow.
-	 */
-	sleep_time = rq_clock_task(rq_of(cfs_rq)) - se->exec_start;
-	if ((s64)sleep_time > 60LL * NSEC_PER_SEC)
+	 * cfs_rq, to prevent boosting it if placed backwards.
+	 * However, min_vruntime can advance much faster than real time, with
+	 * the extreme being when an entity with the minimal weight always runs
+	 * on the cfs_rq. If the waking entity slept for a long time, its
+	 * vruntime difference from min_vruntime may overflow s64 and their
+	 * comparison may get inversed, so ignore the entity's original
+	 * vruntime in that case.
+	 * The maximal vruntime speedup is given by the ratio of normal to
+	 * minimal weight: scale_load_down(NICE_0_LOAD) / MIN_SHARES.
+	 * When placing a migrated waking entity, its exec_start has been set
+	 * from a different rq. In order to take into account a possible
+	 * divergence between new and prev rq's clocks task because of irq and
+	 * stolen time, we take an additional margin.
+	 * So, cutting off on the sleep time of
+	 *     2^63 / scale_load_down(NICE_0_LOAD) ~ 104 days
+	 * should be safe.
+	 */
+	if (entity_is_long_sleeper(se))
 		se->vruntime = vruntime;
 	else
 		se->vruntime = max_vruntime(se->vruntime, vruntime);
@@ -4770,6 +4803,9 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
 
 	if (flags & ENQUEUE_WAKEUP)
 		place_entity(cfs_rq, se, 0);
+	/* Entity has migrated, no longer consider this task hot */
+	if (flags & ENQUEUE_MIGRATED)
+		se->exec_start = 0;
 
 	check_schedstat_required();
 	update_stats_enqueue_fair(cfs_rq, se, flags);
@@ -7657,9 +7693,6 @@ static void migrate_task_rq_fair(struct task_struct *p, int new_cpu)
 	/* Tell new CPU we are migrated */
 	se->avg.last_update_time = 0;
 
-	/* We have migrated, no longer consider this task hot */
-	se->exec_start = 0;
-
 	update_scan_period(p, new_cpu);
 }
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index 0feea145bb29..c67bcc89a771 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -5667,12 +5667,15 @@ int modify_ftrace_direct(unsigned long ip,
 		ret = 0;
 	}
 
-	if (unlikely(ret && new_direct)) {
-		direct->count++;
-		list_del_rcu(&new_direct->next);
-		synchronize_rcu_tasks();
-		kfree(new_direct);
-		ftrace_direct_func_count--;
+	if (ret) {
+		direct->addr = old_addr;
+		if (unlikely(new_direct)) {
+			direct->count++;
+			list_del_rcu(&new_direct->next);
+			synchronize_rcu_tasks();
+			kfree(new_direct);
+			ftrace_direct_func_count--;
+		}
 	}
 
  out_unlock:
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index c6f47b6cfd5f..76a2d91eecad 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -3098,6 +3098,10 @@ rb_set_commit_to_write(struct ring_buffer_per_cpu *cpu_buffer)
 		if (RB_WARN_ON(cpu_buffer,
 			       rb_is_reader_page(cpu_buffer->tail_page)))
 			return;
+		/*
+		 * No need for a memory barrier here, as the update
+		 * of the tail_page did it for this page.
+		 */
 		local_set(&cpu_buffer->commit_page->page->commit,
 			  rb_page_write(cpu_buffer->commit_page));
 		rb_inc_page(&cpu_buffer->commit_page);
@@ -3107,6 +3111,8 @@ rb_set_commit_to_write(struct ring_buffer_per_cpu *cpu_buffer)
 	while (rb_commit_index(cpu_buffer) !=
 	       rb_page_write(cpu_buffer->commit_page)) {
 
+		/* Make sure the readers see the content of what is committed. */
+		smp_wmb();
 		local_set(&cpu_buffer->commit_page->page->commit,
 			  rb_page_write(cpu_buffer->commit_page));
 		RB_WARN_ON(cpu_buffer,
@@ -4684,7 +4690,12 @@ rb_get_reader_page(struct ring_buffer_per_cpu *cpu_buffer)
 
 	/*
 	 * Make sure we see any padding after the write update
-	 * (see rb_reset_tail())
+ * + * In addition, a writer may be writing on the reader page + * if the page has not been fully filled, so the read barrier + * is also needed to make sure we see the content of what is + * committed by the writer (see rb_set_commit_to_write()). */ smp_rmb(); diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 937e9676dfd4..36a6037823cd 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -1149,22 +1149,22 @@ static void tracing_snapshot_instance_cond(struct trace_array *tr, unsigned long flags; if (in_nmi()) { - internal_trace_puts("*** SNAPSHOT CALLED FROM NMI CONTEXT ***\n"); - internal_trace_puts("*** snapshot is being ignored ***\n"); + trace_array_puts(tr, "*** SNAPSHOT CALLED FROM NMI CONTEXT ***\n"); + trace_array_puts(tr, "*** snapshot is being ignored ***\n"); return; } if (!tr->allocated_snapshot) { - internal_trace_puts("*** SNAPSHOT NOT ALLOCATED ***\n"); - internal_trace_puts("*** stopping trace here! ***\n"); - tracing_off(); + trace_array_puts(tr, "*** SNAPSHOT NOT ALLOCATED ***\n"); + trace_array_puts(tr, "*** stopping trace here! ***\n"); + tracer_tracing_off(tr); return; } /* Note, snapshot can not be used when the tracer uses it */ if (tracer->use_max_tr) { - internal_trace_puts("*** LATENCY TRACER ACTIVE ***\n"); - internal_trace_puts("*** Can not use snapshot (sorry) ***\n"); + trace_array_puts(tr, "*** LATENCY TRACER ACTIVE ***\n"); + trace_array_puts(tr, "*** Can not use snapshot (sorry) ***\n"); return; } @@ -9516,6 +9516,7 @@ static int __remove_instance(struct trace_array *tr) tracefs_remove(tr->dir); free_percpu(tr->last_func_repeats); free_trace_buffers(tr); + clear_tracing_err_log(tr); for (i = 0; i < tr->nr_topts; i++) { kfree(tr->topts[i].topts); @@ -10393,19 +10394,20 @@ out: void __init ftrace_boot_snapshot(void) { +#ifdef CONFIG_TRACER_MAX_TRACE struct trace_array *tr; - if (snapshot_at_boot) { - tracing_snapshot(); - internal_trace_puts("** Boot snapshot taken **\n"); - } + if (!snapshot_at_boot) + return; list_for_each_entry(tr, &ftrace_trace_arrays, list) { - if (tr == &global_trace) + if (!tr->allocated_snapshot) continue; - trace_array_puts(tr, "** Boot snapshot taken **\n"); + tracing_snapshot_instance(tr); + trace_array_puts(tr, "** Boot snapshot taken **\n"); } +#endif } void __init early_trace_init(void) diff --git a/kernel/trace/trace_events_synth.c b/kernel/trace/trace_events_synth.c index 46d0abb32d0f..d6a70aff2410 100644 --- a/kernel/trace/trace_events_synth.c +++ b/kernel/trace/trace_events_synth.c @@ -44,14 +44,21 @@ enum { ERRORS }; static const char *err_text[] = { ERRORS }; +static DEFINE_MUTEX(lastcmd_mutex); static char *last_cmd; static int errpos(const char *str) { + int ret = 0; + + mutex_lock(&lastcmd_mutex); if (!str || !last_cmd) - return 0; + goto out; - return err_pos(last_cmd, str); + ret = err_pos(last_cmd, str); + out: + mutex_unlock(&lastcmd_mutex); + return ret; } static void last_cmd_set(const char *str) @@ -59,18 +66,22 @@ static void last_cmd_set(const char *str) if (!str) return; + mutex_lock(&lastcmd_mutex); kfree(last_cmd); - last_cmd = kstrdup(str, GFP_KERNEL); + mutex_unlock(&lastcmd_mutex); } static void synth_err(u8 err_type, u16 err_pos) { + mutex_lock(&lastcmd_mutex); if (!last_cmd) - return; + goto out; tracing_log_err(NULL, "synthetic_events", last_cmd, err_text, err_type, err_pos); + out: + mutex_unlock(&lastcmd_mutex); } static int create_synth_event(const char *raw_command); diff --git a/kernel/trace/trace_osnoise.c b/kernel/trace/trace_osnoise.c index 9176bb7a9bb4..4496975f2029 
--- a/kernel/trace/trace_osnoise.c
+++ b/kernel/trace/trace_osnoise.c
@@ -1296,7 +1296,7 @@ static void notify_new_max_latency(u64 latency)
 	rcu_read_lock();
 	list_for_each_entry_rcu(inst, &osnoise_instances, list) {
 		tr = inst->tr;
-		if (tr->max_latency < latency) {
+		if (tracer_tracing_is_on(tr) && tr->max_latency < latency) {
 			tr->max_latency = latency;
 			latency_fsnotify(tr);
 		}
@@ -1738,6 +1738,8 @@ static int timerlat_main(void *data)
 
 		trace_timerlat_sample(&s);
 
+		notify_new_max_latency(diff);
+
 		timerlat_dump_stack(time_to_us(diff));
 
 		tlat->tracing_thread = false;
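
A note on the sched/fair.c change above: the new place_entity() comment puts the long-sleep cutoff used by entity_is_long_sleeper() at 2^63 / scale_load_down(NICE_0_LOAD), "~ 104 days". The short user-space sketch below is not part of the patch; it only reproduces that arithmetic, assuming the common scale_load_down(NICE_0_LOAD) value of 1024.

#include <stdio.h>
#include <stdint.h>

int main(void)
{
	/* Assumption: scale_load_down(NICE_0_LOAD) == 1024 on typical configs. */
	const uint64_t nice_0_weight = 1024;

	/* Cutoff from the patch comment: 2^63 / 1024 nanoseconds of sleep. */
	const uint64_t cutoff_ns = (1ULL << 63) / nice_0_weight;
	const double cutoff_days = (double)cutoff_ns / 1e9 / 86400.0;

	printf("cutoff = %llu ns (~%.1f days)\n",
	       (unsigned long long)cutoff_ns, cutoff_days);
	return 0;
}

Running it prints roughly 9007199254740992 ns, about 104.2 days, matching the figure quoted in the comment.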