summary | refs | log | tree | commit | diff | stats
path: root/kernel
diff options
context:
space:
mode:
Diffstat (limited to 'kernel')
-rw-r--r-- kernel/bpf/core.c 2
-rw-r--r-- kernel/bpf/verifier.c 11
-rw-r--r-- kernel/dma/swiotlb.c 31
-rw-r--r-- kernel/entry/common.c 5
-rw-r--r-- kernel/events/core.c 14
-rw-r--r-- kernel/fork.c 8
-rw-r--r-- kernel/kcsan/Makefile 2
-rw-r--r-- kernel/sched/core.c 3
-rw-r--r-- kernel/sched/fair.c 55
-rw-r--r-- kernel/trace/ftrace.c 15
-rw-r--r-- kernel/trace/ring_buffer.c 13
-rw-r--r-- kernel/trace/trace.c 28
-rw-r--r-- kernel/trace/trace_events_synth.c 19
-rw-r--r-- kernel/trace/trace_osnoise.c 4
14 files changed, 144 insertions, 66 deletions
diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index b297e9f60ca1..e2d256c82072 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -972,7 +972,7 @@ static int __init bpf_jit_charge_init(void)
{
/* Only used as heuristic here to derive limit. */
bpf_jit_limit_max = bpf_jit_alloc_exec_limit();
- bpf_jit_limit = min_t(u64, round_up(bpf_jit_limit_max >> 2,
+ bpf_jit_limit = min_t(u64, round_up(bpf_jit_limit_max >> 1,
PAGE_SIZE), LONG_MAX);
return 0;
}
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 272563a0b770..d517d13878cf 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -3826,6 +3826,8 @@ static int check_stack_read_fixed_off(struct bpf_verifier_env *env,
continue;
if (type == STACK_MISC)
continue;
+ if (type == STACK_INVALID && env->allow_uninit_stack)
+ continue;
verbose(env, "invalid read from stack off %d+%d size %d\n",
off, i, size);
return -EACCES;
@@ -3863,6 +3865,8 @@ static int check_stack_read_fixed_off(struct bpf_verifier_env *env,
continue;
if (type == STACK_ZERO)
continue;
+ if (type == STACK_INVALID && env->allow_uninit_stack)
+ continue;
verbose(env, "invalid read from stack off %d+%d size %d\n",
off, i, size);
return -EACCES;
@@ -5754,7 +5758,8 @@ static int check_stack_range_initialized(
stype = &state->stack[spi].slot_type[slot % BPF_REG_SIZE];
if (*stype == STACK_MISC)
goto mark;
- if (*stype == STACK_ZERO) {
+ if ((*stype == STACK_ZERO) ||
+ (*stype == STACK_INVALID && env->allow_uninit_stack)) {
if (clobber) {
/* helper can write anything into the stack */
*stype = STACK_MISC;
@@ -13936,6 +13941,10 @@ static bool stacksafe(struct bpf_verifier_env *env, struct bpf_func_state *old,
if (old->stack[spi].slot_type[i % BPF_REG_SIZE] == STACK_INVALID)
continue;
+ if (env->allow_uninit_stack &&
+ old->stack[spi].slot_type[i % BPF_REG_SIZE] == STACK_MISC)
+ continue;
+
/* explored stack has more populated slots than current stack
* and these slots were used
*/
diff --git a/kernel/dma/swiotlb.c b/kernel/dma/swiotlb.c
index 03e3251cd9d2..dac42a2ad588 100644
--- a/kernel/dma/swiotlb.c
+++ b/kernel/dma/swiotlb.c
@@ -623,10 +623,10 @@ static int swiotlb_do_find_slots(struct device *dev, int area_index,
phys_to_dma_unencrypted(dev, mem->start) & boundary_mask;
unsigned long max_slots = get_max_slots(boundary_mask);
unsigned int iotlb_align_mask =
- dma_get_min_align_mask(dev) & ~(IO_TLB_SIZE - 1);
+ dma_get_min_align_mask(dev) | alloc_align_mask;
unsigned int nslots = nr_slots(alloc_size), stride;
- unsigned int index, wrap, count = 0, i;
unsigned int offset = swiotlb_align_offset(dev, orig_addr);
+ unsigned int index, slots_checked, count = 0, i;
unsigned long flags;
unsigned int slot_base;
unsigned int slot_index;
@@ -635,29 +635,34 @@ static int swiotlb_do_find_slots(struct device *dev, int area_index,
BUG_ON(area_index >= mem->nareas);
/*
+ * For allocations of PAGE_SIZE or larger only look for page aligned
+ * allocations.
+ */
+ if (alloc_size >= PAGE_SIZE)
+ iotlb_align_mask |= ~PAGE_MASK;
+ iotlb_align_mask &= ~(IO_TLB_SIZE - 1);
+
+ /*
* For mappings with an alignment requirement don't bother looping to
- * unaligned slots once we found an aligned one. For allocations of
- * PAGE_SIZE or larger only look for page aligned allocations.
+ * unaligned slots once we found an aligned one.
*/
stride = (iotlb_align_mask >> IO_TLB_SHIFT) + 1;
- if (alloc_size >= PAGE_SIZE)
- stride = max(stride, stride << (PAGE_SHIFT - IO_TLB_SHIFT));
- stride = max(stride, (alloc_align_mask >> IO_TLB_SHIFT) + 1);
spin_lock_irqsave(&area->lock, flags);
if (unlikely(nslots > mem->area_nslabs - area->used))
goto not_found;
slot_base = area_index * mem->area_nslabs;
- index = wrap = wrap_area_index(mem, ALIGN(area->index, stride));
+ index = area->index;
- do {
+ for (slots_checked = 0; slots_checked < mem->area_nslabs; ) {
slot_index = slot_base + index;
if (orig_addr &&
(slot_addr(tbl_dma_addr, slot_index) &
iotlb_align_mask) != (orig_addr & iotlb_align_mask)) {
index = wrap_area_index(mem, index + 1);
+ slots_checked++;
continue;
}
@@ -673,7 +678,8 @@ static int swiotlb_do_find_slots(struct device *dev, int area_index,
goto found;
}
index = wrap_area_index(mem, index + stride);
- } while (index != wrap);
+ slots_checked += stride;
+ }
not_found:
spin_unlock_irqrestore(&area->lock, flags);
@@ -693,10 +699,7 @@ found:
/*
* Update the indices to avoid searching in the next round.
*/
- if (index + nslots < mem->area_nslabs)
- area->index = index + nslots;
- else
- area->index = 0;
+ area->index = wrap_area_index(mem, index + nslots);
area->used += nslots;
spin_unlock_irqrestore(&area->lock, flags);
return slot_index;
diff --git a/kernel/entry/common.c b/kernel/entry/common.c
index 846add8394c4..be61332c66b5 100644
--- a/kernel/entry/common.c
+++ b/kernel/entry/common.c
@@ -21,7 +21,7 @@ static __always_inline void __enter_from_user_mode(struct pt_regs *regs)
arch_enter_from_user_mode(regs);
lockdep_hardirqs_off(CALLER_ADDR0);
- CT_WARN_ON(ct_state() != CONTEXT_USER);
+ CT_WARN_ON(__ct_state() != CONTEXT_USER);
user_exit_irqoff();
instrumentation_begin();
@@ -192,13 +192,14 @@ static unsigned long exit_to_user_mode_loop(struct pt_regs *regs,
static void exit_to_user_mode_prepare(struct pt_regs *regs)
{
- unsigned long ti_work = read_thread_flags();
+ unsigned long ti_work;
lockdep_assert_irqs_disabled();
/* Flush pending rcuog wakeup before the last need_resched() check */
tick_nohz_user_enter_prepare();
+ ti_work = read_thread_flags();
if (unlikely(ti_work & EXIT_TO_USER_MODE_WORK))
ti_work = exit_to_user_mode_loop(regs, ti_work);
diff --git a/kernel/events/core.c b/kernel/events/core.c
index fb3e436bcd4a..435815d3be3f 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -12173,7 +12173,7 @@ perf_event_set_output(struct perf_event *event, struct perf_event *output_event)
/*
* If its not a per-cpu rb, it must be the same task.
*/
- if (output_event->cpu == -1 && output_event->ctx != event->ctx)
+ if (output_event->cpu == -1 && output_event->hw.target != event->hw.target)
goto out;
/*
@@ -12893,12 +12893,14 @@ void perf_pmu_migrate_context(struct pmu *pmu, int src_cpu, int dst_cpu)
__perf_pmu_remove(src_ctx, src_cpu, pmu, &src_ctx->pinned_groups, &events);
__perf_pmu_remove(src_ctx, src_cpu, pmu, &src_ctx->flexible_groups, &events);
- /*
- * Wait for the events to quiesce before re-instating them.
- */
- synchronize_rcu();
+ if (!list_empty(&events)) {
+ /*
+ * Wait for the events to quiesce before re-instating them.
+ */
+ synchronize_rcu();
- __perf_pmu_install(dst_ctx, dst_cpu, pmu, &events);
+ __perf_pmu_install(dst_ctx, dst_cpu, pmu, &events);
+ }
mutex_unlock(&dst_ctx->mutex);
mutex_unlock(&src_ctx->mutex);
diff --git a/kernel/fork.c b/kernel/fork.c
index 9f14e4084fc0..e7d10ad98a69 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -618,6 +618,7 @@ static __latent_entropy int dup_mmap(struct mm_struct *mm,
if (retval)
goto out;
+ mt_clear_in_rcu(vmi.mas.tree);
for_each_vma(old_vmi, mpnt) {
struct file *file;
@@ -701,6 +702,8 @@ static __latent_entropy int dup_mmap(struct mm_struct *mm,
retval = arch_dup_mmap(oldmm, mm);
loop_out:
vma_iter_free(&vmi);
+ if (!retval)
+ mt_set_in_rcu(vmi.mas.tree);
out:
mmap_write_unlock(mm);
flush_tlb_mm(oldmm);
@@ -756,11 +759,6 @@ static void check_mm(struct mm_struct *mm)
for (i = 0; i < NR_MM_COUNTERS; i++) {
long x = percpu_counter_sum(&mm->rss_stat[i]);
- if (likely(!x))
- continue;
-
- /* Making sure this is not due to race with CPU offlining. */
- x = percpu_counter_sum_all(&mm->rss_stat[i]);
if (unlikely(x))
pr_alert("BUG: Bad rss-counter state mm:%p type:%s val:%ld\n",
mm, resident_page_types[i], x);
diff --git a/kernel/kcsan/Makefile b/kernel/kcsan/Makefile
index 8cf70f068d92..a45f3dfc8d14 100644
--- a/kernel/kcsan/Makefile
+++ b/kernel/kcsan/Makefile
@@ -16,6 +16,6 @@ obj-y := core.o debugfs.o report.o
KCSAN_INSTRUMENT_BARRIERS_selftest.o := y
obj-$(CONFIG_KCSAN_SELFTEST) += selftest.o
-CFLAGS_kcsan_test.o := $(CFLAGS_KCSAN) -g -fno-omit-frame-pointer
+CFLAGS_kcsan_test.o := $(CFLAGS_KCSAN) -fno-omit-frame-pointer
CFLAGS_kcsan_test.o += $(DISABLE_STRUCTLEAK_PLUGIN)
obj-$(CONFIG_KCSAN_KUNIT_TEST) += kcsan_test.o
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 488655f2319f..0d18c3969f90 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -2084,6 +2084,9 @@ static inline void dequeue_task(struct rq *rq, struct task_struct *p, int flags)
void activate_task(struct rq *rq, struct task_struct *p, int flags)
{
+ if (task_on_rq_migrating(p))
+ flags |= ENQUEUE_MIGRATED;
+
enqueue_task(rq, p, flags);
p->on_rq = TASK_ON_RQ_QUEUED;
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 7a1b1f855b96..6986ea31c984 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -4648,11 +4648,33 @@ static void check_spread(struct cfs_rq *cfs_rq, struct sched_entity *se)
#endif
}
+static inline bool entity_is_long_sleeper(struct sched_entity *se)
+{
+ struct cfs_rq *cfs_rq;
+ u64 sleep_time;
+
+ if (se->exec_start == 0)
+ return false;
+
+ cfs_rq = cfs_rq_of(se);
+
+ sleep_time = rq_clock_task(rq_of(cfs_rq));
+
+ /* Happen while migrating because of clock task divergence */
+ if (sleep_time <= se->exec_start)
+ return false;
+
+ sleep_time -= se->exec_start;
+ if (sleep_time > ((1ULL << 63) / scale_load_down(NICE_0_LOAD)))
+ return true;
+
+ return false;
+}
+
static void
place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial)
{
u64 vruntime = cfs_rq->min_vruntime;
- u64 sleep_time;
/*
* The 'current' period is already promised to the current tasks,
@@ -4684,13 +4706,24 @@ place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial)
/*
* Pull vruntime of the entity being placed to the base level of
- * cfs_rq, to prevent boosting it if placed backwards. If the entity
- * slept for a long time, don't even try to compare its vruntime with
- * the base as it may be too far off and the comparison may get
- * inversed due to s64 overflow.
- */
- sleep_time = rq_clock_task(rq_of(cfs_rq)) - se->exec_start;
- if ((s64)sleep_time > 60LL * NSEC_PER_SEC)
+ * cfs_rq, to prevent boosting it if placed backwards.
+ * However, min_vruntime can advance much faster than real time, with
+ * the extreme being when an entity with the minimal weight always runs
+ * on the cfs_rq. If the waking entity slept for a long time, its
+ * vruntime difference from min_vruntime may overflow s64 and their
+ * comparison may get inversed, so ignore the entity's original
+ * vruntime in that case.
+ * The maximal vruntime speedup is given by the ratio of normal to
+ * minimal weight: scale_load_down(NICE_0_LOAD) / MIN_SHARES.
+ * When placing a migrated waking entity, its exec_start has been set
+ * from a different rq. In order to take into account a possible
+ * divergence between new and prev rq's clocks task because of irq and
+ * stolen time, we take an additional margin.
+ * So, cutting off on the sleep time of
+ * 2^63 / scale_load_down(NICE_0_LOAD) ~ 104 days
+ * should be safe.
+ */
+ if (entity_is_long_sleeper(se))
se->vruntime = vruntime;
else
se->vruntime = max_vruntime(se->vruntime, vruntime);
@@ -4770,6 +4803,9 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
if (flags & ENQUEUE_WAKEUP)
place_entity(cfs_rq, se, 0);
+ /* Entity has migrated, no longer consider this task hot */
+ if (flags & ENQUEUE_MIGRATED)
+ se->exec_start = 0;
check_schedstat_required();
update_stats_enqueue_fair(cfs_rq, se, flags);
@@ -7657,9 +7693,6 @@ static void migrate_task_rq_fair(struct task_struct *p, int new_cpu)
/* Tell new CPU we are migrated */
se->avg.last_update_time = 0;
- /* We have migrated, no longer consider this task hot */
- se->exec_start = 0;
-
update_scan_period(p, new_cpu);
}
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index 0feea145bb29..c67bcc89a771 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -5667,12 +5667,15 @@ int modify_ftrace_direct(unsigned long ip,
ret = 0;
}
- if (unlikely(ret && new_direct)) {
- direct->count++;
- list_del_rcu(&new_direct->next);
- synchronize_rcu_tasks();
- kfree(new_direct);
- ftrace_direct_func_count--;
+ if (ret) {
+ direct->addr = old_addr;
+ if (unlikely(new_direct)) {
+ direct->count++;
+ list_del_rcu(&new_direct->next);
+ synchronize_rcu_tasks();
+ kfree(new_direct);
+ ftrace_direct_func_count--;
+ }
}
out_unlock:
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index c6f47b6cfd5f..76a2d91eecad 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -3098,6 +3098,10 @@ rb_set_commit_to_write(struct ring_buffer_per_cpu *cpu_buffer)
if (RB_WARN_ON(cpu_buffer,
rb_is_reader_page(cpu_buffer->tail_page)))
return;
+ /*
+ * No need for a memory barrier here, as the update
+ * of the tail_page did it for this page.
+ */
local_set(&cpu_buffer->commit_page->page->commit,
rb_page_write(cpu_buffer->commit_page));
rb_inc_page(&cpu_buffer->commit_page);
@@ -3107,6 +3111,8 @@ rb_set_commit_to_write(struct ring_buffer_per_cpu *cpu_buffer)
while (rb_commit_index(cpu_buffer) !=
rb_page_write(cpu_buffer->commit_page)) {
+ /* Make sure the readers see the content of what is committed. */
+ smp_wmb();
local_set(&cpu_buffer->commit_page->page->commit,
rb_page_write(cpu_buffer->commit_page));
RB_WARN_ON(cpu_buffer,
@@ -4684,7 +4690,12 @@ rb_get_reader_page(struct ring_buffer_per_cpu *cpu_buffer)
/*
* Make sure we see any padding after the write update
- * (see rb_reset_tail())
+ * (see rb_reset_tail()).
+ *
+ * In addition, a writer may be writing on the reader page
+ * if the page has not been fully filled, so the read barrier
+ * is also needed to make sure we see the content of what is
+ * committed by the writer (see rb_set_commit_to_write()).
*/
smp_rmb();
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 937e9676dfd4..36a6037823cd 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -1149,22 +1149,22 @@ static void tracing_snapshot_instance_cond(struct trace_array *tr,
unsigned long flags;
if (in_nmi()) {
- internal_trace_puts("*** SNAPSHOT CALLED FROM NMI CONTEXT ***\n");
- internal_trace_puts("*** snapshot is being ignored ***\n");
+ trace_array_puts(tr, "*** SNAPSHOT CALLED FROM NMI CONTEXT ***\n");
+ trace_array_puts(tr, "*** snapshot is being ignored ***\n");
return;
}
if (!tr->allocated_snapshot) {
- internal_trace_puts("*** SNAPSHOT NOT ALLOCATED ***\n");
- internal_trace_puts("*** stopping trace here! ***\n");
- tracing_off();
+ trace_array_puts(tr, "*** SNAPSHOT NOT ALLOCATED ***\n");
+ trace_array_puts(tr, "*** stopping trace here! ***\n");
+ tracer_tracing_off(tr);
return;
}
/* Note, snapshot can not be used when the tracer uses it */
if (tracer->use_max_tr) {
- internal_trace_puts("*** LATENCY TRACER ACTIVE ***\n");
- internal_trace_puts("*** Can not use snapshot (sorry) ***\n");
+ trace_array_puts(tr, "*** LATENCY TRACER ACTIVE ***\n");
+ trace_array_puts(tr, "*** Can not use snapshot (sorry) ***\n");
return;
}
@@ -9516,6 +9516,7 @@ static int __remove_instance(struct trace_array *tr)
tracefs_remove(tr->dir);
free_percpu(tr->last_func_repeats);
free_trace_buffers(tr);
+ clear_tracing_err_log(tr);
for (i = 0; i < tr->nr_topts; i++) {
kfree(tr->topts[i].topts);
@@ -10393,19 +10394,20 @@ out:
void __init ftrace_boot_snapshot(void)
{
+#ifdef CONFIG_TRACER_MAX_TRACE
struct trace_array *tr;
- if (snapshot_at_boot) {
- tracing_snapshot();
- internal_trace_puts("** Boot snapshot taken **\n");
- }
+ if (!snapshot_at_boot)
+ return;
list_for_each_entry(tr, &ftrace_trace_arrays, list) {
- if (tr == &global_trace)
+ if (!tr->allocated_snapshot)
continue;
- trace_array_puts(tr, "** Boot snapshot taken **\n");
+
tracing_snapshot_instance(tr);
+ trace_array_puts(tr, "** Boot snapshot taken **\n");
}
+#endif
}
void __init early_trace_init(void)
diff --git a/kernel/trace/trace_events_synth.c b/kernel/trace/trace_events_synth.c
index 46d0abb32d0f..d6a70aff2410 100644
--- a/kernel/trace/trace_events_synth.c
+++ b/kernel/trace/trace_events_synth.c
@@ -44,14 +44,21 @@ enum { ERRORS };
static const char *err_text[] = { ERRORS };
+static DEFINE_MUTEX(lastcmd_mutex);
static char *last_cmd;
static int errpos(const char *str)
{
+ int ret = 0;
+
+ mutex_lock(&lastcmd_mutex);
if (!str || !last_cmd)
- return 0;
+ goto out;
- return err_pos(last_cmd, str);
+ ret = err_pos(last_cmd, str);
+ out:
+ mutex_unlock(&lastcmd_mutex);
+ return ret;
}
static void last_cmd_set(const char *str)
@@ -59,18 +66,22 @@ static void last_cmd_set(const char *str)
if (!str)
return;
+ mutex_lock(&lastcmd_mutex);
kfree(last_cmd);
-
last_cmd = kstrdup(str, GFP_KERNEL);
+ mutex_unlock(&lastcmd_mutex);
}
static void synth_err(u8 err_type, u16 err_pos)
{
+ mutex_lock(&lastcmd_mutex);
if (!last_cmd)
- return;
+ goto out;
tracing_log_err(NULL, "synthetic_events", last_cmd, err_text,
err_type, err_pos);
+ out:
+ mutex_unlock(&lastcmd_mutex);
}
static int create_synth_event(const char *raw_command);
diff --git a/kernel/trace/trace_osnoise.c b/kernel/trace/trace_osnoise.c
index 9176bb7a9bb4..4496975f2029 100644
--- a/kernel/trace/trace_osnoise.c
+++ b/kernel/trace/trace_osnoise.c
@@ -1296,7 +1296,7 @@ static void notify_new_max_latency(u64 latency)
rcu_read_lock();
list_for_each_entry_rcu(inst, &osnoise_instances, list) {
tr = inst->tr;
- if (tr->max_latency < latency) {
+ if (tracer_tracing_is_on(tr) && tr->max_latency < latency) {
tr->max_latency = latency;
latency_fsnotify(tr);
}
@@ -1738,6 +1738,8 @@ static int timerlat_main(void *data)
trace_timerlat_sample(&s);
+ notify_new_max_latency(diff);
+
timerlat_dump_stack(time_to_us(diff));
tlat->tracing_thread = false;