diff options
Diffstat (limited to '')
-rw-r--r-- | kernel/bpf/cpumap.c | 35 | ||||
-rw-r--r-- | kernel/power/hibernate.c | 2 | ||||
-rw-r--r-- | kernel/trace/bpf_trace.c | 17 | ||||
-rw-r--r-- | kernel/workqueue.c | 43 |
4 files changed, 76 insertions, 21 deletions
diff --git a/kernel/bpf/cpumap.c b/kernel/bpf/cpumap.c index 6ae02be7a48e..286ab3db0fde 100644 --- a/kernel/bpf/cpumap.c +++ b/kernel/bpf/cpumap.c @@ -28,6 +28,7 @@ #include <linux/sched.h> #include <linux/workqueue.h> #include <linux/kthread.h> +#include <linux/completion.h> #include <trace/events/xdp.h> #include <linux/btf_ids.h> @@ -73,6 +74,7 @@ struct bpf_cpu_map_entry { struct rcu_head rcu; struct work_struct kthread_stop_wq; + struct completion kthread_running; }; struct bpf_cpu_map { @@ -129,11 +131,17 @@ static void __cpu_map_ring_cleanup(struct ptr_ring *ring) * invoked cpu_map_kthread_stop(). Catch any broken behaviour * gracefully and warn once. */ - struct xdp_frame *xdpf; + void *ptr; - while ((xdpf = ptr_ring_consume(ring))) - if (WARN_ON_ONCE(xdpf)) - xdp_return_frame(xdpf); + while ((ptr = ptr_ring_consume(ring))) { + WARN_ON_ONCE(1); + if (unlikely(__ptr_test_bit(0, &ptr))) { + __ptr_clear_bit(0, &ptr); + kfree_skb(ptr); + continue; + } + xdp_return_frame(ptr); + } } static void put_cpu_map_entry(struct bpf_cpu_map_entry *rcpu) @@ -153,7 +161,6 @@ static void put_cpu_map_entry(struct bpf_cpu_map_entry *rcpu) static void cpu_map_kthread_stop(struct work_struct *work) { struct bpf_cpu_map_entry *rcpu; - int err; rcpu = container_of(work, struct bpf_cpu_map_entry, kthread_stop_wq); @@ -163,14 +170,7 @@ static void cpu_map_kthread_stop(struct work_struct *work) rcu_barrier(); /* kthread_stop will wake_up_process and wait for it to complete */ - err = kthread_stop(rcpu->kthread); - if (err) { - /* kthread_stop may be called before cpu_map_kthread_run - * is executed, so we need to release the memory related - * to rcpu. - */ - put_cpu_map_entry(rcpu); - } + kthread_stop(rcpu->kthread); } static void cpu_map_bpf_prog_run_skb(struct bpf_cpu_map_entry *rcpu, @@ -298,11 +298,11 @@ static int cpu_map_bpf_prog_run(struct bpf_cpu_map_entry *rcpu, void **frames, return nframes; } - static int cpu_map_kthread_run(void *data) { struct bpf_cpu_map_entry *rcpu = data; + complete(&rcpu->kthread_running); set_current_state(TASK_INTERRUPTIBLE); /* When kthread gives stop order, then rcpu have been disconnected @@ -467,6 +467,7 @@ __cpu_map_entry_alloc(struct bpf_map *map, struct bpf_cpumap_val *value, goto free_ptr_ring; /* Setup kthread */ + init_completion(&rcpu->kthread_running); rcpu->kthread = kthread_create_on_node(cpu_map_kthread_run, rcpu, numa, "cpumap/%d/map:%d", cpu, map->id); @@ -480,6 +481,12 @@ __cpu_map_entry_alloc(struct bpf_map *map, struct bpf_cpumap_val *value, kthread_bind(rcpu->kthread, cpu); wake_up_process(rcpu->kthread); + /* Make sure kthread has been running, so kthread_stop() will not + * stop the kthread prematurely and all pending frames or skbs + * will be handled by the kthread before kthread_stop() returns. + */ + wait_for_completion(&rcpu->kthread_running); + return rcpu; free_prog: diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c index e1b4bfa938dd..2b4a946a6ff5 100644 --- a/kernel/power/hibernate.c +++ b/kernel/power/hibernate.c @@ -1166,7 +1166,7 @@ static ssize_t resume_store(struct kobject *kobj, struct kobj_attribute *attr, int error; if (!hibernation_available()) - return 0; + return n; if (len && buf[len-1] == '\n') len--; diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index 5f2dcabad202..bd1a42b23f3f 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -661,8 +661,7 @@ static DEFINE_PER_CPU(int, bpf_trace_nest_level); BPF_CALL_5(bpf_perf_event_output, struct pt_regs *, regs, struct bpf_map *, map, u64, flags, void *, data, u64, size) { - struct bpf_trace_sample_data *sds = this_cpu_ptr(&bpf_trace_sds); - int nest_level = this_cpu_inc_return(bpf_trace_nest_level); + struct bpf_trace_sample_data *sds; struct perf_raw_record raw = { .frag = { .size = size, @@ -670,7 +669,11 @@ BPF_CALL_5(bpf_perf_event_output, struct pt_regs *, regs, struct bpf_map *, map, }, }; struct perf_sample_data *sd; - int err; + int nest_level, err; + + preempt_disable(); + sds = this_cpu_ptr(&bpf_trace_sds); + nest_level = this_cpu_inc_return(bpf_trace_nest_level); if (WARN_ON_ONCE(nest_level > ARRAY_SIZE(sds->sds))) { err = -EBUSY; @@ -688,9 +691,9 @@ BPF_CALL_5(bpf_perf_event_output, struct pt_regs *, regs, struct bpf_map *, map, perf_sample_save_raw_data(sd, &raw); err = __bpf_perf_event_output(regs, map, flags, sd); - out: this_cpu_dec(bpf_trace_nest_level); + preempt_enable(); return err; } @@ -715,7 +718,6 @@ static DEFINE_PER_CPU(struct bpf_trace_sample_data, bpf_misc_sds); u64 bpf_event_output(struct bpf_map *map, u64 flags, void *meta, u64 meta_size, void *ctx, u64 ctx_size, bpf_ctx_copy_t ctx_copy) { - int nest_level = this_cpu_inc_return(bpf_event_output_nest_level); struct perf_raw_frag frag = { .copy = ctx_copy, .size = ctx_size, @@ -732,8 +734,12 @@ u64 bpf_event_output(struct bpf_map *map, u64 flags, void *meta, u64 meta_size, }; struct perf_sample_data *sd; struct pt_regs *regs; + int nest_level; u64 ret; + preempt_disable(); + nest_level = this_cpu_inc_return(bpf_event_output_nest_level); + if (WARN_ON_ONCE(nest_level > ARRAY_SIZE(bpf_misc_sds.sds))) { ret = -EBUSY; goto out; @@ -748,6 +754,7 @@ u64 bpf_event_output(struct bpf_map *map, u64 flags, void *meta, u64 meta_size, ret = __bpf_perf_event_output(regs, map, flags, sd); out: this_cpu_dec(bpf_event_output_nest_level); + preempt_enable(); return ret; } diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 02a8f402eeb5..800b4208dba9 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -52,6 +52,7 @@ #include <linux/sched/debug.h> #include <linux/nmi.h> #include <linux/kvm_para.h> +#include <linux/delay.h> #include "workqueue_internal.h" @@ -338,8 +339,10 @@ static cpumask_var_t *wq_numa_possible_cpumask; * Per-cpu work items which run for longer than the following threshold are * automatically considered CPU intensive and excluded from concurrency * management to prevent them from noticeably delaying other per-cpu work items. + * ULONG_MAX indicates that the user hasn't overridden it with a boot parameter. + * The actual value is initialized in wq_cpu_intensive_thresh_init(). */ -static unsigned long wq_cpu_intensive_thresh_us = 10000; +static unsigned long wq_cpu_intensive_thresh_us = ULONG_MAX; module_param_named(cpu_intensive_thresh_us, wq_cpu_intensive_thresh_us, ulong, 0644); static bool wq_disable_numa; @@ -6513,6 +6516,42 @@ void __init workqueue_init_early(void) !system_freezable_power_efficient_wq); } +static void __init wq_cpu_intensive_thresh_init(void) +{ + unsigned long thresh; + unsigned long bogo; + + /* if the user set it to a specific value, keep it */ + if (wq_cpu_intensive_thresh_us != ULONG_MAX) + return; + + /* + * The default of 10ms is derived from the fact that most modern (as of + * 2023) processors can do a lot in 10ms and that it's just below what + * most consider human-perceivable. However, the kernel also runs on a + * lot slower CPUs including microcontrollers where the threshold is way + * too low. + * + * Let's scale up the threshold upto 1 second if BogoMips is below 4000. + * This is by no means accurate but it doesn't have to be. The mechanism + * is still useful even when the threshold is fully scaled up. Also, as + * the reports would usually be applicable to everyone, some machines + * operating on longer thresholds won't significantly diminish their + * usefulness. + */ + thresh = 10 * USEC_PER_MSEC; + + /* see init/calibrate.c for lpj -> BogoMIPS calculation */ + bogo = max_t(unsigned long, loops_per_jiffy / 500000 * HZ, 1); + if (bogo < 4000) + thresh = min_t(unsigned long, thresh * 4000 / bogo, USEC_PER_SEC); + + pr_debug("wq_cpu_intensive_thresh: lpj=%lu BogoMIPS=%lu thresh_us=%lu\n", + loops_per_jiffy, bogo, thresh); + + wq_cpu_intensive_thresh_us = thresh; +} + /** * workqueue_init - bring workqueue subsystem fully online * @@ -6528,6 +6567,8 @@ void __init workqueue_init(void) struct worker_pool *pool; int cpu, bkt; + wq_cpu_intensive_thresh_init(); + /* * It'd be simpler to initialize NUMA in workqueue_init_early() but * CPU to node mapping may not be available that early on some |