From a85ee6401a47ae3fc64ba506cacb3e7873823c65 Mon Sep 17 00:00:00 2001 From: Kevin Hao Date: Sun, 23 Jan 2022 20:45:08 +0800 Subject: cpufreq: governor: Use kobject release() method to free dbs_data The struct dbs_data embeds a struct gov_attr_set and the struct gov_attr_set embeds a kobject. Since every kobject must have a release() method and we can't use kfree() to free it directly, so introduce cpufreq_dbs_data_release() to release the dbs_data via the kobject::release() method. This fixes the calltrace like below: ODEBUG: free active (active state 0) object type: timer_list hint: delayed_work_timer_fn+0x0/0x34 WARNING: CPU: 12 PID: 810 at lib/debugobjects.c:505 debug_print_object+0xb8/0x100 Modules linked in: CPU: 12 PID: 810 Comm: sh Not tainted 5.16.0-next-20220120-yocto-standard+ #536 Hardware name: Marvell OcteonTX CN96XX board (DT) pstate: 60400009 (nZCv daif +PAN -UAO -TCO -DIT -SSBS BTYPE=--) pc : debug_print_object+0xb8/0x100 lr : debug_print_object+0xb8/0x100 sp : ffff80001dfcf9a0 x29: ffff80001dfcf9a0 x28: 0000000000000001 x27: ffff0001464f0000 x26: 0000000000000000 x25: ffff8000090e3f00 x24: ffff80000af60210 x23: ffff8000094dfb78 x22: ffff8000090e3f00 x21: ffff0001080b7118 x20: ffff80000aeb2430 x19: ffff800009e8f5e0 x18: 0000000000000000 x17: 0000000000000002 x16: 00004d62e58be040 x15: 013590470523aff8 x14: ffff8000090e1828 x13: 0000000001359047 x12: 00000000f5257d14 x11: 0000000000040591 x10: 0000000066c1ffea x9 : ffff8000080d15e0 x8 : ffff80000a1765a8 x7 : 0000000000000000 x6 : 0000000000000001 x5 : ffff800009e8c000 x4 : ffff800009e8c760 x3 : 0000000000000000 x2 : 0000000000000000 x1 : 0000000000000000 x0 : ffff0001474ed040 Call trace: debug_print_object+0xb8/0x100 __debug_check_no_obj_freed+0x1d0/0x25c debug_check_no_obj_freed+0x24/0xa0 kfree+0x11c/0x440 cpufreq_dbs_governor_exit+0xa8/0xac cpufreq_exit_governor+0x44/0x90 cpufreq_set_policy+0x29c/0x570 store_scaling_governor+0x110/0x154 store+0xb0/0xe0 sysfs_kf_write+0x58/0x84 kernfs_fop_write_iter+0x12c/0x1c0 new_sync_write+0xf0/0x18c vfs_write+0x1cc/0x220 ksys_write+0x74/0x100 __arm64_sys_write+0x28/0x3c invoke_syscall.constprop.0+0x58/0xf0 do_el0_svc+0x70/0x170 el0_svc+0x54/0x190 el0t_64_sync_handler+0xa4/0x130 el0t_64_sync+0x1a0/0x1a4 irq event stamp: 189006 hardirqs last enabled at (189005): [] finish_task_switch.isra.0+0xe0/0x2c0 hardirqs last disabled at (189006): [] el1_dbg+0x24/0xa0 softirqs last enabled at (188966): [] __do_softirq+0x4b0/0x6a0 softirqs last disabled at (188957): [] __irq_exit_rcu+0x108/0x1a4 [ rjw: Because can be freed by the gov_attr_set_put() in cpufreq_dbs_governor_exit() now, it is also necessary to put the invocation of the governor ->exit() callback into the new cpufreq_dbs_data_release() function. ] Fixes: c4435630361d ("cpufreq: governor: New sysfs show/store callbacks for governor tunables") Signed-off-by: Kevin Hao Signed-off-by: Rafael J. Wysocki --- drivers/cpufreq/cpufreq_governor.c | 20 +++++++++++++------- drivers/cpufreq/cpufreq_governor.h | 1 + 2 files changed, 14 insertions(+), 7 deletions(-) (limited to 'drivers/cpufreq') diff --git a/drivers/cpufreq/cpufreq_governor.c b/drivers/cpufreq/cpufreq_governor.c index 0d42cf8b88d8..85da677c43d6 100644 --- a/drivers/cpufreq/cpufreq_governor.c +++ b/drivers/cpufreq/cpufreq_governor.c @@ -388,6 +388,15 @@ static void free_policy_dbs_info(struct policy_dbs_info *policy_dbs, gov->free(policy_dbs); } +static void cpufreq_dbs_data_release(struct kobject *kobj) +{ + struct dbs_data *dbs_data = to_dbs_data(to_gov_attr_set(kobj)); + struct dbs_governor *gov = dbs_data->gov; + + gov->exit(dbs_data); + kfree(dbs_data); +} + int cpufreq_dbs_governor_init(struct cpufreq_policy *policy) { struct dbs_governor *gov = dbs_governor_of(policy); @@ -425,6 +434,7 @@ int cpufreq_dbs_governor_init(struct cpufreq_policy *policy) goto free_policy_dbs_info; } + dbs_data->gov = gov; gov_attr_set_init(&dbs_data->attr_set, &policy_dbs->list); ret = gov->init(dbs_data); @@ -447,6 +457,7 @@ int cpufreq_dbs_governor_init(struct cpufreq_policy *policy) policy->governor_data = policy_dbs; gov->kobj_type.sysfs_ops = &governor_sysfs_ops; + gov->kobj_type.release = cpufreq_dbs_data_release; ret = kobject_init_and_add(&dbs_data->attr_set.kobj, &gov->kobj_type, get_governor_parent_kobj(policy), "%s", gov->gov.name); @@ -488,13 +499,8 @@ void cpufreq_dbs_governor_exit(struct cpufreq_policy *policy) policy->governor_data = NULL; - if (!count) { - if (!have_governor_per_policy()) - gov->gdbs_data = NULL; - - gov->exit(dbs_data); - kfree(dbs_data); - } + if (!count && !have_governor_per_policy()) + gov->gdbs_data = NULL; free_policy_dbs_info(policy_dbs, gov); diff --git a/drivers/cpufreq/cpufreq_governor.h b/drivers/cpufreq/cpufreq_governor.h index a5a0bc3cc23e..168c23fd7fca 100644 --- a/drivers/cpufreq/cpufreq_governor.h +++ b/drivers/cpufreq/cpufreq_governor.h @@ -37,6 +37,7 @@ enum {OD_NORMAL_SAMPLE, OD_SUB_SAMPLE}; /* Governor demand based switching data (per-policy or global). */ struct dbs_data { struct gov_attr_set attr_set; + struct dbs_governor *gov; void *tuners; unsigned int ignore_nice_load; unsigned int sampling_rate; -- cgit v1.2.3 From 95996a675757a0f3f75babb8cd4f2ebcd0bda478 Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Fri, 1 Apr 2022 19:24:28 +0200 Subject: cpufreq: Prepare cleanup of powerpc's asm/prom.h powerpc's asm/prom.h brings some headers that it doesn't need itself. In order to clean it up, first add missing headers in users of asm/prom.h Signed-off-by: Christophe Leroy Acked-by: Viresh Kumar Signed-off-by: Rafael J. Wysocki --- drivers/cpufreq/pasemi-cpufreq.c | 1 - drivers/cpufreq/pmac32-cpufreq.c | 2 +- drivers/cpufreq/pmac64-cpufreq.c | 2 +- drivers/cpufreq/ppc_cbe_cpufreq.c | 1 - drivers/cpufreq/ppc_cbe_cpufreq_pmi.c | 2 +- 5 files changed, 3 insertions(+), 5 deletions(-) (limited to 'drivers/cpufreq') diff --git a/drivers/cpufreq/pasemi-cpufreq.c b/drivers/cpufreq/pasemi-cpufreq.c index 815645170c4d..039a66bbe1be 100644 --- a/drivers/cpufreq/pasemi-cpufreq.c +++ b/drivers/cpufreq/pasemi-cpufreq.c @@ -18,7 +18,6 @@ #include #include -#include #include #include diff --git a/drivers/cpufreq/pmac32-cpufreq.c b/drivers/cpufreq/pmac32-cpufreq.c index 4f20c6a9108d..20f64a8b0a35 100644 --- a/drivers/cpufreq/pmac32-cpufreq.c +++ b/drivers/cpufreq/pmac32-cpufreq.c @@ -24,7 +24,7 @@ #include #include #include -#include + #include #include #include diff --git a/drivers/cpufreq/pmac64-cpufreq.c b/drivers/cpufreq/pmac64-cpufreq.c index d7542a106e6b..ba9c31d98bd6 100644 --- a/drivers/cpufreq/pmac64-cpufreq.c +++ b/drivers/cpufreq/pmac64-cpufreq.c @@ -22,7 +22,7 @@ #include #include #include -#include + #include #include #include diff --git a/drivers/cpufreq/ppc_cbe_cpufreq.c b/drivers/cpufreq/ppc_cbe_cpufreq.c index c58abb4cca3a..e3313ce63b38 100644 --- a/drivers/cpufreq/ppc_cbe_cpufreq.c +++ b/drivers/cpufreq/ppc_cbe_cpufreq.c @@ -12,7 +12,6 @@ #include #include -#include #include #include "ppc_cbe_cpufreq.h" diff --git a/drivers/cpufreq/ppc_cbe_cpufreq_pmi.c b/drivers/cpufreq/ppc_cbe_cpufreq_pmi.c index 037fe23bc6ed..4fba3637b115 100644 --- a/drivers/cpufreq/ppc_cbe_cpufreq_pmi.c +++ b/drivers/cpufreq/ppc_cbe_cpufreq_pmi.c @@ -13,9 +13,9 @@ #include #include #include +#include #include -#include #include #include -- cgit v1.2.3 From addca285120b0edf2fef795f7809c83774cf74b7 Mon Sep 17 00:00:00 2001 From: Chen Yu Date: Fri, 8 Apr 2022 07:42:58 +0800 Subject: cpufreq: intel_pstate: Handle no_turbo in frequency invariance Problem statement: Once the user has disabled turbo frequency by # echo 1 > /sys/devices/system/cpu/intel_pstate/no_turbo the cfs_rq's util_avg becomes quite small when compared with CPU capacity. Step to reproduce: # echo 1 > /sys/devices/system/cpu/intel_pstate/no_turbo # ./x86_cpuload --count 1 --start 3 --timeout 100 --busy 99 would launch 1 thread and bind it to CPU3, lasting for 100 seconds, with a CPU utilization of 99%. [1] top result: %Cpu3 : 98.4 us, 0.0 sy, 0.0 ni, 1.6 id, 0.0 wa, 0.0 hi, 0.0 si, 0.0 st check util_avg: cat /sys/kernel/debug/sched/debug | grep "cfs_rq\[3\]" -A 20 | grep util_avg .util_avg : 611 So the util_avg/cpu capacity is 611/1024, which is much smaller than 98.4% shown in the top result. This might impact some logic in the scheduler. For example, group_is_overloaded() would compare the group_capacity and group_util in the sched group, to check if this sched group is overloaded or not. With this gap, even when there is a nearly 100% workload, the sched group will not be regarded as overloaded. Besides group_is_overloaded(), there are also other victims. There is a ongoing work that aims to optimize the task wakeup in a LLC domain. The main idea is to stop searching idle CPUs if the sched domain is overloaded[2]. This proposal also relies on the util_avg/CPU capacity to decide whether the LLC domain is overloaded. Analysis: CPU frequency invariance has caused this difference. In summary, the util_sum of cfs rq would decay quite fast when the CPU is in idle, when the CPU frequency invariance is enabled. The detail is as followed: As depicted in update_rq_clock_pelt(), when the frequency invariance is enabled, there would be two clock variables on each rq, clock_task and clock_pelt: The clock_pelt scales the time to reflect the effective amount of computation done during the running delta time but then syncs back to clock_task when rq is idle. absolute time | 1| 2| 3| 4| 5| 6| 7| 8| 9|10|11|12|13|14|15|16 @ max frequency ------******---------------******--------------- @ half frequency ------************---------************--------- clock pelt | 1| 2| 3| 4| 7| 8| 9| 10| 11|14|15|16 The fast decay of util_sum during idle is due to: 1. rq->clock_pelt is always behind rq->clock_task 2. rq->last_update is updated to rq->clock_pelt' after invoking ___update_load_sum() 3. Then the CPU becomes idle, the rq->clock_pelt' would be suddenly increased a lot to rq->clock_task 4. Enters ___update_load_sum() again, the idle period is calculated by rq->clock_task - rq->last_update, AKA, rq->clock_task - rq->clock_pelt'. The lower the CPU frequency is, the larger the delta = rq->clock_task - rq->clock_pelt' will be. Since the idle period will be used to decay the util_sum only, the util_sum drops significantly during idle period. Proposal: This symptom is not only caused by disabling turbo frequency, but it would also appear if the user limits the max frequency at runtime. Because, if the frequency is always lower than the max frequency, CPU frequency invariance would decay the util_sum quite fast during idle. As some end users would disable turbo after boot up, this patch aims to present this symptom and deals with turbo scenarios for now. It might be ideal if CPU frequency invariance is aware of the max CPU frequency (user specified) at runtime in the future. Link: https://github.com/yu-chen-surf/x86_cpuload.git #1 Link: https://lore.kernel.org/lkml/20220310005228.11737-1-yu.c.chen@intel.com/ #2 Signed-off-by: Chen Yu Acked-by: Peter Zijlstra (Intel) Reviewed-by: Giovanni Gherdovich Signed-off-by: Rafael J. Wysocki --- drivers/cpufreq/intel_pstate.c | 1 + 1 file changed, 1 insertion(+) (limited to 'drivers/cpufreq') diff --git a/drivers/cpufreq/intel_pstate.c b/drivers/cpufreq/intel_pstate.c index 846bb3a78788..2216b24b6f84 100644 --- a/drivers/cpufreq/intel_pstate.c +++ b/drivers/cpufreq/intel_pstate.c @@ -1322,6 +1322,7 @@ static ssize_t store_no_turbo(struct kobject *a, struct kobj_attribute *b, mutex_unlock(&intel_pstate_limits_lock); intel_pstate_update_policies(); + arch_set_max_freq_ratio(global.no_turbo); mutex_unlock(&intel_pstate_driver_lock); -- cgit v1.2.3 From f346e96267cd76175d6c201b40f770c0116a8a04 Mon Sep 17 00:00:00 2001 From: Schspa Shi Date: Thu, 21 Apr 2022 03:15:41 +0800 Subject: cpufreq: Fix possible race in cpufreq online error path When cpufreq online fails, the policy->cpus mask is not cleared and policy->rwsem is released too early, so the driver can be invoked via the cpuinfo_cur_freq sysfs attribute while its ->offline() or ->exit() callbacks are being run. Take policy->clk as an example: static int cpufreq_online(unsigned int cpu) { ... // policy->cpus != 0 at this time down_write(&policy->rwsem); ret = cpufreq_add_dev_interface(policy); up_write(&policy->rwsem); return 0; out_destroy_policy: for_each_cpu(j, policy->real_cpus) remove_cpu_dev_symlink(policy, get_cpu_device(j)); up_write(&policy->rwsem); ... out_exit_policy: if (cpufreq_driver->exit) cpufreq_driver->exit(policy); clk_put(policy->clk); // policy->clk is a wild pointer ... ^ | Another process access __cpufreq_get cpufreq_verify_current_freq cpufreq_generic_get // acces wild pointer of policy->clk; | | out_offline_policy: | cpufreq_policy_free(policy); | // deleted here, and will wait for no body reference cpufreq_policy_put_kobj(policy); } Address this by modifying cpufreq_online() to release policy->rwsem in the error path after the driver callbacks have run and to clear policy->cpus before releasing the semaphore. Fixes: 7106e02baed4 ("cpufreq: release policy->rwsem on error") Signed-off-by: Schspa Shi [ rjw: Subject and changelog edits ] Signed-off-by: Rafael J. Wysocki --- drivers/cpufreq/cpufreq.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'drivers/cpufreq') diff --git a/drivers/cpufreq/cpufreq.c b/drivers/cpufreq/cpufreq.c index 80f535cc8a75..0d58b0f8f3af 100644 --- a/drivers/cpufreq/cpufreq.c +++ b/drivers/cpufreq/cpufreq.c @@ -1533,8 +1533,6 @@ out_destroy_policy: for_each_cpu(j, policy->real_cpus) remove_cpu_dev_symlink(policy, get_cpu_device(j)); - up_write(&policy->rwsem); - out_offline_policy: if (cpufreq_driver->offline) cpufreq_driver->offline(policy); @@ -1543,6 +1541,9 @@ out_exit_policy: if (cpufreq_driver->exit) cpufreq_driver->exit(policy); + cpumask_clear(policy->cpus); + up_write(&policy->rwsem); + out_free_policy: cpufreq_policy_free(policy); return ret; -- cgit v1.2.3