Diffstat (limited to 'drivers/cpuidle')
-rw-r--r--   drivers/cpuidle/Kconfig                |   1
-rw-r--r--   drivers/cpuidle/Kconfig.arm            |  10
-rw-r--r--   drivers/cpuidle/cpuidle-arm.c          |   4
-rw-r--r--   drivers/cpuidle/cpuidle-big_little.c   |  12
-rw-r--r--   drivers/cpuidle/cpuidle-haltpoll.c     |   2
-rw-r--r--   drivers/cpuidle/cpuidle-mvebu-v7.c     |  15
-rw-r--r--   drivers/cpuidle/cpuidle-psci-domain.c  |   7
-rw-r--r--   drivers/cpuidle/cpuidle-psci.c         |  25
-rw-r--r--   drivers/cpuidle/cpuidle-qcom-spm.c     |   4
-rw-r--r--   drivers/cpuidle/cpuidle-riscv-sbi.c    |  19
-rw-r--r--   drivers/cpuidle/cpuidle-tegra.c        |  31
-rw-r--r--   drivers/cpuidle/cpuidle.c              |  72
-rw-r--r--   drivers/cpuidle/driver.c               |   4
-rw-r--r--   drivers/cpuidle/dt_idle_states.c       |   2
-rw-r--r--   drivers/cpuidle/governors/teo.c        | 102
-rw-r--r--   drivers/cpuidle/poll_state.c           |   8
-rw-r--r--   drivers/cpuidle/sysfs.c                |   6
17 files changed, 244 insertions(+), 80 deletions(-)
diff --git a/drivers/cpuidle/Kconfig b/drivers/cpuidle/Kconfig
index ff71dd662880..cac5997dca50 100644
--- a/drivers/cpuidle/Kconfig
+++ b/drivers/cpuidle/Kconfig
@@ -74,6 +74,7 @@ endmenu
 config HALTPOLL_CPUIDLE
         tristate "Halt poll cpuidle driver"
         depends on X86 && KVM_GUEST
+        select CPU_IDLE_GOV_HALTPOLL
         default y
         help
           This option enables halt poll cpuidle driver, which allows to poll
diff --git a/drivers/cpuidle/Kconfig.arm b/drivers/cpuidle/Kconfig.arm
index 747aa537389b..a1ee475d180d 100644
--- a/drivers/cpuidle/Kconfig.arm
+++ b/drivers/cpuidle/Kconfig.arm
@@ -24,6 +24,14 @@ config ARM_PSCI_CPUIDLE
           It provides an idle driver that is capable of detecting and
           managing idle states through the PSCI firmware interface.
 
+          The driver has limitations when used with PREEMPT_RT:
+          - If the idle states are described with the non-hierarchical layout,
+            all idle states are still available.
+
+          - If the idle states are described with the hierarchical layout,
+            only the idle states defined per CPU are available, but not the ones
+            being shared among a group of CPUs (aka cluster idle states).
+
 config ARM_PSCI_CPUIDLE_DOMAIN
         bool "PSCI CPU idle Domain"
         depends on ARM_PSCI_CPUIDLE
@@ -102,6 +110,7 @@ config ARM_MVEBU_V7_CPUIDLE
 config ARM_TEGRA_CPUIDLE
         bool "CPU Idle Driver for NVIDIA Tegra SoCs"
         depends on (ARCH_TEGRA || COMPILE_TEST) && !ARM64 && MMU
+        depends on ARCH_SUSPEND_POSSIBLE
         select ARCH_NEEDS_CPU_IDLE_COUPLED if SMP
         select ARM_CPU_SUSPEND
         help
@@ -110,6 +119,7 @@ config ARM_TEGRA_CPUIDLE
 config ARM_QCOM_SPM_CPUIDLE
         bool "CPU Idle Driver for Qualcomm Subsystem Power Manager (SPM)"
         depends on (ARCH_QCOM || COMPILE_TEST) && !ARM64 && MMU
+        depends on ARCH_SUSPEND_POSSIBLE
         select ARM_CPU_SUSPEND
         select CPU_IDLE_MULTIPLE_DRIVERS
         select DT_IDLE_STATES
diff --git a/drivers/cpuidle/cpuidle-arm.c b/drivers/cpuidle/cpuidle-arm.c
index 8c758920d699..7cfb980a357d 100644
--- a/drivers/cpuidle/cpuidle-arm.c
+++ b/drivers/cpuidle/cpuidle-arm.c
@@ -31,8 +31,8 @@
  * Called from the CPUidle framework to program the device to the
  * specified target state selected by the governor.
  */
-static int arm_enter_idle_state(struct cpuidle_device *dev,
-                                struct cpuidle_driver *drv, int idx)
+static __cpuidle int arm_enter_idle_state(struct cpuidle_device *dev,
+                                          struct cpuidle_driver *drv, int idx)
 {
         /*
          * Pass idle state index to arm_cpuidle_suspend which in turn
diff --git a/drivers/cpuidle/cpuidle-big_little.c b/drivers/cpuidle/cpuidle-big_little.c
index abe51185f243..74972deda0ea 100644
--- a/drivers/cpuidle/cpuidle-big_little.c
+++ b/drivers/cpuidle/cpuidle-big_little.c
@@ -64,7 +64,8 @@ static struct cpuidle_driver bl_idle_little_driver = {
                 .enter                  = bl_enter_powerdown,
                 .exit_latency           = 700,
                 .target_residency       = 2500,
-                .flags                  = CPUIDLE_FLAG_TIMER_STOP,
+                .flags                  = CPUIDLE_FLAG_TIMER_STOP |
+                                          CPUIDLE_FLAG_RCU_IDLE,
                 .name                   = "C1",
                 .desc                   = "ARM little-cluster power down",
         },
@@ -85,7 +86,8 @@ static struct cpuidle_driver bl_idle_big_driver = {
                 .enter                  = bl_enter_powerdown,
                 .exit_latency           = 500,
                 .target_residency       = 2000,
-                .flags                  = CPUIDLE_FLAG_TIMER_STOP,
+                .flags                  = CPUIDLE_FLAG_TIMER_STOP |
+                                          CPUIDLE_FLAG_RCU_IDLE,
                 .name                   = "C1",
                 .desc                   = "ARM big-cluster power down",
         },
@@ -120,15 +122,17 @@ static int notrace bl_powerdown_finisher(unsigned long arg)
  * Called from the CPUidle framework to program the device to the
  * specified target state selected by the governor.
  */
-static int bl_enter_powerdown(struct cpuidle_device *dev,
-                              struct cpuidle_driver *drv, int idx)
+static __cpuidle int bl_enter_powerdown(struct cpuidle_device *dev,
+                                        struct cpuidle_driver *drv, int idx)
 {
         cpu_pm_enter();
+        ct_cpuidle_enter();
 
         cpu_suspend(0, bl_powerdown_finisher);
 
         /* signals the MCPM core that CPU is out of low power state */
         mcpm_cpu_powered_up();
+        ct_cpuidle_exit();
 
         cpu_pm_exit();
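The cpuidle-arm and cpuidle-big_little conversions above show the pattern that repeats throughout this series: once an idle state carries CPUIDLE_FLAG_RCU_IDLE, the cpuidle core no longer brackets the state entry with context-tracking calls, so the driver's enter callback must be marked __cpuidle (keeping it out of the instrumentable text sections) and must call ct_cpuidle_enter()/ct_cpuidle_exit() around the actual low-power entry itself. A minimal sketch of that contract for a hypothetical driver; my_enter_idle() and my_hw_suspend() are illustrative names, not part of this diff:

static __cpuidle int my_enter_idle(struct cpuidle_device *dev,
                                   struct cpuidle_driver *drv, int idx)
{
        int ret;

        cpu_pm_enter();                 /* CPU PM notifiers; still traceable */

        ct_cpuidle_enter();             /* RCU stops watching this CPU */
        ret = my_hw_suspend(idx);       /* low-level power down; no tracing in here */
        ct_cpuidle_exit();              /* RCU is watching again */

        cpu_pm_exit();

        return ret ? -1 : idx;
}

The matching state definition then sets .flags = CPUIDLE_FLAG_TIMER_STOP | CPUIDLE_FLAG_RCU_IDLE, as bl_idle_little_driver does above.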
diff --git a/drivers/cpuidle/cpuidle-haltpoll.c b/drivers/cpuidle/cpuidle-haltpoll.c
index 3a39a7f48b77..e66df22f9695 100644
--- a/drivers/cpuidle/cpuidle-haltpoll.c
+++ b/drivers/cpuidle/cpuidle-haltpoll.c
@@ -32,7 +32,7 @@ static int default_enter_idle(struct cpuidle_device *dev,
                 local_irq_enable();
                 return index;
         }
-        default_idle();
+        arch_cpu_idle();
         return index;
 }
diff --git a/drivers/cpuidle/cpuidle-mvebu-v7.c b/drivers/cpuidle/cpuidle-mvebu-v7.c
index 01a856971f05..563dba609b98 100644
--- a/drivers/cpuidle/cpuidle-mvebu-v7.c
+++ b/drivers/cpuidle/cpuidle-mvebu-v7.c
@@ -25,9 +25,9 @@
 
 static int (*mvebu_v7_cpu_suspend)(int);
 
-static int mvebu_v7_enter_idle(struct cpuidle_device *dev,
-                               struct cpuidle_driver *drv,
-                               int index)
+static __cpuidle int mvebu_v7_enter_idle(struct cpuidle_device *dev,
+                                         struct cpuidle_driver *drv,
+                                         int index)
 {
         int ret;
         bool deepidle = false;
@@ -36,7 +36,10 @@ static int mvebu_v7_enter_idle(struct cpuidle_device *dev,
         if (drv->states[index].flags & MVEBU_V7_FLAG_DEEP_IDLE)
                 deepidle = true;
 
+        ct_cpuidle_enter();
         ret = mvebu_v7_cpu_suspend(deepidle);
+        ct_cpuidle_exit();
+
         cpu_pm_exit();
 
         if (ret)
@@ -53,6 +56,7 @@ static struct cpuidle_driver armadaxp_idle_driver = {
                 .exit_latency           = 100,
                 .power_usage            = 50,
                 .target_residency       = 1000,
+                .flags                  = CPUIDLE_FLAG_RCU_IDLE,
                 .name                   = "MV CPU IDLE",
                 .desc                   = "CPU power down",
         },
@@ -61,7 +65,7 @@ static struct cpuidle_driver armadaxp_idle_driver = {
                 .exit_latency           = 1000,
                 .power_usage            = 5,
                 .target_residency       = 10000,
-                .flags                  = MVEBU_V7_FLAG_DEEP_IDLE,
+                .flags                  = MVEBU_V7_FLAG_DEEP_IDLE | CPUIDLE_FLAG_RCU_IDLE,
                 .name                   = "MV CPU DEEP IDLE",
                 .desc                   = "CPU and L2 Fabric power down",
         },
@@ -76,7 +80,7 @@ static struct cpuidle_driver armada370_idle_driver = {
                 .exit_latency           = 100,
                 .power_usage            = 5,
                 .target_residency       = 1000,
-                .flags                  = MVEBU_V7_FLAG_DEEP_IDLE,
+                .flags                  = MVEBU_V7_FLAG_DEEP_IDLE | CPUIDLE_FLAG_RCU_IDLE,
                 .name                   = "Deep Idle",
                 .desc                   = "CPU and L2 Fabric power down",
         },
@@ -91,6 +95,7 @@ static struct cpuidle_driver armada38x_idle_driver = {
                 .exit_latency           = 10,
                 .power_usage            = 5,
                 .target_residency       = 100,
+                .flags                  = CPUIDLE_FLAG_RCU_IDLE,
                 .name                   = "Idle",
                 .desc                   = "CPU and SCU power down",
         },
diff --git a/drivers/cpuidle/cpuidle-psci-domain.c b/drivers/cpuidle/cpuidle-psci-domain.c
index c80cf9ddabd8..6ad2954948a5 100644
--- a/drivers/cpuidle/cpuidle-psci-domain.c
+++ b/drivers/cpuidle/cpuidle-psci-domain.c
@@ -64,8 +64,11 @@ static int psci_pd_init(struct device_node *np, bool use_osi)
 
         pd->flags |= GENPD_FLAG_IRQ_SAFE | GENPD_FLAG_CPU_DOMAIN;
 
-        /* Allow power off when OSI has been successfully enabled. */
-        if (use_osi)
+        /*
+         * Allow power off when OSI has been successfully enabled.
+         * PREEMPT_RT is not yet ready to enter domain idle states.
+         */
+        if (use_osi && !IS_ENABLED(CONFIG_PREEMPT_RT))
                 pd->power_off = psci_pd_power_off;
         else
                 pd->flags |= GENPD_FLAG_ALWAYS_ON;
diff --git a/drivers/cpuidle/cpuidle-psci.c b/drivers/cpuidle/cpuidle-psci.c
index 57bc3e3ae391..6de027f9f6f5 100644
--- a/drivers/cpuidle/cpuidle-psci.c
+++ b/drivers/cpuidle/cpuidle-psci.c
@@ -49,14 +49,9 @@ static inline u32 psci_get_domain_state(void)
         return __this_cpu_read(domain_state);
 }
 
-static inline int psci_enter_state(int idx, u32 state)
-{
-        return CPU_PM_CPU_IDLE_ENTER_PARAM(psci_cpu_suspend_enter, idx, state);
-}
-
-static int __psci_enter_domain_idle_state(struct cpuidle_device *dev,
-                                          struct cpuidle_driver *drv, int idx,
-                                          bool s2idle)
+static __cpuidle int __psci_enter_domain_idle_state(struct cpuidle_device *dev,
+                                                    struct cpuidle_driver *drv, int idx,
+                                                    bool s2idle)
 {
         struct psci_cpuidle_data *data = this_cpu_ptr(&psci_cpuidle_data);
         u32 *states = data->psci_states;
@@ -69,12 +64,10 @@ static int __psci_enter_domain_idle_state(struct cpuidle_device *dev,
                 return -1;
 
         /* Do runtime PM to manage a hierarchical CPU toplogy. */
-        ct_irq_enter_irqson();
         if (s2idle)
                 dev_pm_genpd_suspend(pd_dev);
         else
                 pm_runtime_put_sync_suspend(pd_dev);
-        ct_irq_exit_irqson();
 
         state = psci_get_domain_state();
         if (!state)
@@ -82,12 +75,10 @@ static int __psci_enter_domain_idle_state(struct cpuidle_device *dev,
 
         ret = psci_cpu_suspend_enter(state) ? -1 : idx;
 
-        ct_irq_enter_irqson();
         if (s2idle)
                 dev_pm_genpd_resume(pd_dev);
         else
                 pm_runtime_get_sync(pd_dev);
-        ct_irq_exit_irqson();
 
         cpu_pm_exit();
 
@@ -192,12 +183,12 @@ static void psci_idle_init_cpuhp(void)
                 pr_warn("Failed %d while setup cpuhp state\n", err);
 }
 
-static int psci_enter_idle_state(struct cpuidle_device *dev,
-                                 struct cpuidle_driver *drv, int idx)
+static __cpuidle int psci_enter_idle_state(struct cpuidle_device *dev,
+                                           struct cpuidle_driver *drv, int idx)
 {
         u32 *state = __this_cpu_read(psci_cpuidle_data.psci_states);
 
-        return psci_enter_state(idx, state[idx]);
+        return CPU_PM_CPU_IDLE_ENTER_PARAM_RCU(psci_cpu_suspend_enter, idx, state[idx]);
 }
 
 static const struct of_device_id psci_idle_state_match[] = {
@@ -231,6 +222,9 @@ static int psci_dt_cpu_init_topology(struct cpuidle_driver *drv,
         if (!psci_has_osi_support())
                 return 0;
 
+        if (IS_ENABLED(CONFIG_PREEMPT_RT))
+                return 0;
+
         data->dev = psci_dt_attach_cpu(cpu);
         if (IS_ERR_OR_NULL(data->dev))
                 return PTR_ERR_OR_ZERO(data->dev);
@@ -240,6 +234,7 @@ static int psci_dt_cpu_init_topology(struct cpuidle_driver *drv,
          * of a shared state for the domain, assumes the domain states are all
          * deeper states.
          */
+        drv->states[state_count - 1].flags |= CPUIDLE_FLAG_RCU_IDLE;
         drv->states[state_count - 1].enter = psci_enter_domain_idle_state;
         drv->states[state_count - 1].enter_s2idle = psci_enter_s2idle_domain_idle_state;
         psci_cpuidle_use_cpuhp = true;
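psci_enter_idle_state() now drops its psci_enter_state() wrapper and uses the _RCU variant of the CPU_PM_CPU_IDLE_ENTER_PARAM() helper. As a rough, simplified sketch of what the helper arranges (the real macro in include/linux/cpuidle.h additionally special-cases state 0 and retention states):

        /* approximate shape of CPU_PM_CPU_IDLE_ENTER_PARAM_RCU(fn, idx, state) */
        ret = cpu_pm_enter();
        if (!ret) {
                ct_cpuidle_enter();     /* RCU bracketing done here, not by the core */
                ret = psci_cpu_suspend_enter(state[idx]) ? -1 : idx;
                ct_cpuidle_exit();
                cpu_pm_exit();
        }

This is consistent with dt_idle_states.c (further below) now setting CPUIDLE_FLAG_RCU_IDLE on every DT-described state.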
diff --git a/drivers/cpuidle/cpuidle-qcom-spm.c b/drivers/cpuidle/cpuidle-qcom-spm.c
index beedf22cbe78..326bca154ac7 100644
--- a/drivers/cpuidle/cpuidle-qcom-spm.c
+++ b/drivers/cpuidle/cpuidle-qcom-spm.c
@@ -58,8 +58,8 @@ static int qcom_cpu_spc(struct spm_driver_data *drv)
         return ret;
 }
 
-static int spm_enter_idle_state(struct cpuidle_device *dev,
-                                struct cpuidle_driver *drv, int idx)
+static __cpuidle int spm_enter_idle_state(struct cpuidle_device *dev,
+                                          struct cpuidle_driver *drv, int idx)
 {
         struct cpuidle_qcom_spm_data *data = container_of(drv, struct cpuidle_qcom_spm_data,
                                                           cpuidle_driver);
diff --git a/drivers/cpuidle/cpuidle-riscv-sbi.c b/drivers/cpuidle/cpuidle-riscv-sbi.c
index 05fe2902df9a..be383f4b6855 100644
--- a/drivers/cpuidle/cpuidle-riscv-sbi.c
+++ b/drivers/cpuidle/cpuidle-riscv-sbi.c
@@ -93,8 +93,8 @@ static int sbi_suspend(u32 state)
         return sbi_suspend_finisher(state, 0, 0);
 }
 
-static int sbi_cpuidle_enter_state(struct cpuidle_device *dev,
-                                   struct cpuidle_driver *drv, int idx)
+static __cpuidle int sbi_cpuidle_enter_state(struct cpuidle_device *dev,
+                                             struct cpuidle_driver *drv, int idx)
 {
         u32 *states = __this_cpu_read(sbi_cpuidle_data.states);
         u32 state = states[idx];
@@ -106,9 +106,9 @@ static int sbi_cpuidle_enter_state(struct cpuidle_device *dev,
                                    idx, state);
 }
 
-static int __sbi_enter_domain_idle_state(struct cpuidle_device *dev,
-                                         struct cpuidle_driver *drv, int idx,
-                                         bool s2idle)
+static __cpuidle int __sbi_enter_domain_idle_state(struct cpuidle_device *dev,
+                                                   struct cpuidle_driver *drv, int idx,
+                                                   bool s2idle)
 {
         struct sbi_cpuidle_data *data = this_cpu_ptr(&sbi_cpuidle_data);
         u32 *states = data->states;
@@ -121,12 +121,12 @@ static int __sbi_enter_domain_idle_state(struct cpuidle_device *dev,
                 return -1;
 
         /* Do runtime PM to manage a hierarchical CPU toplogy. */
-        ct_irq_enter_irqson();
         if (s2idle)
                 dev_pm_genpd_suspend(pd_dev);
         else
                 pm_runtime_put_sync_suspend(pd_dev);
-        ct_irq_exit_irqson();
+
+        ct_cpuidle_enter();
 
         if (sbi_is_domain_state_available())
                 state = sbi_get_domain_state();
@@ -135,12 +135,12 @@ static int __sbi_enter_domain_idle_state(struct cpuidle_device *dev,
 
         ret = sbi_suspend(state) ? -1 : idx;
 
-        ct_irq_enter_irqson();
+        ct_cpuidle_exit();
+
         if (s2idle)
                 dev_pm_genpd_resume(pd_dev);
         else
                 pm_runtime_get_sync(pd_dev);
-        ct_irq_exit_irqson();
 
         cpu_pm_exit();
 
@@ -251,6 +251,7 @@ static int sbi_dt_cpu_init_topology(struct cpuidle_driver *drv,
          * of a shared state for the domain, assumes the domain states are all
          * deeper states.
          */
+        drv->states[state_count - 1].flags |= CPUIDLE_FLAG_RCU_IDLE;
         drv->states[state_count - 1].enter = sbi_enter_domain_idle_state;
         drv->states[state_count - 1].enter_s2idle = sbi_enter_s2idle_domain_idle_state;
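Both hierarchical ("domain idle") paths, PSCI and SBI, now share the same shape; the SBI version makes the context-tracking bracketing explicit, while in the PSCI version it happens further down, inside psci_cpu_suspend_enter(). Condensed from __sbi_enter_domain_idle_state() above, with error handling and the s2idle branch omitted:

        if (cpu_pm_enter())
                return -1;

        /* Drop the runtime PM reference; genpd may now pick a cluster state. */
        pm_runtime_put_sync_suspend(pd_dev);

        ct_cpuidle_enter();                     /* RCU-idle section begins */
        state = sbi_get_domain_state();         /* state aggregated by genpd */
        ret = sbi_suspend(state) ? -1 : idx;
        ct_cpuidle_exit();                      /* RCU-idle section ends */

        pm_runtime_get_sync(pd_dev);            /* power the domain back up */
        cpu_pm_exit();

The ct_irq_enter_irqson()/ct_irq_exit_irqson() workaround around the genpd calls can go away because those calls now run before RCU is told the CPU is idle, rather than after.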
diff --git a/drivers/cpuidle/cpuidle-tegra.c b/drivers/cpuidle/cpuidle-tegra.c
index 9845629aeb6d..b203a93deac5 100644
--- a/drivers/cpuidle/cpuidle-tegra.c
+++ b/drivers/cpuidle/cpuidle-tegra.c
@@ -160,8 +160,8 @@ static int tegra_cpuidle_coupled_barrier(struct cpuidle_device *dev)
         return 0;
 }
 
-static int tegra_cpuidle_state_enter(struct cpuidle_device *dev,
-                                     int index, unsigned int cpu)
+static __cpuidle int tegra_cpuidle_state_enter(struct cpuidle_device *dev,
+                                               int index, unsigned int cpu)
 {
         int err;
 
@@ -180,9 +180,11 @@ static int tegra_cpuidle_state_enter(struct cpuidle_device *dev,
         }
 
         local_fiq_disable();
-        RCU_NONIDLE(tegra_pm_set_cpu_in_lp2());
+        tegra_pm_set_cpu_in_lp2();
         cpu_pm_enter();
 
+        ct_cpuidle_enter();
+
         switch (index) {
         case TEGRA_C7:
                 err = tegra_cpuidle_c7_enter();
@@ -197,8 +199,10 @@ static int tegra_cpuidle_state_enter(struct cpuidle_device *dev,
                 break;
         }
 
+        ct_cpuidle_exit();
+
         cpu_pm_exit();
-        RCU_NONIDLE(tegra_pm_clear_cpu_in_lp2());
+        tegra_pm_clear_cpu_in_lp2();
         local_fiq_enable();
 
         return err ?: index;
@@ -222,10 +226,11 @@ static int tegra_cpuidle_adjust_state_index(int index, unsigned int cpu)
         return index;
 }
 
-static int tegra_cpuidle_enter(struct cpuidle_device *dev,
-                               struct cpuidle_driver *drv,
-                               int index)
+static __cpuidle int tegra_cpuidle_enter(struct cpuidle_device *dev,
+                                         struct cpuidle_driver *drv,
+                                         int index)
 {
+        bool do_rcu = drv->states[index].flags & CPUIDLE_FLAG_RCU_IDLE;
         unsigned int cpu = cpu_logical_map(dev->cpu);
         int ret;
 
@@ -233,9 +238,13 @@ static int tegra_cpuidle_enter(struct cpuidle_device *dev,
         if (dev->states_usage[index].disable)
                 return -1;
 
-        if (index == TEGRA_C1)
+        if (index == TEGRA_C1) {
+                if (do_rcu)
+                        ct_cpuidle_enter();
                 ret = arm_cpuidle_simple_enter(dev, drv, index);
-        else
+                if (do_rcu)
+                        ct_cpuidle_exit();
+        } else
                 ret = tegra_cpuidle_state_enter(dev, index, cpu);
 
         if (ret < 0) {
@@ -285,7 +294,8 @@ static struct cpuidle_driver tegra_idle_driver = {
                         .exit_latency           = 2000,
                         .target_residency       = 2200,
                         .power_usage            = 100,
-                        .flags                  = CPUIDLE_FLAG_TIMER_STOP,
+                        .flags                  = CPUIDLE_FLAG_TIMER_STOP |
+                                                  CPUIDLE_FLAG_RCU_IDLE,
                         .name                   = "C7",
                         .desc                   = "CPU core powered off",
                 },
@@ -295,6 +305,7 @@ static struct cpuidle_driver tegra_idle_driver = {
                         .target_residency       = 10000,
                         .power_usage            = 0,
                         .flags                  = CPUIDLE_FLAG_TIMER_STOP |
+                                                  CPUIDLE_FLAG_RCU_IDLE |
                                                   CPUIDLE_FLAG_COUPLED,
                         .name                   = "CC6",
                         .desc                   = "CPU cluster powered off",
diff --git a/drivers/cpuidle/cpuidle.c b/drivers/cpuidle/cpuidle.c
index 6eceb1988243..0b00f21cefe3 100644
--- a/drivers/cpuidle/cpuidle.c
+++ b/drivers/cpuidle/cpuidle.c
@@ -14,6 +14,7 @@
 #include <linux/mutex.h>
 #include <linux/sched.h>
 #include <linux/sched/clock.h>
+#include <linux/sched/idle.h>
 #include <linux/notifier.h>
 #include <linux/pm_qos.h>
 #include <linux/cpu.h>
@@ -136,11 +137,13 @@ int cpuidle_find_deepest_state(struct cpuidle_driver *drv,
 }
 
 #ifdef CONFIG_SUSPEND
-static void enter_s2idle_proper(struct cpuidle_driver *drv,
-                                struct cpuidle_device *dev, int index)
+static noinstr void enter_s2idle_proper(struct cpuidle_driver *drv,
+                                        struct cpuidle_device *dev, int index)
 {
-        ktime_t time_start, time_end;
         struct cpuidle_state *target_state = &drv->states[index];
+        ktime_t time_start, time_end;
+
+        instrumentation_begin();
 
         time_start = ns_to_ktime(local_clock());
 
@@ -151,13 +154,18 @@ static void enter_s2idle_proper(struct cpuidle_driver *drv,
          * suspended is generally unsafe.
          */
         stop_critical_timings();
-        if (!(target_state->flags & CPUIDLE_FLAG_RCU_IDLE))
-                ct_idle_enter();
+        if (!(target_state->flags & CPUIDLE_FLAG_RCU_IDLE)) {
+                ct_cpuidle_enter();
+                /* Annotate away the indirect call */
+                instrumentation_begin();
+        }
         target_state->enter_s2idle(dev, drv, index);
         if (WARN_ON_ONCE(!irqs_disabled()))
-                local_irq_disable();
-        if (!(target_state->flags & CPUIDLE_FLAG_RCU_IDLE))
-                ct_idle_exit();
+                raw_local_irq_disable();
+        if (!(target_state->flags & CPUIDLE_FLAG_RCU_IDLE)) {
+                instrumentation_end();
+                ct_cpuidle_exit();
+        }
         tick_unfreeze();
         start_critical_timings();
 
@@ -165,6 +173,7 @@ static void enter_s2idle_proper(struct cpuidle_driver *drv,
         time_end = ns_to_ktime(local_clock());
         dev->states_usage[index].s2idle_time += ktime_us_delta(time_end, time_start);
         dev->states_usage[index].s2idle_usage++;
+        instrumentation_end();
 }
 
 /**
@@ -199,8 +208,9 @@ int cpuidle_enter_s2idle(struct cpuidle_driver *drv, struct cpuidle_device *dev)
  * @drv: cpuidle driver for this cpu
  * @index: index into the states table in @drv of the state to enter
  */
-int cpuidle_enter_state(struct cpuidle_device *dev, struct cpuidle_driver *drv,
-                        int index)
+noinstr int cpuidle_enter_state(struct cpuidle_device *dev,
+                                struct cpuidle_driver *drv,
+                                int index)
 {
         int entered_state;
 
@@ -208,6 +218,8 @@ int cpuidle_enter_state(struct cpuidle_device *dev, struct cpuidle_driver *drv,
         bool broadcast = !!(target_state->flags & CPUIDLE_FLAG_TIMER_STOP);
         ktime_t time_start, time_end;
 
+        instrumentation_begin();
+
         /*
          * Tell the time framework to switch to a broadcast timer because our
          * local timer will be shut down.  If a local timer is used from another
@@ -234,11 +246,33 @@ int cpuidle_enter_state(struct cpuidle_device *dev, struct cpuidle_driver *drv,
         time_start = ns_to_ktime(local_clock());
 
         stop_critical_timings();
-        if (!(target_state->flags & CPUIDLE_FLAG_RCU_IDLE))
-                ct_idle_enter();
+        if (!(target_state->flags & CPUIDLE_FLAG_RCU_IDLE)) {
+                ct_cpuidle_enter();
+                /* Annotate away the indirect call */
+                instrumentation_begin();
+        }
+
+        /*
+         * NOTE!!
+         *
+         * For cpuidle_state::enter() methods that do *NOT* set
+         * CPUIDLE_FLAG_RCU_IDLE RCU will be disabled here and these functions
+         * must be marked either noinstr or __cpuidle.
+         *
+         * For cpuidle_state::enter() methods that *DO* set
+         * CPUIDLE_FLAG_RCU_IDLE this isn't required, but they must mark the
+         * function calling ct_cpuidle_enter() as noinstr/__cpuidle and all
+         * functions called within the RCU-idle region.
+         */
         entered_state = target_state->enter(dev, drv, index);
-        if (!(target_state->flags & CPUIDLE_FLAG_RCU_IDLE))
-                ct_idle_exit();
+
+        if (WARN_ONCE(!irqs_disabled(), "%ps leaked IRQ state", target_state->enter))
+                raw_local_irq_disable();
+
+        if (!(target_state->flags & CPUIDLE_FLAG_RCU_IDLE)) {
+                instrumentation_end();
+                ct_cpuidle_exit();
+        }
         start_critical_timings();
 
         sched_clock_idle_wakeup_event();
@@ -248,12 +282,8 @@ int cpuidle_enter_state(struct cpuidle_device *dev, struct cpuidle_driver *drv,
         /* The cpu is no longer idle or about to enter idle. */
         sched_idle_set_state(NULL);
 
-        if (broadcast) {
-                if (WARN_ON_ONCE(!irqs_disabled()))
-                        local_irq_disable();
-
+        if (broadcast)
                 tick_broadcast_exit();
-        }
 
         if (!cpuidle_state_is_coupled(drv, index))
                 local_irq_enable();
@@ -305,6 +335,8 @@ int cpuidle_enter_state(struct cpuidle_device *dev, struct cpuidle_driver *drv,
                 dev->states_usage[index].rejected++;
         }
 
+        instrumentation_end();
+
         return entered_state;
 }
 
@@ -394,7 +426,7 @@ void cpuidle_reflect(struct cpuidle_device *dev, int index)
  * @dev: the cpuidle device
  *
  */
-u64 cpuidle_poll_time(struct cpuidle_driver *drv,
+__cpuidle u64 cpuidle_poll_time(struct cpuidle_driver *drv,
                       struct cpuidle_device *dev)
 {
         int i;
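The recurring noinstr + instrumentation_begin()/instrumentation_end() pairing above follows the usual kernel pattern: noinstr moves a function out of the text sections that tracing, kprobes and sanitizers may touch, and objtool then enforces that any call into instrumentable code happens inside an instrumentation_begin()/instrumentation_end() window. Reduced to a skeleton (do_traceable_work() is an illustrative placeholder, not a real kernel function):

static noinstr void example(void)
{
        /* entry code: tracing/instrumentation forbidden here */

        instrumentation_begin();
        do_traceable_work();    /* calls into normal, instrumentable code */
        instrumentation_end();

        /* exit code: instrumentation forbidden again */
}

In cpuidle_enter_state() the window is opened on entry and closed around ct_cpuidle_enter()/ct_cpuidle_exit(), because once RCU stops watching the CPU even instrumented code must not run.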
diff --git a/drivers/cpuidle/driver.c b/drivers/cpuidle/driver.c
index f70aa17e2a8e..d9cda7f6ccb9 100644
--- a/drivers/cpuidle/driver.c
+++ b/drivers/cpuidle/driver.c
@@ -183,11 +183,15 @@ static void __cpuidle_driver_init(struct cpuidle_driver *drv)
                         s->target_residency_ns = s->target_residency * NSEC_PER_USEC;
                 else if (s->target_residency_ns < 0)
                         s->target_residency_ns = 0;
+                else
+                        s->target_residency = div_u64(s->target_residency_ns, NSEC_PER_USEC);
 
                 if (s->exit_latency > 0)
                         s->exit_latency_ns = s->exit_latency * NSEC_PER_USEC;
                 else if (s->exit_latency_ns < 0)
                         s->exit_latency_ns = 0;
+                else
+                        s->exit_latency = div_u64(s->exit_latency_ns, NSEC_PER_USEC);
         }
 }
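With the two new else branches, __cpuidle_driver_init() keeps the microsecond and nanosecond fields coherent whichever one the driver filled in. A worked example with assumed values:

        /* driver supplied microseconds only: */
        s->exit_latency = 500;                  /* us */
        /* -> exit_latency_ns = 500 * NSEC_PER_USEC = 500000 ns */

        /* driver supplied nanoseconds only: */
        s->exit_latency_ns = 500000;            /* ns */
        /* -> exit_latency = div_u64(500000, NSEC_PER_USEC) = 500 us */

so code reading either representation (for instance the sysfs state attributes, which report microseconds) sees consistent values.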
diff --git a/drivers/cpuidle/dt_idle_states.c b/drivers/cpuidle/dt_idle_states.c
index 7ca3d7d9b5ea..02aa0b39af9d 100644
--- a/drivers/cpuidle/dt_idle_states.c
+++ b/drivers/cpuidle/dt_idle_states.c
@@ -77,7 +77,7 @@ static int init_state_node(struct cpuidle_state *idle_state,
         if (err)
                 desc = state_node->name;
 
-        idle_state->flags = 0;
+        idle_state->flags = CPUIDLE_FLAG_RCU_IDLE;
         if (of_property_read_bool(state_node, "local-timer-stop"))
                 idle_state->flags |= CPUIDLE_FLAG_TIMER_STOP;
         /*
diff --git a/drivers/cpuidle/governors/teo.c b/drivers/cpuidle/governors/teo.c
index d9262db79cae..987fc5f3997d 100644
--- a/drivers/cpuidle/governors/teo.c
+++ b/drivers/cpuidle/governors/teo.c
@@ -2,8 +2,13 @@
 /*
  * Timer events oriented CPU idle governor
  *
+ * TEO governor:
  * Copyright (C) 2018 - 2021 Intel Corporation
  * Author: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
+ *
+ * Util-awareness mechanism:
+ * Copyright (C) 2022 Arm Ltd.
+ * Author: Kajetan Puchalski <kajetan.puchalski@arm.com>
  */
 
 /**
@@ -99,15 +104,56 @@
  *      select the given idle state instead of the candidate one.
  *
  * 3. By default, select the candidate state.
+ *
+ * Util-awareness mechanism:
+ *
+ * The idea behind the util-awareness extension is that there are two distinct
+ * scenarios for the CPU which should result in two different approaches to idle
+ * state selection - utilized and not utilized.
+ *
+ * In this case, 'utilized' means that the average runqueue util of the CPU is
+ * above a certain threshold.
+ *
+ * When the CPU is utilized while going into idle, more likely than not it will
+ * be woken up to do more work soon and so a shallower idle state should be
+ * selected to minimise latency and maximise performance. When the CPU is not
+ * being utilized, the usual metrics-based approach to selecting the deepest
+ * available idle state should be preferred to take advantage of the power
+ * saving.
+ *
+ * In order to achieve this, the governor uses a utilization threshold.
+ * The threshold is computed per-CPU as a percentage of the CPU's capacity
+ * by bit shifting the capacity value. Based on testing, the shift of 6 (~1.56%)
+ * seems to be getting the best results.
+ *
+ * Before selecting the next idle state, the governor compares the current CPU
+ * util to the precomputed util threshold. If it's below, it defaults to the
+ * TEO metrics mechanism. If it's above, the closest shallower idle state will
+ * be selected instead, as long as is not a polling state.
  */
 
 #include <linux/cpuidle.h>
 #include <linux/jiffies.h>
 #include <linux/kernel.h>
+#include <linux/sched.h>
 #include <linux/sched/clock.h>
+#include <linux/sched/topology.h>
 #include <linux/tick.h>
 
 /*
+ * The number of bits to shift the CPU's capacity by in order to determine
+ * the utilized threshold.
+ *
+ * 6 was chosen based on testing as the number that achieved the best balance
+ * of power and performance on average.
+ *
+ * The resulting threshold is high enough to not be triggered by background
+ * noise and low enough to react quickly when activity starts to ramp up.
+ */
+#define UTIL_THRESHOLD_SHIFT 6
+
+
+/*
  * The PULSE value is added to metrics when they grow and the DECAY_SHIFT value
  * is used for decreasing metrics on a regular basis.
  */
@@ -137,9 +183,11 @@ struct teo_bin {
  * @time_span_ns: Time between idle state selection and post-wakeup update.
  * @sleep_length_ns: Time till the closest timer event (at the selection time).
  * @state_bins: Idle state data bins for this CPU.
- * @total: Grand total of the "intercepts" and "hits" mertics for all bins.
+ * @total: Grand total of the "intercepts" and "hits" metrics for all bins.
  * @next_recent_idx: Index of the next @recent_idx entry to update.
 * @recent_idx: Indices of bins corresponding to recent "intercepts".
+ * @util_threshold: Threshold above which the CPU is considered utilized
+ * @utilized: Whether the last sleep on the CPU happened while utilized
 */
 struct teo_cpu {
         s64 time_span_ns;
@@ -148,11 +196,30 @@ struct teo_cpu {
         unsigned int total;
         int next_recent_idx;
         int recent_idx[NR_RECENT];
+        unsigned long util_threshold;
+        bool utilized;
 };
 
 static DEFINE_PER_CPU(struct teo_cpu, teo_cpus);
 
 /**
+ * teo_cpu_is_utilized - Check if the CPU's util is above the threshold
+ * @cpu: Target CPU
+ * @cpu_data: Governor CPU data for the target CPU
+ */
+#ifdef CONFIG_SMP
+static bool teo_cpu_is_utilized(int cpu, struct teo_cpu *cpu_data)
+{
+        return sched_cpu_util(cpu) > cpu_data->util_threshold;
+}
+#else
+static bool teo_cpu_is_utilized(int cpu, struct teo_cpu *cpu_data)
+{
+        return false;
+}
+#endif
+
+/**
  * teo_update - Update CPU metrics after wakeup.
  * @drv: cpuidle driver containing state data.
  * @dev: Target CPU.
@@ -258,15 +325,17 @@ static s64 teo_middle_of_bin(int idx, struct cpuidle_driver *drv)
  * @dev: Target CPU.
 * @state_idx: Index of the capping idle state.
 * @duration_ns: Idle duration value to match.
+ * @no_poll: Don't consider polling states.
 */
 static int teo_find_shallower_state(struct cpuidle_driver *drv,
                                     struct cpuidle_device *dev, int state_idx,
-                                    s64 duration_ns)
+                                    s64 duration_ns, bool no_poll)
 {
         int i;
 
         for (i = state_idx - 1; i >= 0; i--) {
-                if (dev->states_usage[i].disable)
+                if (dev->states_usage[i].disable ||
+                    (no_poll && drv->states[i].flags & CPUIDLE_FLAG_POLLING))
                         continue;
 
                 state_idx = i;
@@ -321,6 +390,22 @@ static int teo_select(struct cpuidle_driver *drv, struct cpuidle_device *dev,
                 goto end;
         }
 
+        cpu_data->utilized = teo_cpu_is_utilized(dev->cpu, cpu_data);
+        /*
+         * If the CPU is being utilized over the threshold and there are only 2
+         * states to choose from, the metrics need not be considered, so choose
+         * the shallowest non-polling state and exit.
+         */
+        if (drv->state_count < 3 && cpu_data->utilized) {
+                for (i = 0; i < drv->state_count; ++i) {
+                        if (!dev->states_usage[i].disable &&
+                            !(drv->states[i].flags & CPUIDLE_FLAG_POLLING)) {
+                                idx = i;
+                                goto end;
+                        }
+                }
+        }
+
         /*
          * Find the deepest idle state whose target residency does not exceed
          * the current sleep length and the deepest idle state not deeper than
@@ -452,6 +537,13 @@ static int teo_select(struct cpuidle_driver *drv, struct cpuidle_device *dev,
         if (idx > constraint_idx)
                 idx = constraint_idx;
 
+        /*
+         * If the CPU is being utilized over the threshold, choose a shallower
+         * non-polling state to improve latency
+         */
+        if (cpu_data->utilized)
+                idx = teo_find_shallower_state(drv, dev, idx, duration_ns, true);
+
 end:
         /*
          * Don't stop the tick if the selected state is a polling one or if the
@@ -469,7 +561,7 @@ end:
                  */
                 if (idx > idx0 &&
                     drv->states[idx].target_residency_ns > delta_tick)
-                        idx = teo_find_shallower_state(drv, dev, idx, delta_tick);
+                        idx = teo_find_shallower_state(drv, dev, idx, delta_tick, false);
         }
 
         return idx;
@@ -508,9 +600,11 @@ static int teo_enable_device(struct cpuidle_driver *drv,
                              struct cpuidle_device *dev)
 {
         struct teo_cpu *cpu_data = per_cpu_ptr(&teo_cpus, dev->cpu);
+        unsigned long max_capacity = arch_scale_cpu_capacity(dev->cpu);
         int i;
 
         memset(cpu_data, 0, sizeof(*cpu_data));
+        cpu_data->util_threshold = max_capacity >> UTIL_THRESHOLD_SHIFT;
 
         for (i = 0; i < NR_RECENT; i++)
                 cpu_data->recent_idx[i] = -1;
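A worked example of the utilization threshold arithmetic used in teo_enable_device(): for a CPU whose arch_scale_cpu_capacity() is the common maximum of 1024,

        util_threshold = 1024 >> UTIL_THRESHOLD_SHIFT;  /* 1024 >> 6 = 16 */

i.e. about 1.56% of capacity, matching the percentage quoted in the comment above. For a hypothetical little core with capacity 446 (an arbitrary illustrative value), the threshold would be 446 >> 6 = 6, since the shift rounds down.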
diff --git a/drivers/cpuidle/poll_state.c b/drivers/cpuidle/poll_state.c
index f7e83613ae94..bdcfeaecd228 100644
--- a/drivers/cpuidle/poll_state.c
+++ b/drivers/cpuidle/poll_state.c
@@ -13,11 +13,13 @@
 static int __cpuidle poll_idle(struct cpuidle_device *dev,
                                struct cpuidle_driver *drv, int index)
 {
-        u64 time_start = local_clock();
+        u64 time_start;
+
+        time_start = local_clock();
 
         dev->poll_time_limit = false;
 
-        local_irq_enable();
+        raw_local_irq_enable();
         if (!current_set_polling_and_test()) {
                 unsigned int loop_count = 0;
                 u64 limit;
@@ -36,6 +38,8 @@ static int __cpuidle poll_idle(struct cpuidle_device *dev,
                         }
                 }
         }
+        raw_local_irq_disable();
+
         current_clr_polling();
 
         return index;
diff --git a/drivers/cpuidle/sysfs.c b/drivers/cpuidle/sysfs.c
index 2b496a53cbca..48948b171749 100644
--- a/drivers/cpuidle/sysfs.c
+++ b/drivers/cpuidle/sysfs.c
@@ -200,7 +200,7 @@ static void cpuidle_sysfs_release(struct kobject *kobj)
         complete(&kdev->kobj_unregister);
 }
 
-static struct kobj_type ktype_cpuidle = {
+static const struct kobj_type ktype_cpuidle = {
         .sysfs_ops = &cpuidle_sysfs_ops,
         .release = cpuidle_sysfs_release,
 };
@@ -447,7 +447,7 @@ static void cpuidle_state_sysfs_release(struct kobject *kobj)
         complete(&state_obj->kobj_unregister);
 }
 
-static struct kobj_type ktype_state_cpuidle = {
+static const struct kobj_type ktype_state_cpuidle = {
         .sysfs_ops = &cpuidle_state_sysfs_ops,
         .default_groups = cpuidle_state_default_groups,
         .release = cpuidle_state_sysfs_release,
@@ -594,7 +594,7 @@ static struct attribute *cpuidle_driver_default_attrs[] = {
 };
 ATTRIBUTE_GROUPS(cpuidle_driver_default);
 
-static struct kobj_type ktype_driver_cpuidle = {
+static const struct kobj_type ktype_driver_cpuidle = {
         .sysfs_ops = &cpuidle_driver_sysfs_ops,
         .default_groups = cpuidle_driver_default_groups,
         .release = cpuidle_driver_sysfs_release,