1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
|
// SPDX-License-Identifier: GPL-2.0
/*
* Timer events oriented CPU idle governor
*
* Copyright (C) 2018 Intel Corporation
* Author: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
*
* The idea of this governor is based on the observation that on many systems
* timer events are two or more orders of magnitude more frequent than any
* other interrupts, so they are likely to be the most significant source of CPU
* wakeups from idle states. Moreover, information about what happened in the
* (relatively recent) past can be used to estimate whether or not the deepest
* idle state with target residency within the time to the closest timer is
* likely to be suitable for the upcoming idle time of the CPU and, if not, then
* which of the shallower idle states to choose.
*
* Of course, non-timer wakeup sources are more important in some use cases and
* they can be covered by taking a few most recent idle time intervals of the
* CPU into account. However, even in that case it is not necessary to consider
* idle duration values greater than the time till the closest timer, as the
* patterns that they may belong to produce average values close enough to
* the time till the closest timer (sleep length) anyway.
*
* Thus this governor estimates whether or not the upcoming idle time of the CPU
* is likely to be significantly shorter than the sleep length and selects an
* idle state for it in accordance with that, as follows:
*
* - Find an idle state on the basis of the sleep length and state statistics
* collected over time:
*
* o Find the deepest idle state whose target residency is less than or equal
* to the sleep length.
*
* o Select it if it matched both the sleep length and the observed idle
* duration in the past more often than it matched the sleep length alone
* (i.e. the observed idle duration was significantly shorter than the sleep
* length matched by it).
*
* o Otherwise, select the shallower state with the greatest matched "early"
* wakeups metric.
*
* - If the majority of the most recent idle duration values are below the
* target residency of the idle state selected so far, use those values to
* compute the new expected idle duration and find an idle state matching it
* (which has to be shallower than the one selected so far).
*/
#include <linux/cpuidle.h>
#include <linux/jiffies.h>
#include <linux/kernel.h>
#include <linux/sched/clock.h>
#include <linux/tick.h>
/*
* The PULSE value is added to metrics when they grow and the DECAY_SHIFT value
* is used for decreasing metrics on a regular basis.
*/
#define PULSE 1024
#define DECAY_SHIFT 3
/*
* Number of the most recent idle duration values to take into consideration for
* the detection of wakeup patterns.
*/
#define INTERVALS 8
/**
* struct teo_idle_state - Idle state data used by the TEO cpuidle governor.
* @early_hits: "Early" CPU wakeups "matching" this state.
* @hits: "On time" CPU wakeups "matching" this state.
* @misses: CPU wakeups "missing" this state.
*
* A CPU wakeup is "matched" by a given idle state if the idle duration measured
* after the wakeup is between the target residency of that state and the target
* residency of the next one (or if this is the deepest available idle state, it
* "matches" a CPU wakeup when the measured idle duration is at least equal to
* its target residency).
*
* Also, from the TEO governor perspective, a CPU wakeup from idle is "early" if
* it occurs significantly earlier than the closest expected timer event (that
* is, early enough to match an idle state shallower than the one matching the
* time till the closest timer event). Otherwise, the wakeup is "on time", or
* it is a "hit".
*
* A "miss" occurs when the given state doesn't match the wakeup, but it matches
* the time till the closest timer event used for idle state selection.
*/
struct teo_idle_state {
unsigned int early_hits;
unsigned int hits;
unsigned int misses;
};
/**
* struct teo_cpu - CPU data used by the TEO cpuidle governor.
* @time_span_ns: Time between idle state selection and post-wakeup update.
* @sleep_length_ns: Time till the closest timer event (at the selection time).
* @states: Idle states data corresponding to this CPU.
* @interval_idx: Index of the most recent saved idle interval.
* @intervals: Saved idle duration values.
*/
struct teo_cpu {
u64 time_span_ns;
u64 sleep_length_ns;
struct teo_idle_state states[CPUIDLE_STATE_MAX];
int interval_idx;
unsigned int intervals[INTERVALS];
};
static DEFINE_PER_CPU(struct teo_cpu, teo_cpus);
/**
* teo_update - Update CPU data after wakeup.
* @drv: cpuidle driver containing state data.
* @dev: Target CPU.
*/
static void teo_update(struct cpuidle_driver *drv, struct cpuidle_device *dev)
{
struct teo_cpu *cpu_data = per_cpu_ptr(&teo_cpus, dev->cpu);
unsigned int sleep_length_us = ktime_to_us(cpu_data->sleep_length_ns);
int i, idx_hit = -1, idx_timer = -1;
unsigned int measured_us;
if (cpu_data->time_span_ns >= cpu_data->sleep_length_ns) {
/*
* One of the safety nets has triggered or the wakeup was close
* enough to the closest timer event expected at the idle state
* selection time to be discarded.
*/
measured_us = UINT_MAX;
} else {
unsigned int lat;
lat = drv->states[dev->last_state_idx].exit_latency;
measured_us = ktime_to_us(cpu_data->time_span_ns);
/*
* The delay between the wakeup and the first instruction
* executed by the CPU is not likely to be worst-case every
* time, so take 1/2 of the exit latency as a very rough
* approximation of the average of it.
*/
if (measured_us >= lat)
measured_us -= lat / 2;
else
measured_us /= 2;
}
/*
* Decay the "early hits" metric for all of the states and find the
* states matching the sleep length and the measured idle duration.
*/
for (i = 0; i < drv->state_count; i++) {
unsigned int early_hits = cpu_data->states[i].early_hits;
cpu_data->states[i].early_hits -= early_hits >> DECAY_SHIFT;
if (drv->states[i].target_residency <= sleep_length_us) {
idx_timer = i;
if (drv->states[i].target_residency <= measured_us)
idx_hit = i;
}
}
/*
* Update the "hits" and "misses" data for the state matching the sleep
* length. If it matches the measured idle duration too, this is a hit,
* so increase the "hits" metric for it then. Otherwise, this is a
* miss, so increase the "misses" metric for it. In the latter case
* also increase the "early hits" metric for the state that actually
* matches the measured idle duration.
*/
if (idx_timer >= 0) {
unsigned int hits = cpu_data->states[idx_timer].hits;
unsigned int misses = cpu_data->states[idx_timer].misses;
hits -= hits >> DECAY_SHIFT;
misses -= misses >> DECAY_SHIFT;
if (idx_timer > idx_hit) {
misses += PULSE;
if (idx_hit >= 0)
cpu_data->states[idx_hit].early_hits += PULSE;
} else {
hits += PULSE;
}
cpu_data->states[idx_timer].misses = misses;
cpu_data->states[idx_timer].hits = hits;
}
/*
* Save idle duration values corresponding to non-timer wakeups for
* pattern detection.
*/
cpu_data->intervals[cpu_data->interval_idx++] = measured_us;
if (cpu_data->interval_idx > INTERVALS)
cpu_data->interval_idx = 0;
}
/**
* teo_find_shallower_state - Find shallower idle state matching given duration.
* @drv: cpuidle driver containing state data.
* @dev: Target CPU.
* @state_idx: Index of the capping idle state.
* @duration_us: Idle duration value to match.
*/
static int teo_find_shallower_state(struct cpuidle_driver *drv,
struct cpuidle_device *dev, int state_idx,
unsigned int duration_us)
{
int i;
for (i = state_idx - 1; i >= 0; i--) {
if (drv->states[i].disabled || dev->states_usage[i].disable)
continue;
state_idx = i;
if (drv->states[i].target_residency <= duration_us)
break;
}
return state_idx;
}
/**
* teo_select - Selects the next idle state to enter.
* @drv: cpuidle driver containing state data.
* @dev: Target CPU.
* @stop_tick: Indication on whether or not to stop the scheduler tick.
*/
static int teo_select(struct cpuidle_driver *drv, struct cpuidle_device *dev,
bool *stop_tick)
{
struct teo_cpu *cpu_data = per_cpu_ptr(&teo_cpus, dev->cpu);
int latency_req = cpuidle_governor_latency_req(dev->cpu);
unsigned int duration_us, hits, misses, early_hits;
int max_early_idx, constraint_idx, idx, i;
ktime_t delta_tick;
if (dev->last_state_idx >= 0) {
teo_update(drv, dev);
dev->last_state_idx = -1;
}
cpu_data->time_span_ns = local_clock();
cpu_data->sleep_length_ns = tick_nohz_get_sleep_length(&delta_tick);
duration_us = ktime_to_us(cpu_data->sleep_length_ns);
hits = 0;
misses = 0;
early_hits = 0;
max_early_idx = -1;
constraint_idx = drv->state_count;
idx = -1;
for (i = 0; i < drv->state_count; i++) {
struct cpuidle_state *s = &drv->states[i];
struct cpuidle_state_usage *su = &dev->states_usage[i];
if (s->disabled || su->disable) {
/*
* Ignore disabled states with target residencies beyond
* the anticipated idle duration.
*/
if (s->target_residency > duration_us)
continue;
/*
* This state is disabled, so the range of idle duration
* values corresponding to it is covered by the current
* candidate state, but still the "hits" and "misses"
* metrics of the disabled state need to be used to
* decide whether or not the state covering the range in
* question is good enough.
*/
hits = cpu_data->states[i].hits;
misses = cpu_data->states[i].misses;
if (early_hits >= cpu_data->states[i].early_hits ||
idx < 0)
continue;
/*
* If the current candidate state has been the one with
* the maximum "early hits" metric so far, the "early
* hits" metric of the disabled state replaces the
* current "early hits" count to avoid selecting a
* deeper state with lower "early hits" metric.
*/
if (max_early_idx == idx) {
early_hits = cpu_data->states[i].early_hits;
continue;
}
/*
* The current candidate state is closer to the disabled
* one than the current maximum "early hits" state, so
* replace the latter with it, but in case the maximum
* "early hits" state index has not been set so far,
* check if the current candidate state is not too
* shallow for that role.
*/
if (!(tick_nohz_tick_stopped() &&
drv->states[idx].target_residency < TICK_USEC)) {
early_hits = cpu_data->states[i].early_hits;
max_early_idx = idx;
}
continue;
}
if (idx < 0) {
idx = i; /* first enabled state */
hits = cpu_data->states[i].hits;
misses = cpu_data->states[i].misses;
}
if (s->target_residency > duration_us)
break;
if (s->exit_latency > latency_req && constraint_idx > i)
constraint_idx = i;
idx = i;
hits = cpu_data->states[i].hits;
misses = cpu_data->states[i].misses;
if (early_hits < cpu_data->states[i].early_hits &&
!(tick_nohz_tick_stopped() &&
drv->states[i].target_residency < TICK_USEC)) {
early_hits = cpu_data->states[i].early_hits;
max_early_idx = i;
}
}
/*
* If the "hits" metric of the idle state matching the sleep length is
* greater than its "misses" metric, that is the one to use. Otherwise,
* it is more likely that one of the shallower states will match the
* idle duration observed after wakeup, so take the one with the maximum
* "early hits" metric, but if that cannot be determined, just use the
* state selected so far.
*/
if (hits <= misses && max_early_idx >= 0) {
idx = max_early_idx;
duration_us = drv->states[idx].target_residency;
}
/*
* If there is a latency constraint, it may be necessary to use a
* shallower idle state than the one selected so far.
*/
if (constraint_idx < idx)
idx = constraint_idx;
if (idx < 0) {
idx = 0; /* No states enabled. Must use 0. */
} else if (idx > 0) {
unsigned int count = 0;
u64 sum = 0;
/*
* Count and sum the most recent idle duration values less than
* the current expected idle duration value.
*/
for (i = 0; i < INTERVALS; i++) {
unsigned int val = cpu_data->intervals[i];
if (val >= duration_us)
continue;
count++;
sum += val;
}
/*
* Give up unless the majority of the most recent idle duration
* values are in the interesting range.
*/
if (count > INTERVALS / 2) {
unsigned int avg_us = div64_u64(sum, count);
/*
* Avoid spending too much time in an idle state that
* would be too shallow.
*/
if (!(tick_nohz_tick_stopped() && avg_us < TICK_USEC)) {
duration_us = avg_us;
if (drv->states[idx].target_residency > avg_us)
idx = teo_find_shallower_state(drv, dev,
idx, avg_us);
}
}
}
/*
* Don't stop the tick if the selected state is a polling one or if the
* expected idle duration is shorter than the tick period length.
*/
if (((drv->states[idx].flags & CPUIDLE_FLAG_POLLING) ||
duration_us < TICK_USEC) && !tick_nohz_tick_stopped()) {
unsigned int delta_tick_us = ktime_to_us(delta_tick);
*stop_tick = false;
/*
* The tick is not going to be stopped, so if the target
* residency of the state to be returned is not within the time
* till the closest timer including the tick, try to correct
* that.
*/
if (idx > 0 && drv->states[idx].target_residency > delta_tick_us)
idx = teo_find_shallower_state(drv, dev, idx, delta_tick_us);
}
return idx;
}
/**
* teo_reflect - Note that governor data for the CPU need to be updated.
* @dev: Target CPU.
* @state: Entered state.
*/
static void teo_reflect(struct cpuidle_device *dev, int state)
{
struct teo_cpu *cpu_data = per_cpu_ptr(&teo_cpus, dev->cpu);
dev->last_state_idx = state;
/*
* If the wakeup was not "natural", but triggered by one of the safety
* nets, assume that the CPU might have been idle for the entire sleep
* length time.
*/
if (dev->poll_time_limit ||
(tick_nohz_idle_got_tick() && cpu_data->sleep_length_ns > TICK_NSEC)) {
dev->poll_time_limit = false;
cpu_data->time_span_ns = cpu_data->sleep_length_ns;
} else {
cpu_data->time_span_ns = local_clock() - cpu_data->time_span_ns;
}
}
/**
* teo_enable_device - Initialize the governor's data for the target CPU.
* @drv: cpuidle driver (not used).
* @dev: Target CPU.
*/
static int teo_enable_device(struct cpuidle_driver *drv,
struct cpuidle_device *dev)
{
struct teo_cpu *cpu_data = per_cpu_ptr(&teo_cpus, dev->cpu);
int i;
memset(cpu_data, 0, sizeof(*cpu_data));
for (i = 0; i < INTERVALS; i++)
cpu_data->intervals[i] = UINT_MAX;
return 0;
}
static struct cpuidle_governor teo_governor = {
.name = "teo",
.rating = 19,
.enable = teo_enable_device,
.select = teo_select,
.reflect = teo_reflect,
};
static int __init teo_governor_init(void)
{
return cpuidle_register_governor(&teo_governor);
}
postcore_initcall(teo_governor_init);
|