1 // SPDX-License-Identifier: GPL-2.0-only
3 * intel_idle.c - native hardware idle loop for modern Intel processors
5 * Copyright (c) 2013, Intel Corporation.
6 * Len Brown <len.brown@intel.com>
10 * intel_idle is a cpuidle driver that loads on specific Intel processors
11 * in lieu of the legacy ACPI processor_idle driver. The intent is to
12 * make Linux more efficient on these processors, as intel_idle knows
13 * more than ACPI, as well as make Linux more immune to ACPI BIOS bugs.
19 * All CPUs have same idle states as boot CPU
21 * Chipset BM_STS (bus master status) bit is a NOP
22 * for preventing entry into deep C-states
28 * The driver currently initializes for_each_online_cpu() upon modprobe.
29 * It is unaware of subsequent processors hot-added to the system.
30 * This means that if you boot with maxcpus=n and later online
31 * processors above n, those processors will use C1 only.
33 * ACPI has a .suspend hack to turn off deep c-states during suspend
34 * to avoid complications with the lapic timer workaround.
35 * Have not seen issues with suspend, but may need same workaround here.
39 /* un-comment DEBUG to enable pr_debug() statements */
42 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
44 #include <linux/kernel.h>
45 #include <linux/cpuidle.h>
46 #include <linux/tick.h>
47 #include <trace/events/power.h>
48 #include <linux/sched.h>
49 #include <linux/sched/smt.h>
50 #include <linux/notifier.h>
51 #include <linux/cpu.h>
52 #include <linux/moduleparam.h>
53 #include <asm/cpu_device_id.h>
54 #include <asm/intel-family.h>
55 #include <asm/nospec-branch.h>
56 #include <asm/mwait.h>
59 #define INTEL_IDLE_VERSION "0.4.1"
61 static struct cpuidle_driver intel_idle_driver = {
65 /* intel_idle.max_cstate=0 disables driver */
66 static int max_cstate = CPUIDLE_STATE_MAX - 1;
68 static unsigned int mwait_substates;
70 #define LAPIC_TIMER_ALWAYS_RELIABLE 0xFFFFFFFF
71 /* Reliable LAPIC Timer States, bit 1 for C1 etc. */
72 static unsigned int lapic_timer_reliable_states = (1 << 1); /* Default to only C1 */
75 struct cpuidle_state *state_table;
78 * Hardware C-state auto-demotion may not always be optimal.
79 * Indicate which enable bits to clear here.
81 unsigned long auto_demotion_disable_flags;
82 bool byt_auto_demotion_disable_flag;
83 bool disable_promotion_to_c1e;
86 static const struct idle_cpu *icpu;
87 static struct cpuidle_device __percpu *intel_idle_cpuidle_devices;
88 static int intel_idle(struct cpuidle_device *dev,
89 struct cpuidle_driver *drv, int index);
90 static void intel_idle_s2idle(struct cpuidle_device *dev,
91 struct cpuidle_driver *drv, int index);
92 static struct cpuidle_state *cpuidle_state_table;
95 * Set this flag for states where the HW flushes the TLB for us
96 * and so we don't need cross-calls to keep it consistent.
97 * If this flag is set, SW flushes the TLB, so even if the
98 * HW doesn't do the flushing, this flag is safe to use.
100 #define CPUIDLE_FLAG_TLB_FLUSHED 0x10000
103 * Disable IBRS across idle (when KERNEL_IBRS), is exclusive vs IRQ_ENABLE
106 #define CPUIDLE_FLAG_IBRS BIT(16)
109 * MWAIT takes an 8-bit "hint" in EAX "suggesting"
110 * the C-state (top nibble) and sub-state (bottom nibble)
111 * 0x00 means "MWAIT(C1)", 0x10 means "MWAIT(C2)" etc.
113 * We store the hint at the top of our "flags" for each state.
115 #define flg2MWAIT(flags) (((flags) >> 24) & 0xFF)
116 #define MWAIT2flg(eax) ((eax & 0xFF) << 24)
118 static __cpuidle int intel_idle_ibrs(struct cpuidle_device *dev,
119 struct cpuidle_driver *drv, int index)
121 bool smt_active = sched_smt_active();
122 u64 spec_ctrl = spec_ctrl_current();
126 wrmsrl(MSR_IA32_SPEC_CTRL, 0);
128 ret = intel_idle(dev, drv, index);
131 wrmsrl(MSR_IA32_SPEC_CTRL, spec_ctrl);
137 * States are indexed by the cstate number,
138 * which is also the index into the MWAIT hint array.
139 * Thus C0 is a dummy.
141 static struct cpuidle_state nehalem_cstates[] = {
144 .desc = "MWAIT 0x00",
145 .flags = MWAIT2flg(0x00),
147 .target_residency = 6,
148 .enter = &intel_idle,
149 .enter_s2idle = intel_idle_s2idle, },
152 .desc = "MWAIT 0x01",
153 .flags = MWAIT2flg(0x01),
155 .target_residency = 20,
156 .enter = &intel_idle,
157 .enter_s2idle = intel_idle_s2idle, },
160 .desc = "MWAIT 0x10",
161 .flags = MWAIT2flg(0x10) | CPUIDLE_FLAG_TLB_FLUSHED,
163 .target_residency = 80,
164 .enter = &intel_idle,
165 .enter_s2idle = intel_idle_s2idle, },
168 .desc = "MWAIT 0x20",
169 .flags = MWAIT2flg(0x20) | CPUIDLE_FLAG_TLB_FLUSHED,
171 .target_residency = 800,
172 .enter = &intel_idle,
173 .enter_s2idle = intel_idle_s2idle, },
178 static struct cpuidle_state snb_cstates[] = {
181 .desc = "MWAIT 0x00",
182 .flags = MWAIT2flg(0x00),
184 .target_residency = 2,
185 .enter = &intel_idle,
186 .enter_s2idle = intel_idle_s2idle, },
189 .desc = "MWAIT 0x01",
190 .flags = MWAIT2flg(0x01),
192 .target_residency = 20,
193 .enter = &intel_idle,
194 .enter_s2idle = intel_idle_s2idle, },
197 .desc = "MWAIT 0x10",
198 .flags = MWAIT2flg(0x10) | CPUIDLE_FLAG_TLB_FLUSHED,
200 .target_residency = 211,
201 .enter = &intel_idle,
202 .enter_s2idle = intel_idle_s2idle, },
205 .desc = "MWAIT 0x20",
206 .flags = MWAIT2flg(0x20) | CPUIDLE_FLAG_TLB_FLUSHED,
208 .target_residency = 345,
209 .enter = &intel_idle,
210 .enter_s2idle = intel_idle_s2idle, },
213 .desc = "MWAIT 0x30",
214 .flags = MWAIT2flg(0x30) | CPUIDLE_FLAG_TLB_FLUSHED,
216 .target_residency = 345,
217 .enter = &intel_idle,
218 .enter_s2idle = intel_idle_s2idle, },
223 static struct cpuidle_state byt_cstates[] = {
226 .desc = "MWAIT 0x00",
227 .flags = MWAIT2flg(0x00),
229 .target_residency = 1,
230 .enter = &intel_idle,
231 .enter_s2idle = intel_idle_s2idle, },
234 .desc = "MWAIT 0x58",
235 .flags = MWAIT2flg(0x58) | CPUIDLE_FLAG_TLB_FLUSHED,
237 .target_residency = 275,
238 .enter = &intel_idle,
239 .enter_s2idle = intel_idle_s2idle, },
242 .desc = "MWAIT 0x52",
243 .flags = MWAIT2flg(0x52) | CPUIDLE_FLAG_TLB_FLUSHED,
245 .target_residency = 560,
246 .enter = &intel_idle,
247 .enter_s2idle = intel_idle_s2idle, },
250 .desc = "MWAIT 0x60",
251 .flags = MWAIT2flg(0x60) | CPUIDLE_FLAG_TLB_FLUSHED,
252 .exit_latency = 1200,
253 .target_residency = 4000,
254 .enter = &intel_idle,
255 .enter_s2idle = intel_idle_s2idle, },
258 .desc = "MWAIT 0x64",
259 .flags = MWAIT2flg(0x64) | CPUIDLE_FLAG_TLB_FLUSHED,
260 .exit_latency = 10000,
261 .target_residency = 20000,
262 .enter = &intel_idle,
263 .enter_s2idle = intel_idle_s2idle, },
268 static struct cpuidle_state cht_cstates[] = {
271 .desc = "MWAIT 0x00",
272 .flags = MWAIT2flg(0x00),
274 .target_residency = 1,
275 .enter = &intel_idle,
276 .enter_s2idle = intel_idle_s2idle, },
279 .desc = "MWAIT 0x58",
280 .flags = MWAIT2flg(0x58) | CPUIDLE_FLAG_TLB_FLUSHED,
282 .target_residency = 275,
283 .enter = &intel_idle,
284 .enter_s2idle = intel_idle_s2idle, },
287 .desc = "MWAIT 0x52",
288 .flags = MWAIT2flg(0x52) | CPUIDLE_FLAG_TLB_FLUSHED,
290 .target_residency = 560,
291 .enter = &intel_idle,
292 .enter_s2idle = intel_idle_s2idle, },
295 .desc = "MWAIT 0x60",
296 .flags = MWAIT2flg(0x60) | CPUIDLE_FLAG_TLB_FLUSHED,
297 .exit_latency = 1200,
298 .target_residency = 4000,
299 .enter = &intel_idle,
300 .enter_s2idle = intel_idle_s2idle, },
303 .desc = "MWAIT 0x64",
304 .flags = MWAIT2flg(0x64) | CPUIDLE_FLAG_TLB_FLUSHED,
305 .exit_latency = 10000,
306 .target_residency = 20000,
307 .enter = &intel_idle,
308 .enter_s2idle = intel_idle_s2idle, },
313 static struct cpuidle_state ivb_cstates[] = {
316 .desc = "MWAIT 0x00",
317 .flags = MWAIT2flg(0x00),
319 .target_residency = 1,
320 .enter = &intel_idle,
321 .enter_s2idle = intel_idle_s2idle, },
324 .desc = "MWAIT 0x01",
325 .flags = MWAIT2flg(0x01),
327 .target_residency = 20,
328 .enter = &intel_idle,
329 .enter_s2idle = intel_idle_s2idle, },
332 .desc = "MWAIT 0x10",
333 .flags = MWAIT2flg(0x10) | CPUIDLE_FLAG_TLB_FLUSHED,
335 .target_residency = 156,
336 .enter = &intel_idle,
337 .enter_s2idle = intel_idle_s2idle, },
340 .desc = "MWAIT 0x20",
341 .flags = MWAIT2flg(0x20) | CPUIDLE_FLAG_TLB_FLUSHED,
343 .target_residency = 300,
344 .enter = &intel_idle,
345 .enter_s2idle = intel_idle_s2idle, },
348 .desc = "MWAIT 0x30",
349 .flags = MWAIT2flg(0x30) | CPUIDLE_FLAG_TLB_FLUSHED,
351 .target_residency = 300,
352 .enter = &intel_idle,
353 .enter_s2idle = intel_idle_s2idle, },
358 static struct cpuidle_state ivt_cstates[] = {
361 .desc = "MWAIT 0x00",
362 .flags = MWAIT2flg(0x00),
364 .target_residency = 1,
365 .enter = &intel_idle,
366 .enter_s2idle = intel_idle_s2idle, },
369 .desc = "MWAIT 0x01",
370 .flags = MWAIT2flg(0x01),
372 .target_residency = 80,
373 .enter = &intel_idle,
374 .enter_s2idle = intel_idle_s2idle, },
377 .desc = "MWAIT 0x10",
378 .flags = MWAIT2flg(0x10) | CPUIDLE_FLAG_TLB_FLUSHED,
380 .target_residency = 156,
381 .enter = &intel_idle,
382 .enter_s2idle = intel_idle_s2idle, },
385 .desc = "MWAIT 0x20",
386 .flags = MWAIT2flg(0x20) | CPUIDLE_FLAG_TLB_FLUSHED,
388 .target_residency = 300,
389 .enter = &intel_idle,
390 .enter_s2idle = intel_idle_s2idle, },
395 static struct cpuidle_state ivt_cstates_4s[] = {
398 .desc = "MWAIT 0x00",
399 .flags = MWAIT2flg(0x00),
401 .target_residency = 1,
402 .enter = &intel_idle,
403 .enter_s2idle = intel_idle_s2idle, },
406 .desc = "MWAIT 0x01",
407 .flags = MWAIT2flg(0x01),
409 .target_residency = 250,
410 .enter = &intel_idle,
411 .enter_s2idle = intel_idle_s2idle, },
414 .desc = "MWAIT 0x10",
415 .flags = MWAIT2flg(0x10) | CPUIDLE_FLAG_TLB_FLUSHED,
417 .target_residency = 300,
418 .enter = &intel_idle,
419 .enter_s2idle = intel_idle_s2idle, },
422 .desc = "MWAIT 0x20",
423 .flags = MWAIT2flg(0x20) | CPUIDLE_FLAG_TLB_FLUSHED,
425 .target_residency = 400,
426 .enter = &intel_idle,
427 .enter_s2idle = intel_idle_s2idle, },
432 static struct cpuidle_state ivt_cstates_8s[] = {
435 .desc = "MWAIT 0x00",
436 .flags = MWAIT2flg(0x00),
438 .target_residency = 1,
439 .enter = &intel_idle,
440 .enter_s2idle = intel_idle_s2idle, },
443 .desc = "MWAIT 0x01",
444 .flags = MWAIT2flg(0x01),
446 .target_residency = 500,
447 .enter = &intel_idle,
448 .enter_s2idle = intel_idle_s2idle, },
451 .desc = "MWAIT 0x10",
452 .flags = MWAIT2flg(0x10) | CPUIDLE_FLAG_TLB_FLUSHED,
454 .target_residency = 600,
455 .enter = &intel_idle,
456 .enter_s2idle = intel_idle_s2idle, },
459 .desc = "MWAIT 0x20",
460 .flags = MWAIT2flg(0x20) | CPUIDLE_FLAG_TLB_FLUSHED,
462 .target_residency = 700,
463 .enter = &intel_idle,
464 .enter_s2idle = intel_idle_s2idle, },
469 static struct cpuidle_state hsw_cstates[] = {
472 .desc = "MWAIT 0x00",
473 .flags = MWAIT2flg(0x00),
475 .target_residency = 2,
476 .enter = &intel_idle,
477 .enter_s2idle = intel_idle_s2idle, },
480 .desc = "MWAIT 0x01",
481 .flags = MWAIT2flg(0x01),
483 .target_residency = 20,
484 .enter = &intel_idle,
485 .enter_s2idle = intel_idle_s2idle, },
488 .desc = "MWAIT 0x10",
489 .flags = MWAIT2flg(0x10) | CPUIDLE_FLAG_TLB_FLUSHED,
491 .target_residency = 100,
492 .enter = &intel_idle,
493 .enter_s2idle = intel_idle_s2idle, },
496 .desc = "MWAIT 0x20",
497 .flags = MWAIT2flg(0x20) | CPUIDLE_FLAG_TLB_FLUSHED,
499 .target_residency = 400,
500 .enter = &intel_idle,
501 .enter_s2idle = intel_idle_s2idle, },
504 .desc = "MWAIT 0x32",
505 .flags = MWAIT2flg(0x32) | CPUIDLE_FLAG_TLB_FLUSHED,
507 .target_residency = 500,
508 .enter = &intel_idle,
509 .enter_s2idle = intel_idle_s2idle, },
512 .desc = "MWAIT 0x40",
513 .flags = MWAIT2flg(0x40) | CPUIDLE_FLAG_TLB_FLUSHED,
515 .target_residency = 900,
516 .enter = &intel_idle,
517 .enter_s2idle = intel_idle_s2idle, },
520 .desc = "MWAIT 0x50",
521 .flags = MWAIT2flg(0x50) | CPUIDLE_FLAG_TLB_FLUSHED,
523 .target_residency = 1800,
524 .enter = &intel_idle,
525 .enter_s2idle = intel_idle_s2idle, },
528 .desc = "MWAIT 0x60",
529 .flags = MWAIT2flg(0x60) | CPUIDLE_FLAG_TLB_FLUSHED,
530 .exit_latency = 2600,
531 .target_residency = 7700,
532 .enter = &intel_idle,
533 .enter_s2idle = intel_idle_s2idle, },
537 static struct cpuidle_state bdw_cstates[] = {
540 .desc = "MWAIT 0x00",
541 .flags = MWAIT2flg(0x00),
543 .target_residency = 2,
544 .enter = &intel_idle,
545 .enter_s2idle = intel_idle_s2idle, },
548 .desc = "MWAIT 0x01",
549 .flags = MWAIT2flg(0x01),
551 .target_residency = 20,
552 .enter = &intel_idle,
553 .enter_s2idle = intel_idle_s2idle, },
556 .desc = "MWAIT 0x10",
557 .flags = MWAIT2flg(0x10) | CPUIDLE_FLAG_TLB_FLUSHED,
559 .target_residency = 100,
560 .enter = &intel_idle,
561 .enter_s2idle = intel_idle_s2idle, },
564 .desc = "MWAIT 0x20",
565 .flags = MWAIT2flg(0x20) | CPUIDLE_FLAG_TLB_FLUSHED,
567 .target_residency = 400,
568 .enter = &intel_idle,
569 .enter_s2idle = intel_idle_s2idle, },
572 .desc = "MWAIT 0x32",
573 .flags = MWAIT2flg(0x32) | CPUIDLE_FLAG_TLB_FLUSHED,
575 .target_residency = 500,
576 .enter = &intel_idle,
577 .enter_s2idle = intel_idle_s2idle, },
580 .desc = "MWAIT 0x40",
581 .flags = MWAIT2flg(0x40) | CPUIDLE_FLAG_TLB_FLUSHED,
583 .target_residency = 900,
584 .enter = &intel_idle,
585 .enter_s2idle = intel_idle_s2idle, },
588 .desc = "MWAIT 0x50",
589 .flags = MWAIT2flg(0x50) | CPUIDLE_FLAG_TLB_FLUSHED,
591 .target_residency = 1800,
592 .enter = &intel_idle,
593 .enter_s2idle = intel_idle_s2idle, },
596 .desc = "MWAIT 0x60",
597 .flags = MWAIT2flg(0x60) | CPUIDLE_FLAG_TLB_FLUSHED,
598 .exit_latency = 2600,
599 .target_residency = 7700,
600 .enter = &intel_idle,
601 .enter_s2idle = intel_idle_s2idle, },
606 static struct cpuidle_state skl_cstates[] = {
609 .desc = "MWAIT 0x00",
610 .flags = MWAIT2flg(0x00),
612 .target_residency = 2,
613 .enter = &intel_idle,
614 .enter_s2idle = intel_idle_s2idle, },
617 .desc = "MWAIT 0x01",
618 .flags = MWAIT2flg(0x01),
620 .target_residency = 20,
621 .enter = &intel_idle,
622 .enter_s2idle = intel_idle_s2idle, },
625 .desc = "MWAIT 0x10",
626 .flags = MWAIT2flg(0x10) | CPUIDLE_FLAG_TLB_FLUSHED,
628 .target_residency = 100,
629 .enter = &intel_idle,
630 .enter_s2idle = intel_idle_s2idle, },
633 .desc = "MWAIT 0x20",
634 .flags = MWAIT2flg(0x20) | CPUIDLE_FLAG_TLB_FLUSHED | CPUIDLE_FLAG_IBRS,
636 .target_residency = 200,
637 .enter = &intel_idle,
638 .enter_s2idle = intel_idle_s2idle, },
641 .desc = "MWAIT 0x33",
642 .flags = MWAIT2flg(0x33) | CPUIDLE_FLAG_TLB_FLUSHED | CPUIDLE_FLAG_IBRS,
644 .target_residency = 800,
645 .enter = &intel_idle,
646 .enter_s2idle = intel_idle_s2idle, },
649 .desc = "MWAIT 0x40",
650 .flags = MWAIT2flg(0x40) | CPUIDLE_FLAG_TLB_FLUSHED | CPUIDLE_FLAG_IBRS,
652 .target_residency = 800,
653 .enter = &intel_idle,
654 .enter_s2idle = intel_idle_s2idle, },
657 .desc = "MWAIT 0x50",
658 .flags = MWAIT2flg(0x50) | CPUIDLE_FLAG_TLB_FLUSHED | CPUIDLE_FLAG_IBRS,
660 .target_residency = 5000,
661 .enter = &intel_idle,
662 .enter_s2idle = intel_idle_s2idle, },
665 .desc = "MWAIT 0x60",
666 .flags = MWAIT2flg(0x60) | CPUIDLE_FLAG_TLB_FLUSHED | CPUIDLE_FLAG_IBRS,
668 .target_residency = 5000,
669 .enter = &intel_idle,
670 .enter_s2idle = intel_idle_s2idle, },
675 static struct cpuidle_state skx_cstates[] = {
678 .desc = "MWAIT 0x00",
679 .flags = MWAIT2flg(0x00),
681 .target_residency = 2,
682 .enter = &intel_idle,
683 .enter_s2idle = intel_idle_s2idle, },
686 .desc = "MWAIT 0x01",
687 .flags = MWAIT2flg(0x01),
689 .target_residency = 20,
690 .enter = &intel_idle,
691 .enter_s2idle = intel_idle_s2idle, },
694 .desc = "MWAIT 0x20",
695 .flags = MWAIT2flg(0x20) | CPUIDLE_FLAG_TLB_FLUSHED | CPUIDLE_FLAG_IBRS,
697 .target_residency = 600,
698 .enter = &intel_idle,
699 .enter_s2idle = intel_idle_s2idle, },
704 static struct cpuidle_state atom_cstates[] = {
707 .desc = "MWAIT 0x00",
708 .flags = MWAIT2flg(0x00),
710 .target_residency = 20,
711 .enter = &intel_idle,
712 .enter_s2idle = intel_idle_s2idle, },
715 .desc = "MWAIT 0x10",
716 .flags = MWAIT2flg(0x10),
718 .target_residency = 80,
719 .enter = &intel_idle,
720 .enter_s2idle = intel_idle_s2idle, },
723 .desc = "MWAIT 0x30",
724 .flags = MWAIT2flg(0x30) | CPUIDLE_FLAG_TLB_FLUSHED,
726 .target_residency = 400,
727 .enter = &intel_idle,
728 .enter_s2idle = intel_idle_s2idle, },
731 .desc = "MWAIT 0x52",
732 .flags = MWAIT2flg(0x52) | CPUIDLE_FLAG_TLB_FLUSHED,
734 .target_residency = 560,
735 .enter = &intel_idle,
736 .enter_s2idle = intel_idle_s2idle, },
740 static struct cpuidle_state tangier_cstates[] = {
743 .desc = "MWAIT 0x00",
744 .flags = MWAIT2flg(0x00),
746 .target_residency = 4,
747 .enter = &intel_idle,
748 .enter_s2idle = intel_idle_s2idle, },
751 .desc = "MWAIT 0x30",
752 .flags = MWAIT2flg(0x30) | CPUIDLE_FLAG_TLB_FLUSHED,
754 .target_residency = 400,
755 .enter = &intel_idle,
756 .enter_s2idle = intel_idle_s2idle, },
759 .desc = "MWAIT 0x52",
760 .flags = MWAIT2flg(0x52) | CPUIDLE_FLAG_TLB_FLUSHED,
762 .target_residency = 560,
763 .enter = &intel_idle,
764 .enter_s2idle = intel_idle_s2idle, },
767 .desc = "MWAIT 0x60",
768 .flags = MWAIT2flg(0x60) | CPUIDLE_FLAG_TLB_FLUSHED,
769 .exit_latency = 1200,
770 .target_residency = 4000,
771 .enter = &intel_idle,
772 .enter_s2idle = intel_idle_s2idle, },
775 .desc = "MWAIT 0x64",
776 .flags = MWAIT2flg(0x64) | CPUIDLE_FLAG_TLB_FLUSHED,
777 .exit_latency = 10000,
778 .target_residency = 20000,
779 .enter = &intel_idle,
780 .enter_s2idle = intel_idle_s2idle, },
784 static struct cpuidle_state avn_cstates[] = {
787 .desc = "MWAIT 0x00",
788 .flags = MWAIT2flg(0x00),
790 .target_residency = 2,
791 .enter = &intel_idle,
792 .enter_s2idle = intel_idle_s2idle, },
795 .desc = "MWAIT 0x51",
796 .flags = MWAIT2flg(0x51) | CPUIDLE_FLAG_TLB_FLUSHED,
798 .target_residency = 45,
799 .enter = &intel_idle,
800 .enter_s2idle = intel_idle_s2idle, },
804 static struct cpuidle_state knl_cstates[] = {
807 .desc = "MWAIT 0x00",
808 .flags = MWAIT2flg(0x00),
810 .target_residency = 2,
811 .enter = &intel_idle,
812 .enter_s2idle = intel_idle_s2idle },
815 .desc = "MWAIT 0x10",
816 .flags = MWAIT2flg(0x10) | CPUIDLE_FLAG_TLB_FLUSHED,
818 .target_residency = 500,
819 .enter = &intel_idle,
820 .enter_s2idle = intel_idle_s2idle },
825 static struct cpuidle_state bxt_cstates[] = {
828 .desc = "MWAIT 0x00",
829 .flags = MWAIT2flg(0x00),
831 .target_residency = 2,
832 .enter = &intel_idle,
833 .enter_s2idle = intel_idle_s2idle, },
836 .desc = "MWAIT 0x01",
837 .flags = MWAIT2flg(0x01),
839 .target_residency = 20,
840 .enter = &intel_idle,
841 .enter_s2idle = intel_idle_s2idle, },
844 .desc = "MWAIT 0x20",
845 .flags = MWAIT2flg(0x20) | CPUIDLE_FLAG_TLB_FLUSHED,
847 .target_residency = 133,
848 .enter = &intel_idle,
849 .enter_s2idle = intel_idle_s2idle, },
852 .desc = "MWAIT 0x31",
853 .flags = MWAIT2flg(0x31) | CPUIDLE_FLAG_TLB_FLUSHED,
855 .target_residency = 155,
856 .enter = &intel_idle,
857 .enter_s2idle = intel_idle_s2idle, },
860 .desc = "MWAIT 0x40",
861 .flags = MWAIT2flg(0x40) | CPUIDLE_FLAG_TLB_FLUSHED,
862 .exit_latency = 1000,
863 .target_residency = 1000,
864 .enter = &intel_idle,
865 .enter_s2idle = intel_idle_s2idle, },
868 .desc = "MWAIT 0x50",
869 .flags = MWAIT2flg(0x50) | CPUIDLE_FLAG_TLB_FLUSHED,
870 .exit_latency = 2000,
871 .target_residency = 2000,
872 .enter = &intel_idle,
873 .enter_s2idle = intel_idle_s2idle, },
876 .desc = "MWAIT 0x60",
877 .flags = MWAIT2flg(0x60) | CPUIDLE_FLAG_TLB_FLUSHED,
878 .exit_latency = 10000,
879 .target_residency = 10000,
880 .enter = &intel_idle,
881 .enter_s2idle = intel_idle_s2idle, },
886 static struct cpuidle_state dnv_cstates[] = {
889 .desc = "MWAIT 0x00",
890 .flags = MWAIT2flg(0x00),
892 .target_residency = 2,
893 .enter = &intel_idle,
894 .enter_s2idle = intel_idle_s2idle, },
897 .desc = "MWAIT 0x01",
898 .flags = MWAIT2flg(0x01),
900 .target_residency = 20,
901 .enter = &intel_idle,
902 .enter_s2idle = intel_idle_s2idle, },
905 .desc = "MWAIT 0x20",
906 .flags = MWAIT2flg(0x20) | CPUIDLE_FLAG_TLB_FLUSHED,
908 .target_residency = 500,
909 .enter = &intel_idle,
910 .enter_s2idle = intel_idle_s2idle, },
917 * @dev: cpuidle_device
918 * @drv: cpuidle driver
919 * @index: index of cpuidle state
921 * Must be called under local_irq_disable().
923 static __cpuidle int intel_idle(struct cpuidle_device *dev,
924 struct cpuidle_driver *drv, int index)
926 unsigned long ecx = 1; /* break on interrupt flag */
927 struct cpuidle_state *state = &drv->states[index];
928 unsigned long eax = flg2MWAIT(state->flags);
930 bool uninitialized_var(tick);
931 int cpu = smp_processor_id();
934 * leave_mm() to avoid costly and often unnecessary wakeups
935 * for flushing the user TLB's associated with the active mm.
937 if (state->flags & CPUIDLE_FLAG_TLB_FLUSHED)
940 if (!static_cpu_has(X86_FEATURE_ARAT)) {
941 cstate = (((eax) >> MWAIT_SUBSTATE_SIZE) &
942 MWAIT_CSTATE_MASK) + 1;
944 if (!(lapic_timer_reliable_states & (1 << (cstate)))) {
946 tick_broadcast_enter();
950 mwait_idle_with_hints(eax, ecx);
952 if (!static_cpu_has(X86_FEATURE_ARAT) && tick)
953 tick_broadcast_exit();
959 * intel_idle_s2idle - simplified "enter" callback routine for suspend-to-idle
960 * @dev: cpuidle_device
961 * @drv: cpuidle driver
962 * @index: state index
964 static void intel_idle_s2idle(struct cpuidle_device *dev,
965 struct cpuidle_driver *drv, int index)
967 unsigned long ecx = 1; /* break on interrupt flag */
968 unsigned long eax = flg2MWAIT(drv->states[index].flags);
970 mwait_idle_with_hints(eax, ecx);
973 static void __setup_broadcast_timer(bool on)
976 tick_broadcast_enable();
978 tick_broadcast_disable();
981 static void auto_demotion_disable(void)
983 unsigned long long msr_bits;
985 rdmsrl(MSR_PKG_CST_CONFIG_CONTROL, msr_bits);
986 msr_bits &= ~(icpu->auto_demotion_disable_flags);
987 wrmsrl(MSR_PKG_CST_CONFIG_CONTROL, msr_bits);
989 static void c1e_promotion_disable(void)
991 unsigned long long msr_bits;
993 rdmsrl(MSR_IA32_POWER_CTL, msr_bits);
995 wrmsrl(MSR_IA32_POWER_CTL, msr_bits);
998 static const struct idle_cpu idle_cpu_nehalem = {
999 .state_table = nehalem_cstates,
1000 .auto_demotion_disable_flags = NHM_C1_AUTO_DEMOTE | NHM_C3_AUTO_DEMOTE,
1001 .disable_promotion_to_c1e = true,
1004 static const struct idle_cpu idle_cpu_atom = {
1005 .state_table = atom_cstates,
1008 static const struct idle_cpu idle_cpu_tangier = {
1009 .state_table = tangier_cstates,
1012 static const struct idle_cpu idle_cpu_lincroft = {
1013 .state_table = atom_cstates,
1014 .auto_demotion_disable_flags = ATM_LNC_C6_AUTO_DEMOTE,
1017 static const struct idle_cpu idle_cpu_snb = {
1018 .state_table = snb_cstates,
1019 .disable_promotion_to_c1e = true,
1022 static const struct idle_cpu idle_cpu_byt = {
1023 .state_table = byt_cstates,
1024 .disable_promotion_to_c1e = true,
1025 .byt_auto_demotion_disable_flag = true,
1028 static const struct idle_cpu idle_cpu_cht = {
1029 .state_table = cht_cstates,
1030 .disable_promotion_to_c1e = true,
1031 .byt_auto_demotion_disable_flag = true,
1034 static const struct idle_cpu idle_cpu_ivb = {
1035 .state_table = ivb_cstates,
1036 .disable_promotion_to_c1e = true,
1039 static const struct idle_cpu idle_cpu_ivt = {
1040 .state_table = ivt_cstates,
1041 .disable_promotion_to_c1e = true,
1044 static const struct idle_cpu idle_cpu_hsw = {
1045 .state_table = hsw_cstates,
1046 .disable_promotion_to_c1e = true,
1049 static const struct idle_cpu idle_cpu_bdw = {
1050 .state_table = bdw_cstates,
1051 .disable_promotion_to_c1e = true,
1054 static const struct idle_cpu idle_cpu_skl = {
1055 .state_table = skl_cstates,
1056 .disable_promotion_to_c1e = true,
1059 static const struct idle_cpu idle_cpu_skx = {
1060 .state_table = skx_cstates,
1061 .disable_promotion_to_c1e = true,
1064 static const struct idle_cpu idle_cpu_avn = {
1065 .state_table = avn_cstates,
1066 .disable_promotion_to_c1e = true,
1069 static const struct idle_cpu idle_cpu_knl = {
1070 .state_table = knl_cstates,
1073 static const struct idle_cpu idle_cpu_bxt = {
1074 .state_table = bxt_cstates,
1075 .disable_promotion_to_c1e = true,
1078 static const struct idle_cpu idle_cpu_dnv = {
1079 .state_table = dnv_cstates,
1080 .disable_promotion_to_c1e = true,
1083 static const struct x86_cpu_id intel_idle_ids[] __initconst = {
1084 INTEL_CPU_FAM6(NEHALEM_EP, idle_cpu_nehalem),
1085 INTEL_CPU_FAM6(NEHALEM, idle_cpu_nehalem),
1086 INTEL_CPU_FAM6(NEHALEM_G, idle_cpu_nehalem),
1087 INTEL_CPU_FAM6(WESTMERE, idle_cpu_nehalem),
1088 INTEL_CPU_FAM6(WESTMERE_EP, idle_cpu_nehalem),
1089 INTEL_CPU_FAM6(NEHALEM_EX, idle_cpu_nehalem),
1090 INTEL_CPU_FAM6(ATOM_BONNELL, idle_cpu_atom),
1091 INTEL_CPU_FAM6(ATOM_BONNELL_MID, idle_cpu_lincroft),
1092 INTEL_CPU_FAM6(WESTMERE_EX, idle_cpu_nehalem),
1093 INTEL_CPU_FAM6(SANDYBRIDGE, idle_cpu_snb),
1094 INTEL_CPU_FAM6(SANDYBRIDGE_X, idle_cpu_snb),
1095 INTEL_CPU_FAM6(ATOM_SALTWELL, idle_cpu_atom),
1096 INTEL_CPU_FAM6(ATOM_SILVERMONT, idle_cpu_byt),
1097 INTEL_CPU_FAM6(ATOM_SILVERMONT_MID, idle_cpu_tangier),
1098 INTEL_CPU_FAM6(ATOM_AIRMONT, idle_cpu_cht),
1099 INTEL_CPU_FAM6(IVYBRIDGE, idle_cpu_ivb),
1100 INTEL_CPU_FAM6(IVYBRIDGE_X, idle_cpu_ivt),
1101 INTEL_CPU_FAM6(HASWELL, idle_cpu_hsw),
1102 INTEL_CPU_FAM6(HASWELL_X, idle_cpu_hsw),
1103 INTEL_CPU_FAM6(HASWELL_L, idle_cpu_hsw),
1104 INTEL_CPU_FAM6(HASWELL_G, idle_cpu_hsw),
1105 INTEL_CPU_FAM6(ATOM_SILVERMONT_D, idle_cpu_avn),
1106 INTEL_CPU_FAM6(BROADWELL, idle_cpu_bdw),
1107 INTEL_CPU_FAM6(BROADWELL_G, idle_cpu_bdw),
1108 INTEL_CPU_FAM6(BROADWELL_X, idle_cpu_bdw),
1109 INTEL_CPU_FAM6(BROADWELL_D, idle_cpu_bdw),
1110 INTEL_CPU_FAM6(SKYLAKE_L, idle_cpu_skl),
1111 INTEL_CPU_FAM6(SKYLAKE, idle_cpu_skl),
1112 INTEL_CPU_FAM6(KABYLAKE_L, idle_cpu_skl),
1113 INTEL_CPU_FAM6(KABYLAKE, idle_cpu_skl),
1114 INTEL_CPU_FAM6(SKYLAKE_X, idle_cpu_skx),
1115 INTEL_CPU_FAM6(XEON_PHI_KNL, idle_cpu_knl),
1116 INTEL_CPU_FAM6(XEON_PHI_KNM, idle_cpu_knl),
1117 INTEL_CPU_FAM6(ATOM_GOLDMONT, idle_cpu_bxt),
1118 INTEL_CPU_FAM6(ATOM_GOLDMONT_PLUS, idle_cpu_bxt),
1119 INTEL_CPU_FAM6(ATOM_GOLDMONT_D, idle_cpu_dnv),
1120 INTEL_CPU_FAM6(ATOM_TREMONT_D, idle_cpu_dnv),
1125 * intel_idle_probe()
1127 static int __init intel_idle_probe(void)
1129 unsigned int eax, ebx, ecx;
1130 const struct x86_cpu_id *id;
1132 if (max_cstate == 0) {
1133 pr_debug("disabled\n");
1137 id = x86_match_cpu(intel_idle_ids);
1139 if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL &&
1140 boot_cpu_data.x86 == 6)
1141 pr_debug("does not run on family %d model %d\n",
1142 boot_cpu_data.x86, boot_cpu_data.x86_model);
1146 if (!boot_cpu_has(X86_FEATURE_MWAIT)) {
1147 pr_debug("Please enable MWAIT in BIOS SETUP\n");
1151 if (boot_cpu_data.cpuid_level < CPUID_MWAIT_LEAF)
1154 cpuid(CPUID_MWAIT_LEAF, &eax, &ebx, &ecx, &mwait_substates);
1156 if (!(ecx & CPUID5_ECX_EXTENSIONS_SUPPORTED) ||
1157 !(ecx & CPUID5_ECX_INTERRUPT_BREAK) ||
1161 pr_debug("MWAIT substates: 0x%x\n", mwait_substates);
1163 icpu = (const struct idle_cpu *)id->driver_data;
1164 cpuidle_state_table = icpu->state_table;
1166 pr_debug("v" INTEL_IDLE_VERSION " model 0x%X\n",
1167 boot_cpu_data.x86_model);
1173 * intel_idle_cpuidle_devices_uninit()
1174 * Unregisters the cpuidle devices.
1176 static void intel_idle_cpuidle_devices_uninit(void)
1179 struct cpuidle_device *dev;
1181 for_each_online_cpu(i) {
1182 dev = per_cpu_ptr(intel_idle_cpuidle_devices, i);
1183 cpuidle_unregister_device(dev);
1188 * ivt_idle_state_table_update(void)
1190 * Tune IVT multi-socket targets
1191 * Assumption: num_sockets == (max_package_num + 1)
1193 static void ivt_idle_state_table_update(void)
1195 /* IVT uses a different table for 1-2, 3-4, and > 4 sockets */
1196 int cpu, package_num, num_sockets = 1;
1198 for_each_online_cpu(cpu) {
1199 package_num = topology_physical_package_id(cpu);
1200 if (package_num + 1 > num_sockets) {
1201 num_sockets = package_num + 1;
1203 if (num_sockets > 4) {
1204 cpuidle_state_table = ivt_cstates_8s;
1210 if (num_sockets > 2)
1211 cpuidle_state_table = ivt_cstates_4s;
1213 /* else, 1 and 2 socket systems use default ivt_cstates */
1217 * Translate IRTL (Interrupt Response Time Limit) MSR to usec
1220 static unsigned int irtl_ns_units[] = {
1221 1, 32, 1024, 32768, 1048576, 33554432, 0, 0 };
1223 static unsigned long long irtl_2_usec(unsigned long long irtl)
1225 unsigned long long ns;
1230 ns = irtl_ns_units[(irtl >> 10) & 0x7];
1232 return div64_u64((irtl & 0x3FF) * ns, 1000);
1235 * bxt_idle_state_table_update(void)
1237 * On BXT, we trust the IRTL to show the definitive maximum latency
1238 * We use the same value for target_residency.
1240 static void bxt_idle_state_table_update(void)
1242 unsigned long long msr;
1245 rdmsrl(MSR_PKGC6_IRTL, msr);
1246 usec = irtl_2_usec(msr);
1248 bxt_cstates[2].exit_latency = usec;
1249 bxt_cstates[2].target_residency = usec;
1252 rdmsrl(MSR_PKGC7_IRTL, msr);
1253 usec = irtl_2_usec(msr);
1255 bxt_cstates[3].exit_latency = usec;
1256 bxt_cstates[3].target_residency = usec;
1259 rdmsrl(MSR_PKGC8_IRTL, msr);
1260 usec = irtl_2_usec(msr);
1262 bxt_cstates[4].exit_latency = usec;
1263 bxt_cstates[4].target_residency = usec;
1266 rdmsrl(MSR_PKGC9_IRTL, msr);
1267 usec = irtl_2_usec(msr);
1269 bxt_cstates[5].exit_latency = usec;
1270 bxt_cstates[5].target_residency = usec;
1273 rdmsrl(MSR_PKGC10_IRTL, msr);
1274 usec = irtl_2_usec(msr);
1276 bxt_cstates[6].exit_latency = usec;
1277 bxt_cstates[6].target_residency = usec;
1282 * sklh_idle_state_table_update(void)
1284 * On SKL-H (model 0x5e) disable C8 and C9 if:
1285 * C10 is enabled and SGX disabled
1287 static void sklh_idle_state_table_update(void)
1289 unsigned long long msr;
1290 unsigned int eax, ebx, ecx, edx;
1293 /* if PC10 disabled via cmdline intel_idle.max_cstate=7 or shallower */
1294 if (max_cstate <= 7)
1297 /* if PC10 not present in CPUID.MWAIT.EDX */
1298 if ((mwait_substates & (0xF << 28)) == 0)
1301 rdmsrl(MSR_PKG_CST_CONFIG_CONTROL, msr);
1303 /* PC10 is not enabled in PKG C-state limit */
1304 if ((msr & 0xF) != 8)
1308 cpuid(7, &eax, &ebx, &ecx, &edx);
1310 /* if SGX is present */
1311 if (ebx & (1 << 2)) {
1313 rdmsrl(MSR_IA32_FEATURE_CONTROL, msr);
1315 /* if SGX is enabled */
1316 if (msr & (1 << 18))
1320 skl_cstates[5].disabled = 1; /* C8-SKL */
1321 skl_cstates[6].disabled = 1; /* C9-SKL */
1324 * intel_idle_state_table_update()
1326 * Update the default state_table for this CPU-id
1329 static void intel_idle_state_table_update(void)
1331 switch (boot_cpu_data.x86_model) {
1333 case INTEL_FAM6_IVYBRIDGE_X:
1334 ivt_idle_state_table_update();
1336 case INTEL_FAM6_ATOM_GOLDMONT:
1337 case INTEL_FAM6_ATOM_GOLDMONT_PLUS:
1338 bxt_idle_state_table_update();
1340 case INTEL_FAM6_SKYLAKE:
1341 sklh_idle_state_table_update();
1347 * intel_idle_cpuidle_driver_init()
1348 * allocate, initialize cpuidle_states
1350 static void __init intel_idle_cpuidle_driver_init(void)
1353 struct cpuidle_driver *drv = &intel_idle_driver;
1355 intel_idle_state_table_update();
1357 cpuidle_poll_state_init(drv);
1358 drv->state_count = 1;
1360 for (cstate = 0; cstate < CPUIDLE_STATE_MAX; ++cstate) {
1361 int num_substates, mwait_hint, mwait_cstate;
1363 if ((cpuidle_state_table[cstate].enter == NULL) &&
1364 (cpuidle_state_table[cstate].enter_s2idle == NULL))
1367 if (cstate + 1 > max_cstate) {
1368 pr_info("max_cstate %d reached\n", max_cstate);
1372 mwait_hint = flg2MWAIT(cpuidle_state_table[cstate].flags);
1373 mwait_cstate = MWAIT_HINT2CSTATE(mwait_hint);
1375 /* number of sub-states for this state in CPUID.MWAIT */
1376 num_substates = (mwait_substates >> ((mwait_cstate + 1) * 4))
1377 & MWAIT_SUBSTATE_MASK;
1379 /* if NO sub-states for this state in CPUID, skip it */
1380 if (num_substates == 0)
1383 /* if state marked as disabled, skip it */
1384 if (cpuidle_state_table[cstate].disabled != 0) {
1385 pr_debug("state %s is disabled\n",
1386 cpuidle_state_table[cstate].name);
1391 if (((mwait_cstate + 1) > 2) &&
1392 !boot_cpu_has(X86_FEATURE_NONSTOP_TSC))
1393 mark_tsc_unstable("TSC halts in idle"
1394 " states deeper than C2");
1396 drv->states[drv->state_count] = /* structure copy */
1397 cpuidle_state_table[cstate];
1399 if (cpu_feature_enabled(X86_FEATURE_KERNEL_IBRS) &&
1400 cpuidle_state_table[cstate].flags & CPUIDLE_FLAG_IBRS) {
1401 drv->states[drv->state_count].enter = intel_idle_ibrs;
1404 drv->state_count += 1;
1407 if (icpu->byt_auto_demotion_disable_flag) {
1408 wrmsrl(MSR_CC6_DEMOTION_POLICY_CONFIG, 0);
1409 wrmsrl(MSR_MC6_DEMOTION_POLICY_CONFIG, 0);
1415 * intel_idle_cpu_init()
1416 * allocate, initialize, register cpuidle_devices
1417 * @cpu: cpu/core to initialize
1419 static int intel_idle_cpu_init(unsigned int cpu)
1421 struct cpuidle_device *dev;
1423 dev = per_cpu_ptr(intel_idle_cpuidle_devices, cpu);
1426 if (cpuidle_register_device(dev)) {
1427 pr_debug("cpuidle_register_device %d failed!\n", cpu);
1431 if (icpu->auto_demotion_disable_flags)
1432 auto_demotion_disable();
1434 if (icpu->disable_promotion_to_c1e)
1435 c1e_promotion_disable();
1440 static int intel_idle_cpu_online(unsigned int cpu)
1442 struct cpuidle_device *dev;
1444 if (lapic_timer_reliable_states != LAPIC_TIMER_ALWAYS_RELIABLE)
1445 __setup_broadcast_timer(true);
1448 * Some systems can hotplug a cpu at runtime after
1449 * the kernel has booted, we have to initialize the
1450 * driver in this case
1452 dev = per_cpu_ptr(intel_idle_cpuidle_devices, cpu);
1453 if (!dev->registered)
1454 return intel_idle_cpu_init(cpu);
1459 static int __init intel_idle_init(void)
1463 /* Do not load intel_idle at all for now if idle= is passed */
1464 if (boot_option_idle_override != IDLE_NO_OVERRIDE)
1467 retval = intel_idle_probe();
1471 intel_idle_cpuidle_devices = alloc_percpu(struct cpuidle_device);
1472 if (intel_idle_cpuidle_devices == NULL)
1475 intel_idle_cpuidle_driver_init();
1476 retval = cpuidle_register_driver(&intel_idle_driver);
1478 struct cpuidle_driver *drv = cpuidle_get_driver();
1479 printk(KERN_DEBUG pr_fmt("intel_idle yielding to %s\n"),
1480 drv ? drv->name : "none");
1481 goto init_driver_fail;
1484 if (boot_cpu_has(X86_FEATURE_ARAT)) /* Always Reliable APIC Timer */
1485 lapic_timer_reliable_states = LAPIC_TIMER_ALWAYS_RELIABLE;
1487 retval = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "idle/intel:online",
1488 intel_idle_cpu_online, NULL);
1492 pr_debug("lapic_timer_reliable_states 0x%x\n",
1493 lapic_timer_reliable_states);
1498 intel_idle_cpuidle_devices_uninit();
1499 cpuidle_unregister_driver(&intel_idle_driver);
1501 free_percpu(intel_idle_cpuidle_devices);
1505 device_initcall(intel_idle_init);
1508 * We are not really modular, but we used to support that. Meaning we also
1509 * support "intel_idle.max_cstate=..." at boot and also a read-only export of
1510 * it at /sys/module/intel_idle/parameters/max_cstate -- so using module_param
1511 * is the easiest way (currently) to continue doing that.
1513 module_param(max_cstate, int, 0444);