#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <linux/errno.h>
#include <linux/kernel.h>
#include <linux/mm.h>
#include <linux/smp.h>
#include <linux/prctl.h>
#include <linux/slab.h>
#include <linux/sched.h>
#include <linux/module.h>
#include <linux/tick.h>
#include <linux/random.h>
#include <linux/user-return-notifier.h>
#include <linux/dmi.h>
#include <linux/utsname.h>
#include <linux/stackprotector.h>
#include <linux/cpuidle.h>
#include <trace/events/power.h>
#include <linux/hw_breakpoint.h>
#include <asm/cpu.h>
#include <asm/apic.h>
#include <asm/syscalls.h>
#include <asm/idle.h>
#include <asm/uaccess.h>
#include <asm/mwait.h>
#include <asm/fpu/internal.h>
#include <asm/debugreg.h>
#include <asm/nmi.h>
#include <asm/tlbflush.h>
#include <asm/mce.h>
#include <asm/vm86.h>
#include <asm/spec-ctrl.h>
/*
 * per-CPU TSS segments. Threads are completely 'soft' on Linux,
 * no more per-task TSS's. The TSS size is kept cacheline-aligned
 * so they are allowed to end up in the .data..cacheline_aligned
 * section. Since TSS's are completely CPU-local, we want them
 * on exact cacheline boundaries, to eliminate cacheline ping-pong.
 */
__visible DEFINE_PER_CPU_SHARED_ALIGNED_USER_MAPPED(struct tss_struct, cpu_tss) = {
	.x86_tss = {
		.sp0			= TOP_OF_INIT_STACK,
#ifdef CONFIG_X86_32
		.ss0			= __KERNEL_DS,
		.ss1			= __KERNEL_CS,
		.io_bitmap_base		= INVALID_IO_BITMAP_OFFSET,
#endif
	},
#ifdef CONFIG_X86_32
	/*
	 * Note that the .io_bitmap member must be extra-big. This is because
	 * the CPU will access an additional byte beyond the end of the IO
	 * permission bitmap. The extra byte must be all 1 bits, and must
	 * be within the limit.
	 */
	.io_bitmap		= { [0 ... IO_BITMAP_LONGS] = ~0 },
#endif
};
EXPORT_PER_CPU_SYMBOL(cpu_tss);
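
/*
 * Illustrative sketch (not part of this file): the per-task I/O bitmap that
 * later gets copied into tss->io_bitmap is normally set up when userspace
 * requests port access via ioperm()/iopl(). A minimal, hypothetical
 * userspace example, assuming a privileged process on x86:
 *
 *	#include <sys/io.h>
 *
 *	if (ioperm(0x378, 3, 1) == 0) {
 *		outb(0xff, 0x378);	// direct port I/O now permitted
 *		ioperm(0x378, 3, 0);	// drop the permission again
 *	}
 *
 * The resulting per-task bitmap is what switch_to_bitmap() below copies on
 * context switch and what exit_thread() clears again.
 */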
static DEFINE_PER_CPU(unsigned char, is_idle);
static ATOMIC_NOTIFIER_HEAD(idle_notifier);

void idle_notifier_register(struct notifier_block *n)
{
	atomic_notifier_chain_register(&idle_notifier, n);
}
EXPORT_SYMBOL_GPL(idle_notifier_register);

void idle_notifier_unregister(struct notifier_block *n)
{
	atomic_notifier_chain_unregister(&idle_notifier, n);
}
EXPORT_SYMBOL_GPL(idle_notifier_unregister);
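
/*
 * Illustrative sketch (not part of this file): a caller can hook these
 * notifiers to observe idle entry/exit. The callback and notifier_block
 * below are hypothetical, not an in-tree user:
 *
 *	static int my_idle_cb(struct notifier_block *nb, unsigned long val, void *data)
 *	{
 *		if (val == IDLE_START)
 *			pr_debug("CPU %d entering idle\n", smp_processor_id());
 *		return NOTIFY_OK;
 *	}
 *	static struct notifier_block my_idle_nb = { .notifier_call = my_idle_cb };
 *
 *	idle_notifier_register(&my_idle_nb);	// e.g. from module init
 *	idle_notifier_unregister(&my_idle_nb);	// e.g. from module exit
 */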
/*
 * this gets called so that we can store lazy state into memory and copy the
 * current task into the new thread.
 */
int arch_dup_task_struct(struct task_struct *dst, struct task_struct *src)
{
	memcpy(dst, src, arch_task_struct_size);
#ifdef CONFIG_VM86
	dst->thread.vm86 = NULL;
#endif
	return fpu__copy(&dst->thread.fpu, &src->thread.fpu);
}
/*
 * Free current thread data structures etc..
 */
void exit_thread(void)
{
	struct task_struct *me = current;
	struct thread_struct *t = &me->thread;
	unsigned long *bp = t->io_bitmap_ptr;
	struct fpu *fpu = &t->fpu;

	if (bp) {
		struct tss_struct *tss = &per_cpu(cpu_tss, get_cpu());

		t->io_bitmap_ptr = NULL;
		clear_thread_flag(TIF_IO_BITMAP);
		/* Careful, clear this in the TSS too: */
		memset(tss->io_bitmap, 0xff, t->io_bitmap_max);
		t->io_bitmap_max = 0;
		put_cpu();
		kfree(bp);
	}

	fpu__drop(fpu);
}
void flush_thread(void)
{
	struct task_struct *tsk = current;

	flush_ptrace_hw_breakpoint(tsk);
	memset(tsk->thread.tls_array, 0, sizeof(tsk->thread.tls_array));

	fpu__clear(&tsk->thread.fpu);
}
void disable_TSC(void)
{
	preempt_disable();
	if (!test_and_set_thread_flag(TIF_NOTSC))
		/*
		 * Must flip the CPU state synchronously with
		 * TIF_NOTSC in the current running context.
		 */
		cr4_set_bits(X86_CR4_TSD);
	preempt_enable();
}

static void enable_TSC(void)
{
	preempt_disable();
	if (test_and_clear_thread_flag(TIF_NOTSC))
		/*
		 * Must flip the CPU state synchronously with
		 * TIF_NOTSC in the current running context.
		 */
		cr4_clear_bits(X86_CR4_TSD);
	preempt_enable();
}
int get_tsc_mode(unsigned long adr)
{
	unsigned int val;

	if (test_thread_flag(TIF_NOTSC))
		val = PR_TSC_SIGSEGV;
	else
		val = PR_TSC_ENABLE;

	return put_user(val, (unsigned int __user *)adr);
}

int set_tsc_mode(unsigned int val)
{
	if (val == PR_TSC_SIGSEGV)
		disable_TSC();
	else if (val == PR_TSC_ENABLE)
		enable_TSC();
	else
		return -EINVAL;

	return 0;
}
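
/*
 * Illustrative sketch (not part of this file): these two helpers back the
 * PR_GET_TSC/PR_SET_TSC prctl() interface. A task could disable its own
 * RDTSC access from userspace roughly like this (assuming <sys/prctl.h>):
 *
 *	prctl(PR_SET_TSC, PR_TSC_SIGSEGV, 0, 0, 0);
 *	// any later RDTSC in this task now raises SIGSEGV
 *
 *	int mode;
 *	prctl(PR_GET_TSC, &mode, 0, 0, 0);	// reads back PR_TSC_SIGSEGV
 */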
static inline void switch_to_bitmap(struct thread_struct *prev,
				    struct thread_struct *next,
				    unsigned long tifp, unsigned long tifn)
{
	struct tss_struct *tss = this_cpu_ptr(&cpu_tss);

	if (tifn & _TIF_IO_BITMAP) {
		/*
		 * Copy the relevant range of the IO bitmap.
		 * Normally this is 128 bytes or less:
		 */
		memcpy(tss->io_bitmap, next->io_bitmap_ptr,
		       max(prev->io_bitmap_max, next->io_bitmap_max));
	} else if (tifp & _TIF_IO_BITMAP) {
		/*
		 * Clear any possible leftover bits:
		 */
		memset(tss->io_bitmap, 0xff, prev->io_bitmap_max);
	}
}
#ifdef CONFIG_SMP

struct ssb_state {
	struct ssb_state	*shared_state;
	raw_spinlock_t		lock;
	unsigned int		disable_state;
	unsigned long		local_state;
};

#define LSTATE_SSB	0

static DEFINE_PER_CPU(struct ssb_state, ssb_state);
void speculative_store_bypass_ht_init(void)
{
	struct ssb_state *st = this_cpu_ptr(&ssb_state);
	unsigned int this_cpu = smp_processor_id();
	unsigned int cpu;

	st->local_state = 0;

	/*
	 * Shared state setup happens once on the first bringup
	 * of the CPU. It's not destroyed on CPU hotunplug.
	 */
	if (st->shared_state)
		return;

	raw_spin_lock_init(&st->lock);

	/*
	 * Go over HT siblings and check whether one of them has set up the
	 * shared state pointer already.
	 */
	for_each_cpu(cpu, topology_sibling_cpumask(this_cpu)) {
		if (cpu == this_cpu)
			continue;

		if (!per_cpu(ssb_state, cpu).shared_state)
			continue;

		/* Link it to the state of the sibling: */
		st->shared_state = per_cpu(ssb_state, cpu).shared_state;
		return;
	}

	/*
	 * First HT sibling to come up on the core. Link shared state of
	 * the first HT sibling to itself. The siblings on the same core
	 * which come up later will see the shared state pointer and link
	 * themselves to the state of this CPU.
	 */
	st->shared_state = st;
}
/*
 * Logic is: The first HT sibling enables SSBD for both siblings in the core,
 * and the last sibling to disable it disables it for the whole core. This is
 * how MSR_SPEC_CTRL works in "hardware":
 *
 *  CORE_SPEC_CTRL = THREAD0_SPEC_CTRL | THREAD1_SPEC_CTRL
 */
static __always_inline void amd_set_core_ssb_state(unsigned long tifn)
{
	struct ssb_state *st = this_cpu_ptr(&ssb_state);
	u64 msr = x86_amd_ls_cfg_base;

	if (!static_cpu_has(X86_FEATURE_ZEN)) {
		msr |= ssbd_tif_to_amd_ls_cfg(tifn);
		wrmsrl(MSR_AMD64_LS_CFG, msr);
		return;
	}

	if (tifn & _TIF_SSBD) {
		/*
		 * Since this can race with prctl(), block reentry on the
		 * same CPU.
		 */
		if (__test_and_set_bit(LSTATE_SSB, &st->local_state))
			return;

		msr |= x86_amd_ls_cfg_ssbd_mask;

		raw_spin_lock(&st->shared_state->lock);
		/* First sibling enables SSBD: */
		if (!st->shared_state->disable_state)
			wrmsrl(MSR_AMD64_LS_CFG, msr);
		st->shared_state->disable_state++;
		raw_spin_unlock(&st->shared_state->lock);
	} else {
		if (!__test_and_clear_bit(LSTATE_SSB, &st->local_state))
			return;

		raw_spin_lock(&st->shared_state->lock);
		st->shared_state->disable_state--;
		if (!st->shared_state->disable_state)
			wrmsrl(MSR_AMD64_LS_CFG, msr);
		raw_spin_unlock(&st->shared_state->lock);
	}
}
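
/*
 * Worked example of the refcount logic above (illustrative only): with both
 * HT siblings initially running with SSBD off (disable_state == 0):
 *
 *	thread0 sets TIF_SSBD   -> disable_state 0->1, MSR written with SSBD set
 *	thread1 sets TIF_SSBD   -> disable_state 1->2, MSR left alone
 *	thread0 clears TIF_SSBD -> disable_state 2->1, MSR left alone
 *	thread1 clears TIF_SSBD -> disable_state 1->0, MSR written with SSBD clear
 *
 * so the core-wide MSR_AMD64_LS_CFG only flips on the first enable and the
 * last disable, matching CORE_SPEC_CTRL = THREAD0_SPEC_CTRL | THREAD1_SPEC_CTRL.
 */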
#else /* !CONFIG_SMP */
static __always_inline void amd_set_core_ssb_state(unsigned long tifn)
{
	u64 msr = x86_amd_ls_cfg_base | ssbd_tif_to_amd_ls_cfg(tifn);

	wrmsrl(MSR_AMD64_LS_CFG, msr);
}
#endif

static __always_inline void amd_set_ssb_virt_state(unsigned long tifn)
{
	/*
	 * SSBD has the same definition in SPEC_CTRL and VIRT_SPEC_CTRL,
	 * so ssbd_tif_to_spec_ctrl() just works.
	 */
	wrmsrl(MSR_AMD64_VIRT_SPEC_CTRL, ssbd_tif_to_spec_ctrl(tifn));
}
/*
 * Update the MSRs managing speculation control, during context switch.
 *
 * tifp: Previous task's thread flags
 * tifn: Next task's thread flags
 */
static __always_inline void __speculation_ctrl_update(unsigned long tifp,
						       unsigned long tifn)
{
	unsigned long tif_diff = tifp ^ tifn;
	u64 msr = x86_spec_ctrl_base;
	bool updmsr = false;

	/* Handle change of TIF_SSBD depending on the mitigation method. */
	if (static_cpu_has(X86_FEATURE_VIRT_SSBD)) {
		if (tif_diff & _TIF_SSBD)
			amd_set_ssb_virt_state(tifn);
	} else if (static_cpu_has(X86_FEATURE_LS_CFG_SSBD)) {
		if (tif_diff & _TIF_SSBD)
			amd_set_core_ssb_state(tifn);
	} else if (static_cpu_has(X86_FEATURE_SPEC_CTRL_SSBD) ||
		   static_cpu_has(X86_FEATURE_AMD_SSBD)) {
		updmsr |= !!(tif_diff & _TIF_SSBD);
		msr |= ssbd_tif_to_spec_ctrl(tifn);
	}

	/* Only evaluate TIF_SPEC_IB if conditional STIBP is enabled. */
	if (IS_ENABLED(CONFIG_SMP) &&
	    static_branch_unlikely(&switch_to_cond_stibp)) {
		updmsr |= !!(tif_diff & _TIF_SPEC_IB);
		msr |= stibp_tif_to_spec_ctrl(tifn);
	}

	if (updmsr)
		wrmsrl(MSR_IA32_SPEC_CTRL, msr);
}
static unsigned long speculation_ctrl_update_tif(struct task_struct *tsk)
{
	if (test_and_clear_tsk_thread_flag(tsk, TIF_SPEC_FORCE_UPDATE)) {
		if (task_spec_ssb_disable(tsk))
			set_tsk_thread_flag(tsk, TIF_SSBD);
		else
			clear_tsk_thread_flag(tsk, TIF_SSBD);

		if (task_spec_ib_disable(tsk))
			set_tsk_thread_flag(tsk, TIF_SPEC_IB);
		else
			clear_tsk_thread_flag(tsk, TIF_SPEC_IB);
	}
	/* Return the updated thread info flags */
	return task_thread_info(tsk)->flags;
}

void speculation_ctrl_update(unsigned long tif)
{
	/* Forced update. Make sure all relevant TIF flags are different */
	preempt_disable();
	__speculation_ctrl_update(~tif, tif);
	preempt_enable();
}

/* Called from seccomp/prctl update */
void speculation_ctrl_update_current(void)
{
	preempt_disable();
	speculation_ctrl_update(speculation_ctrl_update_tif(current));
	preempt_enable();
}
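
/*
 * Illustrative sketch (not part of this file): the "seccomp/prctl update"
 * path above is reached when a task changes its own speculation policy,
 * e.g. from userspace (assuming <sys/prctl.h> with the speculation-control
 * constants):
 *
 *	prctl(PR_SET_SPECULATION_CTRL, PR_SPEC_STORE_BYPASS, PR_SPEC_DISABLE, 0, 0);
 *
 * That marks the task for SSBD (setting TIF_SSBD, and TIF_SPEC_FORCE_UPDATE
 * where needed), after which speculation_ctrl_update_current() rewrites the
 * MSRs for the running task via __speculation_ctrl_update().
 */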
void __switch_to_xtra(struct task_struct *prev_p, struct task_struct *next_p)
{
	struct thread_struct *prev, *next;
	unsigned long tifp, tifn;

	prev = &prev_p->thread;
	next = &next_p->thread;

	tifn = READ_ONCE(task_thread_info(next_p)->flags);
	tifp = READ_ONCE(task_thread_info(prev_p)->flags);
	switch_to_bitmap(prev, next, tifp, tifn);

	propagate_user_return_notify(prev_p, next_p);

	if ((tifp & _TIF_BLOCKSTEP || tifn & _TIF_BLOCKSTEP) &&
	    arch_has_block_step()) {
		unsigned long debugctl, msk;

		rdmsrl(MSR_IA32_DEBUGCTLMSR, debugctl);
		debugctl &= ~DEBUGCTLMSR_BTF;
		msk = tifn & _TIF_BLOCKSTEP;
		debugctl |= (msk >> TIF_BLOCKSTEP) << DEBUGCTLMSR_BTF_SHIFT;
		wrmsrl(MSR_IA32_DEBUGCTLMSR, debugctl);
	}

	if ((tifp ^ tifn) & _TIF_NOTSC)
		cr4_toggle_bits(X86_CR4_TSD);

	if (likely(!((tifp | tifn) & _TIF_SPEC_FORCE_UPDATE))) {
		__speculation_ctrl_update(tifp, tifn);
	} else {
		speculation_ctrl_update_tif(prev_p);
		tifn = speculation_ctrl_update_tif(next_p);

		/* Enforce MSR update to ensure consistent state */
		__speculation_ctrl_update(~tifn, tifn);
	}
}
/*
 * Idle related variables and functions
 */
unsigned long boot_option_idle_override = IDLE_NO_OVERRIDE;
EXPORT_SYMBOL(boot_option_idle_override);

static void (*x86_idle)(void);
#ifndef CONFIG_SMP
static inline void play_dead(void)
{
	BUG();
}
#endif

void enter_idle(void)
{
	this_cpu_write(is_idle, 1);
	atomic_notifier_call_chain(&idle_notifier, IDLE_START, NULL);
}

static void __exit_idle(void)
{
	if (x86_test_and_clear_bit_percpu(0, is_idle) == 0)
		return;
	atomic_notifier_call_chain(&idle_notifier, IDLE_END, NULL);
}

/* Called from interrupts to signify idle end */
void exit_idle(void)
{
	/* idle loop has pid 0 */
	if (current->pid)
		return;
	__exit_idle();
}
void arch_cpu_idle_enter(void)
{
	local_touch_nmi();
	enter_idle();
}

void arch_cpu_idle_exit(void)
{
	__exit_idle();
}

void arch_cpu_idle_dead(void)
{
	play_dead();
}

/*
 * Called from the generic idle code.
 */
void arch_cpu_idle(void)
{
	x86_idle();
}
/*
 * We use this if we don't have any better idle routine.
 */
void default_idle(void)
{
	trace_cpu_idle_rcuidle(1, smp_processor_id());
	safe_halt();
	trace_cpu_idle_rcuidle(PWR_EVENT_EXIT, smp_processor_id());
}
#ifdef CONFIG_APM_MODULE
EXPORT_SYMBOL(default_idle);
#endif

#ifdef CONFIG_XEN
bool xen_set_default_idle(void)
{
	bool ret = !!x86_idle;

	x86_idle = default_idle;

	return ret;
}
#endif
void stop_this_cpu(void *dummy)
{
	local_irq_disable();
	/* Remove this CPU: */
	set_cpu_online(smp_processor_id(), false);
	disable_local_APIC();
	mcheck_cpu_clear(this_cpu_ptr(&cpu_info));

	for (;;)
		halt();
}
bool amd_e400_c1e_detected;
EXPORT_SYMBOL(amd_e400_c1e_detected);

static cpumask_var_t amd_e400_c1e_mask;

void amd_e400_remove_cpu(int cpu)
{
	if (amd_e400_c1e_mask != NULL)
		cpumask_clear_cpu(cpu, amd_e400_c1e_mask);
}
/*
 * AMD Erratum 400 aware idle routine. We check for C1E active in the
 * interrupt pending message MSR. If we detect C1E, then we handle it the
 * same way as C3 power states (local APIC timer and TSC stop).
 */
static void amd_e400_idle(void)
{
	if (!amd_e400_c1e_detected) {
		u32 lo, hi;

		rdmsr(MSR_K8_INT_PENDING_MSG, lo, hi);

		if (lo & K8_INTP_C1E_ACTIVE_MASK) {
			amd_e400_c1e_detected = true;
			if (!boot_cpu_has(X86_FEATURE_NONSTOP_TSC))
				mark_tsc_unstable("TSC halt in AMD C1E");
			pr_info("System has AMD C1E enabled\n");
		}
	}

	if (amd_e400_c1e_detected) {
		int cpu = smp_processor_id();

		if (!cpumask_test_cpu(cpu, amd_e400_c1e_mask)) {
			cpumask_set_cpu(cpu, amd_e400_c1e_mask);
			/* Force broadcast so ACPI can not interfere. */
			tick_broadcast_force();
			pr_info("Switch to broadcast mode on CPU%d\n", cpu);
		}
		tick_broadcast_enter();

		default_idle();

		/*
		 * The switch back from broadcast mode needs to be
		 * called with interrupts disabled.
		 */
		local_irq_disable();
		tick_broadcast_exit();
		local_irq_enable();
	} else
		default_idle();
}
/*
 * Intel Core2 and older machines prefer MWAIT over HALT for C1.
 * We can't rely on cpuidle installing MWAIT, because it will not load
 * on systems that support only C1 -- so the boot default must be MWAIT.
 *
 * Some AMD machines are the opposite; they depend on using HALT.
 *
 * So for default C1, which is used during boot until cpuidle loads,
 * use MWAIT-C1 on Intel HW that has it, else use HALT.
 */
static int prefer_mwait_c1_over_halt(const struct cpuinfo_x86 *c)
{
	if (c->x86_vendor != X86_VENDOR_INTEL)
		return 0;

	if (!cpu_has(c, X86_FEATURE_MWAIT))
		return 0;

	return 1;
}
/*
 * MONITOR/MWAIT with no hints, used for default C1 state. This invokes MWAIT
 * with interrupts enabled and no flags, which is backwards compatible with the
 * original MWAIT implementation.
 */
static void mwait_idle(void)
{
	if (!current_set_polling_and_test()) {
		trace_cpu_idle_rcuidle(1, smp_processor_id());
		if (this_cpu_has(X86_BUG_CLFLUSH_MONITOR)) {
			smp_mb(); /* quirk */
			clflush((void *)&current_thread_info()->flags);
			smp_mb(); /* quirk */
		}

		__monitor((void *)&current_thread_info()->flags, 0, 0);
		if (!need_resched())
			__sti_mwait(0, 0);
		else
			local_irq_enable();
		trace_cpu_idle_rcuidle(PWR_EVENT_EXIT, smp_processor_id());
	} else {
		local_irq_enable();
	}
	__current_clr_polling();
}
void select_idle_routine(const struct cpuinfo_x86 *c)
{
#ifdef CONFIG_SMP
	if (boot_option_idle_override == IDLE_POLL && smp_num_siblings > 1)
		pr_warn_once("WARNING: polling idle and HT enabled, performance may degrade\n");
#endif
	if (x86_idle || boot_option_idle_override == IDLE_POLL)
		return;

	if (cpu_has_bug(c, X86_BUG_AMD_APIC_C1E)) {
		/* E400: APIC timer interrupt does not wake up CPU from C1e */
		pr_info("using AMD E400 aware idle routine\n");
		x86_idle = amd_e400_idle;
	} else if (prefer_mwait_c1_over_halt(c)) {
		pr_info("using mwait in idle threads\n");
		x86_idle = mwait_idle;
	} else
		x86_idle = default_idle;
}

void __init init_amd_e400_c1e_mask(void)
{
	/* If we're using amd_e400_idle, we need to allocate amd_e400_c1e_mask. */
	if (x86_idle == amd_e400_idle)
		zalloc_cpumask_var(&amd_e400_c1e_mask, GFP_KERNEL);
}
static int __init idle_setup(char *str)
{
	if (!str)
		return -EINVAL;

	if (!strcmp(str, "poll")) {
		pr_info("using polling idle threads\n");
		boot_option_idle_override = IDLE_POLL;
		cpu_idle_poll_ctrl(true);
	} else if (!strcmp(str, "halt")) {
		/*
		 * When the boot option idle=halt is given, HLT is forced
		 * for CPU idle and the C2/C3 states won't be used again.
		 * To continue to load the CPU idle driver, don't touch
		 * boot_option_idle_override.
		 */
		x86_idle = default_idle;
		boot_option_idle_override = IDLE_HALT;
	} else if (!strcmp(str, "nomwait")) {
		/*
		 * When the boot option idle=nomwait is given, MWAIT is
		 * disabled for the CPU C2/C3 states. In that case it
		 * doesn't touch boot_option_idle_override.
		 */
		boot_option_idle_override = IDLE_NOMWAIT;
	} else
		return -1;

	return 0;
}
early_param("idle", idle_setup);
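
/*
 * Illustrative usage (not part of this file): the parser above handles the
 * "idle=" kernel command line option, e.g.:
 *
 *	idle=poll	busy-poll instead of halting (lowest wakeup latency)
 *	idle=halt	force HLT for C1; deeper C2/C3 states are not used
 *	idle=nomwait	do not use MWAIT for the C2/C3 states
 */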
unsigned long arch_align_stack(unsigned long sp)
{
	if (!(current->personality & ADDR_NO_RANDOMIZE) && randomize_va_space)
		sp -= get_random_int() % 8192;
	return sp & ~0xf;
}

unsigned long arch_randomize_brk(struct mm_struct *mm)
{
	unsigned long range_end = mm->brk + 0x02000000;
	return randomize_range(mm->brk, range_end, 0) ? : mm->brk;
}
/*
 * Called from fs/proc with a reference on @p to find the function
 * which called into schedule(). This needs to be done carefully
 * because the task might wake up and we might look at a stack
 * changing under us.
 */
unsigned long get_wchan(struct task_struct *p)
{
	unsigned long start, bottom, top, sp, fp, ip;
	int count = 0;

	if (!p || p == current || p->state == TASK_RUNNING)
		return 0;

	start = (unsigned long)task_stack_page(p);
	if (!start)
		return 0;

	/*
	 * Layout of the stack page:
	 *
	 * ----------- topmax = start + THREAD_SIZE - sizeof(unsigned long)
	 * PADDING
	 * ----------- top = topmax - TOP_OF_KERNEL_STACK_PADDING
	 * stack
	 * ----------- bottom = start + sizeof(thread_info)
	 * thread_info
	 * ----------- start
	 *
	 * The task's stack pointer points at the location where the
	 * frame pointer is stored. The data on the stack is:
	 * ... IP FP ... IP FP
	 *
	 * We need to read FP and IP, so we need to adjust the upper
	 * bound by another unsigned long.
	 */
	top = start + THREAD_SIZE - TOP_OF_KERNEL_STACK_PADDING;
	top -= 2 * sizeof(unsigned long);
	bottom = start + sizeof(struct thread_info);

	sp = READ_ONCE(p->thread.sp);
	if (sp < bottom || sp > top)
		return 0;

	fp = READ_ONCE_NOCHECK(*(unsigned long *)sp);
	do {
		if (fp < bottom || fp > top)
			return 0;
		ip = READ_ONCE_NOCHECK(*(unsigned long *)(fp + sizeof(unsigned long)));
		if (!in_sched_functions(ip))
			return ip;
		fp = READ_ONCE_NOCHECK(*(unsigned long *)fp);
	} while (count++ < 16 && p->state != TASK_RUNNING);
	return 0;
}