#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <linux/errno.h>
#include <linux/kernel.h>
#include <linux/mm.h>
#include <linux/smp.h>
#include <linux/prctl.h>
#include <linux/slab.h>
#include <linux/sched.h>
#include <linux/init.h>
#include <linux/export.h>
#include <linux/pm.h>
#include <linux/tick.h>
#include <linux/random.h>
#include <linux/user-return-notifier.h>
#include <linux/dmi.h>
#include <linux/utsname.h>
#include <linux/stackprotector.h>
#include <linux/cpuidle.h>
#include <trace/events/power.h>
#include <linux/hw_breakpoint.h>
#include <asm/cpu.h>
#include <asm/apic.h>
#include <asm/syscalls.h>
#include <asm/idle.h>
#include <asm/uaccess.h>
#include <asm/mwait.h>
#include <asm/fpu/internal.h>
#include <asm/debugreg.h>
#include <asm/nmi.h>
#include <asm/tlbflush.h>
#include <asm/mce.h>
#include <asm/vm86.h>
#include <asm/switch_to.h>
#include <asm/spec-ctrl.h>

/*
 * per-CPU TSS segments. Threads are completely 'soft' on Linux,
 * no more per-task TSS's. The TSS size is kept cacheline-aligned
 * so they are allowed to end up in the .data..cacheline_aligned
 * section. Since TSS's are completely CPU-local, we want them
 * on exact cacheline boundaries, to eliminate cacheline ping-pong.
 */
__visible DEFINE_PER_CPU_SHARED_ALIGNED_USER_MAPPED(struct tss_struct, cpu_tss) = {
	.x86_tss = {
		.sp0 = TOP_OF_INIT_STACK,
		.io_bitmap_base	= INVALID_IO_BITMAP_OFFSET,
	},
	/*
	 * Note that the .io_bitmap member must be extra-big. This is because
	 * the CPU will access an additional byte beyond the end of the IO
	 * permission bitmap. The extra byte must be all 1 bits, and must
	 * be within the limit.
	 */
	.io_bitmap		= { [0 ... IO_BITMAP_LONGS] = ~0 },
#ifdef CONFIG_X86_32
	.SYSENTER_stack_canary	= STACK_END_MAGIC,
#endif
};
EXPORT_PER_CPU_SYMBOL(cpu_tss);

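/*
 * Per-CPU idle bookkeeping: the is_idle flag and the atomic notifier
 * chain below let other code register callbacks for the IDLE_START and
 * IDLE_END transitions (see enter_idle()/exit_idle() further down).
 */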
static DEFINE_PER_CPU(unsigned char, is_idle);
static ATOMIC_NOTIFIER_HEAD(idle_notifier);

void idle_notifier_register(struct notifier_block *n)
{
	atomic_notifier_chain_register(&idle_notifier, n);
}
EXPORT_SYMBOL_GPL(idle_notifier_register);

void idle_notifier_unregister(struct notifier_block *n)
{
	atomic_notifier_chain_unregister(&idle_notifier, n);
}
EXPORT_SYMBOL_GPL(idle_notifier_unregister);

/*
 * This gets called so that we can store lazy state into memory and copy the
 * current task into the new thread.
 */
int arch_dup_task_struct(struct task_struct *dst, struct task_struct *src)
{
	memcpy(dst, src, arch_task_struct_size);
#ifdef CONFIG_VM86
	dst->thread.vm86 = NULL;
#endif
	return fpu__copy(&dst->thread.fpu, &src->thread.fpu);
}

/*
 * Free current thread data structures etc.
 */
void exit_thread(struct task_struct *tsk)
{
	struct thread_struct *t = &tsk->thread;
	unsigned long *bp = t->io_bitmap_ptr;
	struct fpu *fpu = &t->fpu;

	if (bp) {
		struct tss_struct *tss = &per_cpu(cpu_tss, get_cpu());

		t->io_bitmap_ptr = NULL;
		clear_thread_flag(TIF_IO_BITMAP);
		/* Careful, clear this in the TSS too: */
		memset(tss->io_bitmap, 0xff, t->io_bitmap_max);
		t->io_bitmap_max = 0;
		put_cpu();
		kfree(bp);
	}

	free_vm86(t);
	fpu__drop(fpu);
}

void flush_thread(void)
{
	struct task_struct *tsk = current;

	flush_ptrace_hw_breakpoint(tsk);
	memset(tsk->thread.tls_array, 0, sizeof(tsk->thread.tls_array));

	fpu__clear(&tsk->thread.fpu);
}

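/*
 * TSC access control for prctl(PR_SET_TSC): setting CR4.TSD makes a
 * user-space RDTSC fault, which is then reported to the task as SIGSEGV.
 */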
void disable_TSC(void)
{
	preempt_disable();
	if (!test_and_set_thread_flag(TIF_NOTSC))
		/*
		 * Must flip the CPU state synchronously with
		 * TIF_NOTSC in the current running context.
		 */
		cr4_set_bits(X86_CR4_TSD);
	preempt_enable();
}

static void enable_TSC(void)
{
	preempt_disable();
	if (test_and_clear_thread_flag(TIF_NOTSC))
		/*
		 * Must flip the CPU state synchronously with
		 * TIF_NOTSC in the current running context.
		 */
		cr4_clear_bits(X86_CR4_TSD);
	preempt_enable();
}

int get_tsc_mode(unsigned long adr)
{
	unsigned int val;

	if (test_thread_flag(TIF_NOTSC))
		val = PR_TSC_SIGSEGV;
	else
		val = PR_TSC_ENABLE;

	return put_user(val, (unsigned int __user *)adr);
}

int set_tsc_mode(unsigned int val)
{
	if (val == PR_TSC_SIGSEGV)
		disable_TSC();
	else if (val == PR_TSC_ENABLE)
		enable_TSC();
	else
		return -EINVAL;

	return 0;
}

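/*
 * Keep the per-CPU TSS I/O permission bitmap in sync with the incoming
 * task on context switch; when the next task has no I/O bitmap, the range
 * used by the previous task is invalidated (all bits set).
 */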
static inline void switch_to_bitmap(struct thread_struct *prev,
				    struct thread_struct *next,
				    unsigned long tifp, unsigned long tifn)
{
	struct tss_struct *tss = this_cpu_ptr(&cpu_tss);

	if (tifn & _TIF_IO_BITMAP) {
		/*
		 * Copy the relevant range of the IO bitmap.
		 * Normally this is 128 bytes or less:
		 */
		memcpy(tss->io_bitmap, next->io_bitmap_ptr,
		       max(prev->io_bitmap_max, next->io_bitmap_max));
	} else if (tifp & _TIF_IO_BITMAP) {
		/*
		 * Clear any possible leftover bits:
		 */
		memset(tss->io_bitmap, 0xff, prev->io_bitmap_max);
	}
}

#ifdef CONFIG_SMP
struct ssb_state {
	struct ssb_state	*shared_state;
	raw_spinlock_t		lock;
	unsigned int		disable_state;
	unsigned long		local_state;
};

#define LSTATE_SSB	0

static DEFINE_PER_CPU(struct ssb_state, ssb_state);

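/*
 * Called during CPU bringup to link the SSB state of all HT siblings of
 * a core to one shared structure, so SSBD can be managed per core.
 */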
void speculative_store_bypass_ht_init(void)
{
	struct ssb_state *st = this_cpu_ptr(&ssb_state);
	unsigned int this_cpu = smp_processor_id();
	unsigned int cpu;

	st->local_state = 0;

	/*
	 * Shared state setup happens once on the first bringup
	 * of the CPU. It's not destroyed on CPU hotunplug.
	 */
	if (st->shared_state)
		return;

	raw_spin_lock_init(&st->lock);

	/*
	 * Go over HT siblings and check whether one of them has set up the
	 * shared state pointer already.
	 */
	for_each_cpu(cpu, topology_sibling_cpumask(this_cpu)) {
		if (cpu == this_cpu)
			continue;

		if (!per_cpu(ssb_state, cpu).shared_state)
			continue;

		/* Link it to the state of the sibling: */
		st->shared_state = per_cpu(ssb_state, cpu).shared_state;
		return;
	}

	/*
	 * First HT sibling to come up on the core. Link shared state of
	 * the first HT sibling to itself. The siblings on the same core
	 * which come up later will see the shared state pointer and link
	 * themselves to the state of this CPU.
	 */
	st->shared_state = st;
}

/*
 * Logic is: the first HT sibling to enable SSBD enables it for both siblings
 * in the core, and the last sibling to disable it disables it for the whole
 * core. This is how MSR_SPEC_CTRL works in "hardware":
 *
 *  CORE_SPEC_CTRL = THREAD0_SPEC_CTRL | THREAD1_SPEC_CTRL
 */
static __always_inline void amd_set_core_ssb_state(unsigned long tifn)
{
	struct ssb_state *st = this_cpu_ptr(&ssb_state);
	u64 msr = x86_amd_ls_cfg_base;

	if (!static_cpu_has(X86_FEATURE_ZEN)) {
		msr |= ssbd_tif_to_amd_ls_cfg(tifn);
		wrmsrl(MSR_AMD64_LS_CFG, msr);
		return;
	}

	if (tifn & _TIF_SSBD) {
		/*
		 * Since this can race with prctl(), block reentry on the
		 * same CPU.
		 */
		if (__test_and_set_bit(LSTATE_SSB, &st->local_state))
			return;

		msr |= x86_amd_ls_cfg_ssbd_mask;

		raw_spin_lock(&st->shared_state->lock);
		/* First sibling enables SSBD: */
		if (!st->shared_state->disable_state)
			wrmsrl(MSR_AMD64_LS_CFG, msr);
		st->shared_state->disable_state++;
		raw_spin_unlock(&st->shared_state->lock);
	} else {
		if (!__test_and_clear_bit(LSTATE_SSB, &st->local_state))
			return;

		raw_spin_lock(&st->shared_state->lock);
		st->shared_state->disable_state--;
		if (!st->shared_state->disable_state)
			wrmsrl(MSR_AMD64_LS_CFG, msr);
		raw_spin_unlock(&st->shared_state->lock);
	}
}
#else
static __always_inline void amd_set_core_ssb_state(unsigned long tifn)
{
	u64 msr = x86_amd_ls_cfg_base | ssbd_tif_to_amd_ls_cfg(tifn);

	wrmsrl(MSR_AMD64_LS_CFG, msr);
}
#endif

static __always_inline void amd_set_ssb_virt_state(unsigned long tifn)
{
	/*
	 * SSBD has the same definition in SPEC_CTRL and VIRT_SPEC_CTRL,
	 * so ssbd_tif_to_spec_ctrl() just works.
	 */
	wrmsrl(MSR_AMD64_VIRT_SPEC_CTRL, ssbd_tif_to_spec_ctrl(tifn));
}

/*
 * Update the MSRs managing speculation control during context switch.
 *
 * tifp: Previous task's thread flags
 * tifn: Next task's thread flags
 */
static __always_inline void __speculation_ctrl_update(unsigned long tifp,
						       unsigned long tifn)
{
	unsigned long tif_diff = tifp ^ tifn;
	u64 msr = x86_spec_ctrl_base;
	bool updmsr = false;

	/* Handle change of TIF_SSBD depending on the mitigation method. */
	if (static_cpu_has(X86_FEATURE_VIRT_SSBD)) {
		if (tif_diff & _TIF_SSBD)
			amd_set_ssb_virt_state(tifn);
	} else if (static_cpu_has(X86_FEATURE_LS_CFG_SSBD)) {
		if (tif_diff & _TIF_SSBD)
			amd_set_core_ssb_state(tifn);
	} else if (static_cpu_has(X86_FEATURE_SPEC_CTRL_SSBD) ||
		   static_cpu_has(X86_FEATURE_AMD_SSBD)) {
		updmsr |= !!(tif_diff & _TIF_SSBD);
		msr |= ssbd_tif_to_spec_ctrl(tifn);
	}

	/* Only evaluate TIF_SPEC_IB if conditional STIBP is enabled. */
	if (IS_ENABLED(CONFIG_SMP) &&
	    static_branch_unlikely(&switch_to_cond_stibp)) {
		updmsr |= !!(tif_diff & _TIF_SPEC_IB);
		msr |= stibp_tif_to_spec_ctrl(tifn);
	}

	if (updmsr)
		wrmsrl(MSR_IA32_SPEC_CTRL, msr);
}

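/*
 * Recompute TIF_SSBD and TIF_SPEC_IB from the task's prctl/seccomp
 * controlled state when a forced update was requested.
 */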
static unsigned long speculation_ctrl_update_tif(struct task_struct *tsk)
{
	if (test_and_clear_tsk_thread_flag(tsk, TIF_SPEC_FORCE_UPDATE)) {
		if (task_spec_ssb_disable(tsk))
			set_tsk_thread_flag(tsk, TIF_SSBD);
		else
			clear_tsk_thread_flag(tsk, TIF_SSBD);

		if (task_spec_ib_disable(tsk))
			set_tsk_thread_flag(tsk, TIF_SPEC_IB);
		else
			clear_tsk_thread_flag(tsk, TIF_SPEC_IB);
	}
	/* Return the updated threadinfo flags */
	return task_thread_info(tsk)->flags;
}

void speculation_ctrl_update(unsigned long tif)
{
	/* Forced update. Make sure all relevant TIF flags are different. */
	preempt_disable();
	__speculation_ctrl_update(~tif, tif);
	preempt_enable();
}

/* Called from seccomp/prctl update */
void speculation_ctrl_update_current(void)
{
	preempt_disable();
	speculation_ctrl_update(speculation_ctrl_update_tif(current));
	preempt_enable();
}

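/*
 * Slow path of the context switch: handle the I/O bitmap, BTF block-step,
 * TSC disable and speculation control updates for the incoming task.
 */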
void __switch_to_xtra(struct task_struct *prev_p, struct task_struct *next_p)
{
	struct thread_struct *prev, *next;
	unsigned long tifp, tifn;

	prev = &prev_p->thread;
	next = &next_p->thread;

	tifn = READ_ONCE(task_thread_info(next_p)->flags);
	tifp = READ_ONCE(task_thread_info(prev_p)->flags);
	switch_to_bitmap(prev, next, tifp, tifn);

	propagate_user_return_notify(prev_p, next_p);

	if ((tifp & _TIF_BLOCKSTEP || tifn & _TIF_BLOCKSTEP) &&
	    arch_has_block_step()) {
		unsigned long debugctl, msk;

		rdmsrl(MSR_IA32_DEBUGCTLMSR, debugctl);
		debugctl &= ~DEBUGCTLMSR_BTF;
		msk = tifn & _TIF_BLOCKSTEP;
		debugctl |= (msk >> TIF_BLOCKSTEP) << DEBUGCTLMSR_BTF_SHIFT;
		wrmsrl(MSR_IA32_DEBUGCTLMSR, debugctl);
	}

	if ((tifp ^ tifn) & _TIF_NOTSC)
		cr4_toggle_bits(X86_CR4_TSD);

	if (likely(!((tifp | tifn) & _TIF_SPEC_FORCE_UPDATE))) {
		__speculation_ctrl_update(tifp, tifn);
	} else {
		speculation_ctrl_update_tif(prev_p);
		tifn = speculation_ctrl_update_tif(next_p);

		/* Enforce MSR update to ensure consistent state */
		__speculation_ctrl_update(~tifn, tifn);
	}
}

/*
 * Idle related variables and functions
 */
unsigned long boot_option_idle_override = IDLE_NO_OVERRIDE;
EXPORT_SYMBOL(boot_option_idle_override);

static void (*x86_idle)(void);

#ifndef CONFIG_SMP
static inline void play_dead(void)
{
	BUG();
}
#endif

void enter_idle(void)
{
	this_cpu_write(is_idle, 1);
	atomic_notifier_call_chain(&idle_notifier, IDLE_START, NULL);
}

static void __exit_idle(void)
{
	if (x86_test_and_clear_bit_percpu(0, is_idle) == 0)
		return;
	atomic_notifier_call_chain(&idle_notifier, IDLE_END, NULL);
}

/* Called from interrupts to signify idle end */
void exit_idle(void)
{
	/* idle loop has pid 0 */
	if (current->pid)
		return;
	__exit_idle();
}

void arch_cpu_idle_enter(void)
{
	local_touch_nmi();
	enter_idle();
}

void arch_cpu_idle_exit(void)
{
	__exit_idle();
}

void arch_cpu_idle_dead(void)
{
	play_dead();
}

/*
 * Called from the generic idle code.
 */
void arch_cpu_idle(void)
{
	x86_idle();
}

/*
 * We use this if we don't have any better idle routine.
 */
void __cpuidle default_idle(void)
{
	trace_cpu_idle_rcuidle(1, smp_processor_id());
	safe_halt();
	trace_cpu_idle_rcuidle(PWR_EVENT_EXIT, smp_processor_id());
}
#ifdef CONFIG_APM_MODULE
EXPORT_SYMBOL(default_idle);
#endif

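/*
 * Let Xen force default_idle as the idle routine; returns true if an idle
 * routine had already been installed.
 */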
bool xen_set_default_idle(void)
{
	bool ret = !!x86_idle;

	x86_idle = default_idle;

	return ret;
}

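/*
 * Park this CPU: mark it offline, disable its local APIC, clear machine
 * check state and then halt forever. Used on shutdown/reboot paths.
 */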
void stop_this_cpu(void *dummy)
{
	local_irq_disable();
	/*
	 * Remove this CPU:
	 */
	set_cpu_online(smp_processor_id(), false);
	disable_local_APIC();
	mcheck_cpu_clear(this_cpu_ptr(&cpu_info));

	for (;;)
		halt();
}

bool amd_e400_c1e_detected;
EXPORT_SYMBOL(amd_e400_c1e_detected);

static cpumask_var_t amd_e400_c1e_mask;

void amd_e400_remove_cpu(int cpu)
{
	if (amd_e400_c1e_mask != NULL)
		cpumask_clear_cpu(cpu, amd_e400_c1e_mask);
}

/*
 * AMD Erratum 400 aware idle routine. We check for C1E active in the interrupt
 * pending message MSR. If we detect C1E, then we handle it the same
 * way as C3 power states (local apic timer and TSC stop).
 */
static void amd_e400_idle(void)
{
	if (!amd_e400_c1e_detected) {
		u32 lo, hi;

		rdmsr(MSR_K8_INT_PENDING_MSG, lo, hi);

		if (lo & K8_INTP_C1E_ACTIVE_MASK) {
			amd_e400_c1e_detected = true;
			if (!boot_cpu_has(X86_FEATURE_NONSTOP_TSC))
				mark_tsc_unstable("TSC halt in AMD C1E");
			pr_info("System has AMD C1E enabled\n");
		}
	}

	if (amd_e400_c1e_detected) {
		int cpu = smp_processor_id();

		if (!cpumask_test_cpu(cpu, amd_e400_c1e_mask)) {
			cpumask_set_cpu(cpu, amd_e400_c1e_mask);
			/* Force broadcast so ACPI can not interfere. */
			tick_broadcast_force();
			pr_info("Switch to broadcast mode on CPU%d\n", cpu);
		}
		tick_broadcast_enter();

		default_idle();

		/*
		 * The switch back from broadcast mode needs to be
		 * called with interrupts disabled.
		 */
		local_irq_disable();
		tick_broadcast_exit();
		local_irq_enable();
	} else
		default_idle();
}

/*
 * Intel Core2 and older machines prefer MWAIT over HALT for C1.
 * We can't rely on cpuidle installing MWAIT, because it will not load
 * on systems that support only C1 -- so the boot default must be MWAIT.
 *
 * Some AMD machines are the opposite; they depend on using HALT.
 *
 * So for default C1, which is used during boot until cpuidle loads,
 * use MWAIT-C1 on Intel HW that has it, else use HALT.
 */
static int prefer_mwait_c1_over_halt(const struct cpuinfo_x86 *c)
{
	if (c->x86_vendor != X86_VENDOR_INTEL)
		return 0;

	if (!cpu_has(c, X86_FEATURE_MWAIT) || static_cpu_has_bug(X86_BUG_MONITOR))
		return 0;

	return 1;
}

/*
 * MONITOR/MWAIT with no hints, used for default C1 state. This invokes MWAIT
 * with interrupts enabled and no flags, which is backwards compatible with the
 * original MWAIT implementation.
 */
static __cpuidle void mwait_idle(void)
{
	if (!current_set_polling_and_test()) {
		trace_cpu_idle_rcuidle(1, smp_processor_id());
		if (this_cpu_has(X86_BUG_CLFLUSH_MONITOR)) {
			mb(); /* quirk */
			clflush((void *)&current_thread_info()->flags);
			mb(); /* quirk */
		}

		__monitor((void *)&current_thread_info()->flags, 0, 0);
		if (!need_resched())
			__sti_mwait(0, 0);
		else
			local_irq_enable();
		trace_cpu_idle_rcuidle(PWR_EVENT_EXIT, smp_processor_id());
	} else {
		local_irq_enable();
	}
	__current_clr_polling();
}

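/*
 * Pick the boot-time default idle routine: the AMD E400 aware routine,
 * MWAIT-C1 on capable Intel CPUs, or plain HLT via default_idle().
 */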
void select_idle_routine(const struct cpuinfo_x86 *c)
{
#ifdef CONFIG_SMP
	if (boot_option_idle_override == IDLE_POLL && smp_num_siblings > 1)
		pr_warn_once("WARNING: polling idle and HT enabled, performance may degrade\n");
#endif
	if (x86_idle || boot_option_idle_override == IDLE_POLL)
		return;

	if (boot_cpu_has_bug(X86_BUG_AMD_E400)) {
		pr_info("using AMD E400 aware idle routine\n");
		x86_idle = amd_e400_idle;
	} else if (prefer_mwait_c1_over_halt(c)) {
		pr_info("using mwait in idle threads\n");
		x86_idle = mwait_idle;
	} else
		x86_idle = default_idle;
}

void __init init_amd_e400_c1e_mask(void)
{
	/* If we're using amd_e400_idle, we need to allocate amd_e400_c1e_mask. */
	if (x86_idle == amd_e400_idle)
		zalloc_cpumask_var(&amd_e400_c1e_mask, GFP_KERNEL);
}

static int __init idle_setup(char *str)
{
	if (!str)
		return -EINVAL;

	if (!strcmp(str, "poll")) {
		pr_info("using polling idle threads\n");
		boot_option_idle_override = IDLE_POLL;
		cpu_idle_poll_ctrl(true);
	} else if (!strcmp(str, "halt")) {
		/*
		 * When the boot option of idle=halt is added, halt is
		 * forced to be used for CPU idle. In such case CPU C2/C3
		 * won't be used again.
		 * To continue to load the CPU idle driver, don't touch
		 * the boot_option_idle_override.
		 */
		x86_idle = default_idle;
		boot_option_idle_override = IDLE_HALT;
	} else if (!strcmp(str, "nomwait")) {
		/*
		 * If the boot option of "idle=nomwait" is added,
		 * it means that mwait will be disabled for CPU C2/C3
		 * states. In such case it won't touch the variable
		 * of boot_option_idle_override.
		 */
		boot_option_idle_override = IDLE_NOMWAIT;
	} else
		return -1;

	return 0;
}
early_param("idle", idle_setup);

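/*
 * Add a small random offset (0-8 KiB, 16-byte aligned) below the stack
 * top unless the task or the system has disabled address randomization.
 */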
unsigned long arch_align_stack(unsigned long sp)
{
	if (!(current->personality & ADDR_NO_RANDOMIZE) && randomize_va_space)
		sp -= get_random_int() % 8192;
	return sp & ~0xf;
}

unsigned long arch_randomize_brk(struct mm_struct *mm)
{
	return randomize_page(mm->brk, 0x02000000);
}

/*
 * Return the saved PC of a blocked thread.
 * What is this good for? It will always be the scheduler or ret_from_fork.
 */
unsigned long thread_saved_pc(struct task_struct *tsk)
{
	struct inactive_task_frame *frame =
		(struct inactive_task_frame *) READ_ONCE(tsk->thread.sp);
	return READ_ONCE_NOCHECK(frame->ret_addr);
}

/*
 * Called from fs/proc with a reference on @p to find the function
 * which called into schedule(). This needs to be done carefully
 * because the task might wake up and we might look at a stack
 * changing under us.
 */
unsigned long get_wchan(struct task_struct *p)
{
	unsigned long start, bottom, top, sp, fp, ip, ret = 0;
	int count = 0;

	if (!p || p == current || p->state == TASK_RUNNING)
		return 0;

	if (!try_get_task_stack(p))
		return 0;

	start = (unsigned long)task_stack_page(p);
	if (!start)
		goto out;

	/*
	 * Layout of the stack page:
	 *
	 * ----------- topmax = start + THREAD_SIZE - sizeof(unsigned long)
	 * PADDING
	 * ----------- top = topmax - TOP_OF_KERNEL_STACK_PADDING
	 * stack
	 * ----------- bottom = start
	 *
	 * The task's stack pointer points at the location where the
	 * frame pointer is stored. The data on the stack is:
	 * ... IP FP ... IP FP
	 *
	 * We need to read FP and IP, so we need to adjust the upper
	 * bound by another unsigned long.
	 */
	top = start + THREAD_SIZE - TOP_OF_KERNEL_STACK_PADDING;
	top -= 2 * sizeof(unsigned long);
	bottom = start;

	sp = READ_ONCE(p->thread.sp);
	if (sp < bottom || sp > top)
		goto out;

	fp = READ_ONCE_NOCHECK(((struct inactive_task_frame *)sp)->bp);
	do {
		if (fp < bottom || fp > top)
			goto out;
		ip = READ_ONCE_NOCHECK(*(unsigned long *)(fp + sizeof(unsigned long)));
		if (!in_sched_functions(ip)) {
			ret = ip;
			goto out;
		}
		fp = READ_ONCE_NOCHECK(*(unsigned long *)fp);
	} while (count++ < 16 && p->state != TASK_RUNNING);