arch/x86/kernel/cpu/mce/core.c

   1 // SPDX-License-Identifier: GPL-2.0-only
   2 /*
   3  * Machine check handler.
   4  *
   5  * K8 parts Copyright 2002,2003 Andi Kleen, SuSE Labs.
   6  * Rest from unknown author(s).
   7  * 2004 Andi Kleen. Rewrote most of it.
   8  * Copyright 2008 Intel Corporation
   9  * Author: Andi Kleen
  10  */
  11
  12 #include <linux/thread_info.h>
  13 #include <linux/capability.h>
  14 #include <linux/miscdevice.h>
  15 #include <linux/ratelimit.h>
  16 #include <linux/rcupdate.h>
  17 #include <linux/kobject.h>
  18 #include <linux/uaccess.h>
  19 #include <linux/kdebug.h>
  20 #include <linux/kernel.h>
  21 #include <linux/percpu.h>
  22 #include <linux/string.h>
  23 #include <linux/device.h>
  24 #include <linux/syscore_ops.h>
  25 #include <linux/delay.h>
  26 #include <linux/ctype.h>
  27 #include <linux/sched.h>
  28 #include <linux/sysfs.h>
  29 #include <linux/types.h>
  30 #include <linux/slab.h>
  31 #include <linux/init.h>
  32 #include <linux/kmod.h>
  33 #include <linux/poll.h>
  34 #include <linux/nmi.h>
  35 #include <linux/cpu.h>
  36 #include <linux/ras.h>
  37 #include <linux/smp.h>
  38 #include <linux/fs.h>
  39 #include <linux/mm.h>
  40 #include <linux/debugfs.h>
  41 #include <linux/irq_work.h>
  42 #include <linux/export.h>
  43 #include <linux/set_memory.h>
  44 #include <linux/sync_core.h>
  45 #include <linux/task_work.h>
  46 #include <linux/hardirq.h>
  47
  48 #include <asm/intel-family.h>
  49 #include <asm/processor.h>
  50 #include <asm/traps.h>
  51 #include <asm/tlbflush.h>
  52 #include <asm/mce.h>
  53 #include <asm/msr.h>
  54 #include <asm/reboot.h>
  55
  56 #include "internal.h"
  57
  58 /* sysfs synchronization */
  59 static DEFINE_MUTEX(mce_sysfs_mutex);
  60
  61 #define CREATE_TRACE_POINTS
  62 #include <trace/events/mce.h>
  63
  64 #define SPINUNIT                100     /* 100ns */
  65
  66 DEFINE_PER_CPU(unsigned, mce_exception_count);
  67
  68 DEFINE_PER_CPU_READ_MOSTLY(unsigned int, mce_num_banks);
  69
  70 struct mce_bank {
  71         u64                     ctl;                    /* subevents to enable */
  72
  73         __u64 init                      : 1,            /* initialise bank? */
  74               __reserved_1              : 63;
  75 };
  76 static DEFINE_PER_CPU_READ_MOSTLY(struct mce_bank[MAX_NR_BANKS], mce_banks_array);
  77
  78 #define ATTR_LEN               16
  79 /* One object for each MCE bank, shared by all CPUs */
  80 struct mce_bank_dev {
  81         struct device_attribute attr;                   /* device attribute */
  82         char                    attrname[ATTR_LEN];     /* attribute name */
  83         u8                      bank;                   /* bank number */
  84 };
  85 static struct mce_bank_dev mce_bank_devs[MAX_NR_BANKS];
  86
  87 struct mce_vendor_flags mce_flags __read_mostly;
  88
  89 struct mca_config mca_cfg __read_mostly = {
  90         .bootlog  = -1,
  91         .monarch_timeout = -1
  92 };
  93
  94 static DEFINE_PER_CPU(struct mce, mces_seen);
  95 static unsigned long mce_need_notify;
  96
  97 /*
   98  * MCA banks polled by the periodic polling timer for corrected events.
  99  * With Intel CMCI, this only has MCA banks which do not support CMCI (if any).
 100  */
 101 DEFINE_PER_CPU(mce_banks_t, mce_poll_banks) = {
 102         [0 ... BITS_TO_LONGS(MAX_NR_BANKS)-1] = ~0UL
 103 };
 104
 105 /*
 106  * MCA banks controlled through firmware first for corrected errors.
 107  * This is a global list of banks for which we won't enable CMCI and we
 108  * won't poll. Firmware controls these banks and is responsible for
 109  * reporting corrected errors through GHES. Uncorrected/recoverable
 110  * errors are still notified through a machine check.
 111  */
 112 mce_banks_t mce_banks_ce_disabled;
 113
 114 static struct work_struct mce_work;
 115 static struct irq_work mce_irq_work;
 116
 117 /*
 118  * CPU/chipset specific EDAC code can register a notifier call here to print
 119  * MCE errors in a human-readable form.
 120  */
 121 BLOCKING_NOTIFIER_HEAD(x86_mce_decoder_chain);
 122
 123 /* Do initial initialization of a struct mce */
 124 void mce_setup(struct mce *m)
 125 {
 126         memset(m, 0, sizeof(struct mce));
 127         m->cpu = m->extcpu = smp_processor_id();
 128         /* need the internal __ version to avoid deadlocks */
 129         m->time = __ktime_get_real_seconds();
 130         m->cpuvendor = boot_cpu_data.x86_vendor;
 131         m->cpuid = cpuid_eax(1);
 132         m->socketid = cpu_data(m->extcpu).phys_proc_id;
 133         m->apicid = cpu_data(m->extcpu).initial_apicid;
 134         m->mcgcap = __rdmsr(MSR_IA32_MCG_CAP);
 135         m->ppin = cpu_data(m->extcpu).ppin;
 136         m->microcode = boot_cpu_data.microcode;
 137 }
 138
 139 DEFINE_PER_CPU(struct mce, injectm);
 140 EXPORT_PER_CPU_SYMBOL_GPL(injectm);
 141
 142 void mce_log(struct mce *m)
 143 {
 144         if (!mce_gen_pool_add(m))
 145                 irq_work_queue(&mce_irq_work);
 146 }
 147 EXPORT_SYMBOL_GPL(mce_log);
 148
 149 void mce_register_decode_chain(struct notifier_block *nb)
 150 {
 151         if (WARN_ON(nb->priority < MCE_PRIO_LOWEST ||
 152                     nb->priority > MCE_PRIO_HIGHEST))
 153                 return;
 154
 155         blocking_notifier_chain_register(&x86_mce_decoder_chain, nb);
 156 }
 157 EXPORT_SYMBOL_GPL(mce_register_decode_chain);
 158
 159 void mce_unregister_decode_chain(struct notifier_block *nb)
 160 {
 161         blocking_notifier_chain_unregister(&x86_mce_decoder_chain, nb);
 162 }
 163 EXPORT_SYMBOL_GPL(mce_unregister_decode_chain);
 164
 165 static void __print_mce(struct mce *m)
 166 {
 167         pr_emerg(HW_ERR "CPU %d: Machine Check%s: %Lx Bank %d: %016Lx\n",
 168                  m->extcpu,
 169                  (m->mcgstatus & MCG_STATUS_MCIP ? " Exception" : ""),
 170                  m->mcgstatus, m->bank, m->status);
 171
 172         if (m->ip) {
 173                 pr_emerg(HW_ERR "RIP%s %02x:<%016Lx> ",
 174                         !(m->mcgstatus & MCG_STATUS_EIPV) ? " !INEXACT!" : "",
 175                         m->cs, m->ip);
 176
 177                 if (m->cs == __KERNEL_CS)
 178                         pr_cont("{%pS}", (void *)(unsigned long)m->ip);
 179                 pr_cont("\n");
 180         }
 181
 182         pr_emerg(HW_ERR "TSC %llx ", m->tsc);
 183         if (m->addr)
 184                 pr_cont("ADDR %llx ", m->addr);
 185         if (m->misc)
 186                 pr_cont("MISC %llx ", m->misc);
 187         if (m->ppin)
 188                 pr_cont("PPIN %llx ", m->ppin);
 189
 190         if (mce_flags.smca) {
 191                 if (m->synd)
 192                         pr_cont("SYND %llx ", m->synd);
 193                 if (m->ipid)
 194                         pr_cont("IPID %llx ", m->ipid);
 195         }
 196
 197         pr_cont("\n");
 198
 199         /*
 200          * Note this output is parsed by external tools and old fields
 201          * should not be changed.
 202          */
 203         pr_emerg(HW_ERR "PROCESSOR %u:%x TIME %llu SOCKET %u APIC %x microcode %x\n",
 204                 m->cpuvendor, m->cpuid, m->time, m->socketid, m->apicid,
 205                 m->microcode);
 206 }
 207
 208 static void print_mce(struct mce *m)
 209 {
 210         __print_mce(m);
 211
 212         if (m->cpuvendor != X86_VENDOR_AMD && m->cpuvendor != X86_VENDOR_HYGON)
 213                 pr_emerg_ratelimited(HW_ERR "Run the above through 'mcelog --ascii'\n");
 214 }
 215
 216 #define PANIC_TIMEOUT 5 /* 5 seconds */
 217
 218 static atomic_t mce_panicked;
 219
 220 static int fake_panic;
 221 static atomic_t mce_fake_panicked;
 222
 223 /* Panic in progress. Enable interrupts and wait for final IPI */
 224 static void wait_for_panic(void)
 225 {
 226         long timeout = PANIC_TIMEOUT*USEC_PER_SEC;
 227
 228         preempt_disable();
 229         local_irq_enable();
 230         while (timeout-- > 0)
 231                 udelay(1);
 232         if (panic_timeout == 0)
 233                 panic_timeout = mca_cfg.panic_timeout;
 234         panic("Panicing machine check CPU died");
 235 }
 236
 237 static noinstr void mce_panic(const char *msg, struct mce *final, char *exp)
 238 {
 239         struct llist_node *pending;
 240         struct mce_evt_llist *l;
 241         int apei_err = 0;
 242
 243         /*
 244          * Allow instrumentation around external facilities usage. Not that it
 245          * matters a whole lot since the machine is going to panic anyway.
 246          */
 247         instrumentation_begin();
 248
 249         if (!fake_panic) {
 250                 /*
 251                  * Make sure only one CPU runs in machine check panic
 252                  */
 253                 if (atomic_inc_return(&mce_panicked) > 1)
 254                         wait_for_panic();
 255                 barrier();
 256
 257                 bust_spinlocks(1);
 258                 console_verbose();
 259         } else {
 260                 /* Don't log too much for fake panic */
 261                 if (atomic_inc_return(&mce_fake_panicked) > 1)
 262                         goto out;
 263         }
 264         pending = mce_gen_pool_prepare_records();
 265         /* First print corrected ones that are still unlogged */
 266         llist_for_each_entry(l, pending, llnode) {
 267                 struct mce *m = &l->mce;
 268                 if (!(m->status & MCI_STATUS_UC)) {
 269                         print_mce(m);
 270                         if (!apei_err)
 271                                 apei_err = apei_write_mce(m);
 272                 }
 273         }
 274         /* Now print uncorrected but with the final one last */
 275         llist_for_each_entry(l, pending, llnode) {
 276                 struct mce *m = &l->mce;
 277                 if (!(m->status & MCI_STATUS_UC))
 278                         continue;
 279                 if (!final || mce_cmp(m, final)) {
 280                         print_mce(m);
 281                         if (!apei_err)
 282                                 apei_err = apei_write_mce(m);
 283                 }
 284         }
 285         if (final) {
 286                 print_mce(final);
 287                 if (!apei_err)
 288                         apei_err = apei_write_mce(final);
 289         }
 290         if (exp)
 291                 pr_emerg(HW_ERR "Machine check: %s\n", exp);
 292         if (!fake_panic) {
 293                 if (panic_timeout == 0)
 294                         panic_timeout = mca_cfg.panic_timeout;
 295                 panic(msg);
 296         } else
 297                 pr_emerg(HW_ERR "Fake kernel panic: %s\n", msg);
 298
 299 out:
 300         instrumentation_end();
 301 }
 302
 303 /* Support code for software error injection */
 304
 305 static int msr_to_offset(u32 msr)
 306 {
 307         unsigned bank = __this_cpu_read(injectm.bank);
 308
 309         if (msr == mca_cfg.rip_msr)
 310                 return offsetof(struct mce, ip);
 311         if (msr == mca_msr_reg(bank, MCA_STATUS))
 312                 return offsetof(struct mce, status);
 313         if (msr == mca_msr_reg(bank, MCA_ADDR))
 314                 return offsetof(struct mce, addr);
 315         if (msr == mca_msr_reg(bank, MCA_MISC))
 316                 return offsetof(struct mce, misc);
 317         if (msr == MSR_IA32_MCG_STATUS)
 318                 return offsetof(struct mce, mcgstatus);
 319         return -1;
 320 }
 321
 322 void ex_handler_msr_mce(struct pt_regs *regs, bool wrmsr)
 323 {
 324         if (wrmsr) {
 325                 pr_emerg("MSR access error: WRMSR to 0x%x (tried to write 0x%08x%08x) at rIP: 0x%lx (%pS)\n",
 326                          (unsigned int)regs->cx, (unsigned int)regs->dx, (unsigned int)regs->ax,
 327                          regs->ip, (void *)regs->ip);
 328         } else {
 329                 pr_emerg("MSR access error: RDMSR from 0x%x at rIP: 0x%lx (%pS)\n",
 330                          (unsigned int)regs->cx, regs->ip, (void *)regs->ip);
 331         }
 332
 333         show_stack_regs(regs);
 334
 335         panic("MCA architectural violation!\n");
 336
 337         while (true)
 338                 cpu_relax();
 339 }
 340
 341 /* MSR access wrappers used for error injection */
 342 noinstr u64 mce_rdmsrl(u32 msr)
 343 {
 344         DECLARE_ARGS(val, low, high);
 345
 346         if (__this_cpu_read(injectm.finished)) {
 347                 int offset;
 348                 u64 ret;
 349
 350                 instrumentation_begin();
 351
 352                 offset = msr_to_offset(msr);
 353                 if (offset < 0)
 354                         ret = 0;
 355                 else
 356                         ret = *(u64 *)((char *)this_cpu_ptr(&injectm) + offset);
 357
 358                 instrumentation_end();
 359
 360                 return ret;
 361         }
 362
 363         /*
 364          * RDMSR on MCA MSRs should not fault. If they do, this is very much an
 365          * architectural violation and needs to be reported to hw vendor. Panic
 366          * the box to not allow any further progress.
 367          */
 368         asm volatile("1: rdmsr\n"
 369                      "2:\n"
 370                      _ASM_EXTABLE_TYPE(1b, 2b, EX_TYPE_RDMSR_IN_MCE)
 371                      : EAX_EDX_RET(val, low, high) : "c" (msr));
 372
 373
 374         return EAX_EDX_VAL(val, low, high);
 375 }
 376
 377 static noinstr void mce_wrmsrl(u32 msr, u64 v)
 378 {
 379         u32 low, high;
 380
 381         if (__this_cpu_read(injectm.finished)) {
 382                 int offset;
 383
 384                 instrumentation_begin();
 385
 386                 offset = msr_to_offset(msr);
 387                 if (offset >= 0)
 388                         *(u64 *)((char *)this_cpu_ptr(&injectm) + offset) = v;
 389
 390                 instrumentation_end();
 391
 392                 return;
 393         }
 394
 395         low  = (u32)v;
 396         high = (u32)(v >> 32);
 397
 398         /* See comment in mce_rdmsrl() */
 399         asm volatile("1: wrmsr\n"
 400                      "2:\n"
 401                      _ASM_EXTABLE_TYPE(1b, 2b, EX_TYPE_WRMSR_IN_MCE)
 402                      : : "c" (msr), "a"(low), "d" (high) : "memory");
 403 }
 404
 405 /*
 406  * Collect all global (w.r.t. this processor) status about this machine
 407  * check into our "mce" struct so that we can use it later to assess
 408  * the severity of the problem as we read per-bank specific details.
 409  */
 410 static noinstr void mce_gather_info(struct mce *m, struct pt_regs *regs)
 411 {
 412         /*
 413          * Enable instrumentation around mce_setup() which calls external
 414          * facilities.
 415          */
 416         instrumentation_begin();
 417         mce_setup(m);
 418         instrumentation_end();
 419
 420         m->mcgstatus = mce_rdmsrl(MSR_IA32_MCG_STATUS);
 421         if (regs) {
 422                 /*
 423                  * Get the address of the instruction at the time of
 424                  * the machine check error.
 425                  */
 426                 if (m->mcgstatus & (MCG_STATUS_RIPV|MCG_STATUS_EIPV)) {
 427                         m->ip = regs->ip;
 428                         m->cs = regs->cs;
 429
 430                         /*
 431                          * When in VM86 mode make the cs look like ring 3
 432                          * always. This is a lie, but it's better than passing
 433                          * the additional vm86 bit around everywhere.
 434                          */
 435                         if (v8086_mode(regs))
 436                                 m->cs |= 3;
 437                 }
 438                 /* Use accurate RIP reporting if available. */
 439                 if (mca_cfg.rip_msr)
 440                         m->ip = mce_rdmsrl(mca_cfg.rip_msr);
 441         }
 442 }
 443
 444 int mce_available(struct cpuinfo_x86 *c)
 445 {
 446         if (mca_cfg.disabled)
 447                 return 0;
 448         return cpu_has(c, X86_FEATURE_MCE) && cpu_has(c, X86_FEATURE_MCA);
 449 }
 450
 451 static void mce_schedule_work(void)
 452 {
 453         if (!mce_gen_pool_empty())
 454                 schedule_work(&mce_work);
 455 }
 456
 457 static void mce_irq_work_cb(struct irq_work *entry)
 458 {
 459         mce_schedule_work();
 460 }
 461
 462 /*
 463  * Check if the address reported by the CPU is in a format we can parse.
 464  * It would be possible to add code for most other cases, but all would
 465  * be somewhat complicated (e.g. segment offset would require an instruction
 466  * parser). So only support physical addresses up to page granularity for now.
 467  */
 468 int mce_usable_address(struct mce *m)
 469 {
 470         if (!(m->status & MCI_STATUS_ADDRV))
 471                 return 0;
 472
 473         /* Checks after this one are Intel/Zhaoxin-specific: */
 474         if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL &&
 475             boot_cpu_data.x86_vendor != X86_VENDOR_ZHAOXIN)
 476                 return 1;
 477
 478         if (!(m->status & MCI_STATUS_MISCV))
 479                 return 0;
 480
 481         if (MCI_MISC_ADDR_LSB(m->misc) > PAGE_SHIFT)
 482                 return 0;
 483
 484         if (MCI_MISC_ADDR_MODE(m->misc) != MCI_MISC_ADDR_PHYS)
 485                 return 0;
 486
 487         return 1;
 488 }
 489 EXPORT_SYMBOL_GPL(mce_usable_address);
 490
 491 bool mce_is_memory_error(struct mce *m)
 492 {
 493         switch (m->cpuvendor) {
 494         case X86_VENDOR_AMD:
 495         case X86_VENDOR_HYGON:
 496                 return amd_mce_is_memory_error(m);
 497
 498         case X86_VENDOR_INTEL:
 499         case X86_VENDOR_ZHAOXIN:
 500                 /*
 501                  * Intel SDM Volume 3B - 15.9.2 Compound Error Codes
 502                  *
 503                  * Bit 7 of the MCACOD field of IA32_MCi_STATUS is used for
 504                  * indicating a memory error. Bit 8 is used for indicating a
 505                  * cache hierarchy error. The combination of bit 2 and bit 3
 506                  * is used for indicating a `generic' cache hierarchy error
 507                  * But we can't just blindly check the above bits, because if
 508                  * bit 11 is set, then it is a bus/interconnect error - and
 509                  * either way the above bits just gives more detail on what
 510                  * bus/interconnect error happened. Note that bit 12 can be
 511                  * ignored, as it's the "filter" bit.
 512                  */
 513                 return (m->status & 0xef80) == BIT(7) ||
 514                        (m->status & 0xef00) == BIT(8) ||
 515                        (m->status & 0xeffc) == 0xc;
 516
 517         default:
 518                 return false;
 519         }
 520 }
 521 EXPORT_SYMBOL_GPL(mce_is_memory_error);
 522
 523 static bool whole_page(struct mce *m)
 524 {
 525         if (!mca_cfg.ser || !(m->status & MCI_STATUS_MISCV))
 526                 return true;
 527
 528         return MCI_MISC_ADDR_LSB(m->misc) >= PAGE_SHIFT;
 529 }
 530
 531 bool mce_is_correctable(struct mce *m)
 532 {
 533         if (m->cpuvendor == X86_VENDOR_AMD && m->status & MCI_STATUS_DEFERRED)
 534                 return false;
 535
 536         if (m->cpuvendor == X86_VENDOR_HYGON && m->status & MCI_STATUS_DEFERRED)
 537                 return false;
 538
 539         if (m->status & MCI_STATUS_UC)
 540                 return false;
 541
 542         return true;
 543 }
 544 EXPORT_SYMBOL_GPL(mce_is_correctable);
 545
 546 static int mce_early_notifier(struct notifier_block *nb, unsigned long val,
 547                               void *data)
 548 {
 549         struct mce *m = (struct mce *)data;
 550
 551         if (!m)
 552                 return NOTIFY_DONE;
 553
 554         /* Emit the trace record: */
 555         trace_mce_record(m);
 556
 557         set_bit(0, &mce_need_notify);
 558
 559         mce_notify_irq();
 560
 561         return NOTIFY_DONE;
 562 }
 563
 564 static struct notifier_block early_nb = {
 565         .notifier_call  = mce_early_notifier,
 566         .priority       = MCE_PRIO_EARLY,
 567 };
 568
 569 static int uc_decode_notifier(struct notifier_block *nb, unsigned long val,
 570                               void *data)
 571 {
 572         struct mce *mce = (struct mce *)data;
 573         unsigned long pfn;
 574
 575         if (!mce || !mce_usable_address(mce))
 576                 return NOTIFY_DONE;
 577
 578         if (mce->severity != MCE_AO_SEVERITY &&
 579             mce->severity != MCE_DEFERRED_SEVERITY)
 580                 return NOTIFY_DONE;
 581
 582         pfn = mce->addr >> PAGE_SHIFT;
 583         if (!memory_failure(pfn, 0)) {
 584                 set_mce_nospec(pfn);
 585                 mce->kflags |= MCE_HANDLED_UC;
 586         }
 587
 588         return NOTIFY_OK;
 589 }
 590
 591 static struct notifier_block mce_uc_nb = {
 592         .notifier_call  = uc_decode_notifier,
 593         .priority       = MCE_PRIO_UC,
 594 };
 595
 596 static int mce_default_notifier(struct notifier_block *nb, unsigned long val,
 597                                 void *data)
 598 {
 599         struct mce *m = (struct mce *)data;
 600
 601         if (!m)
 602                 return NOTIFY_DONE;
 603
 604         if (mca_cfg.print_all || !m->kflags)
 605                 __print_mce(m);
 606
 607         return NOTIFY_DONE;
 608 }
 609
 610 static struct notifier_block mce_default_nb = {
 611         .notifier_call  = mce_default_notifier,
 612         /* lowest prio, we want it to run last. */
 613         .priority       = MCE_PRIO_LOWEST,
 614 };
 615
 616 /*
 617  * Read ADDR and MISC registers.
 618  */
 619 static noinstr void mce_read_aux(struct mce *m, int i)
 620 {
 621         if (m->status & MCI_STATUS_MISCV)
 622                 m->misc = mce_rdmsrl(mca_msr_reg(i, MCA_MISC));
 623
 624         if (m->status & MCI_STATUS_ADDRV) {
 625                 m->addr = mce_rdmsrl(mca_msr_reg(i, MCA_ADDR));
 626
 627                 /*
 628                  * Mask the reported address by the reported granularity.
 629                  */
 630                 if (mca_cfg.ser && (m->status & MCI_STATUS_MISCV)) {
 631                         u8 shift = MCI_MISC_ADDR_LSB(m->misc);
 632                         m->addr >>= shift;
 633                         m->addr <<= shift;
 634                 }
 635
 636                 /*
 637                  * Extract [55:<lsb>] where lsb is the least significant
 638                  * *valid* bit of the address bits.
 639                  */
 640                 if (mce_flags.smca) {
 641                         u8 lsb = (m->addr >> 56) & 0x3f;
 642
 643                         m->addr &= GENMASK_ULL(55, lsb);
 644                 }
 645         }
 646
 647         if (mce_flags.smca) {
 648                 m->ipid = mce_rdmsrl(MSR_AMD64_SMCA_MCx_IPID(i));
 649
 650                 if (m->status & MCI_STATUS_SYNDV)
 651                         m->synd = mce_rdmsrl(MSR_AMD64_SMCA_MCx_SYND(i));
 652         }
 653 }
 654
 655 DEFINE_PER_CPU(unsigned, mce_poll_count);
 656
 657 /*
 658  * Poll for corrected events or events that happened before reset.
 659  * Those are just logged through /dev/mcelog.
 660  *
 661  * This is executed in standard interrupt context.
 662  *
 663  * Note: spec recommends to panic for fatal unsignalled
 664  * errors here. However this would be quite problematic --
 665  * we would need to reimplement the Monarch handling and
 666  * it would mess up the exclusion between exception handler
 667  * and poll handler -- * so we skip this for now.
 668  * These cases should not happen anyways, or only when the CPU
 669  * is already totally * confused. In this case it's likely it will
 670  * not fully execute the machine check handler either.
 671  */
 672 bool machine_check_poll(enum mcp_flags flags, mce_banks_t *b)
 673 {
 674         struct mce_bank *mce_banks = this_cpu_ptr(mce_banks_array);
 675         bool error_seen = false;
 676         struct mce m;
 677         int i;
 678
 679         this_cpu_inc(mce_poll_count);
 680
 681         mce_gather_info(&m, NULL);
 682
 683         if (flags & MCP_TIMESTAMP)
 684                 m.tsc = rdtsc();
 685
 686         for (i = 0; i < this_cpu_read(mce_num_banks); i++) {
 687                 if (!mce_banks[i].ctl || !test_bit(i, *b))
 688                         continue;
 689
 690                 m.misc = 0;
 691                 m.addr = 0;
 692                 m.bank = i;
 693
 694                 barrier();
 695                 m.status = mce_rdmsrl(mca_msr_reg(i, MCA_STATUS));
 696
 697                 /* If this entry is not valid, ignore it */
 698                 if (!(m.status & MCI_STATUS_VAL))
 699                         continue;
 700
 701                 /*
 702                  * If we are logging everything (at CPU online) or this
 703                  * is a corrected error, then we must log it.
 704                  */
 705                 if ((flags & MCP_UC) || !(m.status & MCI_STATUS_UC))
 706                         goto log_it;
 707
 708                 /*
 709                  * Newer Intel systems that support software error
 710                  * recovery need to make additional checks. Other
 711                  * CPUs should skip over uncorrected errors, but log
 712                  * everything else.
 713                  */
 714                 if (!mca_cfg.ser) {
 715                         if (m.status & MCI_STATUS_UC)
 716                                 continue;
 717                         goto log_it;
 718                 }
 719
 720                 /* Log "not enabled" (speculative) errors */
 721                 if (!(m.status & MCI_STATUS_EN))
 722                         goto log_it;
 723
 724                 /*
 725                  * Log UCNA (SDM: 15.6.3 "UCR Error Classification")
 726                  * UC == 1 && PCC == 0 && S == 0
 727                  */
 728                 if (!(m.status & MCI_STATUS_PCC) && !(m.status & MCI_STATUS_S))
 729                         goto log_it;
 730
 731                 /*
 732                  * Skip anything else. Presumption is that our read of this
 733                  * bank is racing with a machine check. Leave the log alone
 734                  * for do_machine_check() to deal with it.
 735                  */
 736                 continue;
 737
 738 log_it:
 739                 error_seen = true;
 740
 741                 if (flags & MCP_DONTLOG)
 742                         goto clear_it;
 743
 744                 mce_read_aux(&m, i);
 745                 m.severity = mce_severity(&m, NULL, NULL, false);
 746                 /*
 747                  * Don't get the IP here because it's unlikely to
 748                  * have anything to do with the actual error location.
 749                  */
 750
 751                 if (mca_cfg.dont_log_ce && !mce_usable_address(&m))
 752                         goto clear_it;
 753
 754                 if (flags & MCP_QUEUE_LOG)
 755                         mce_gen_pool_add(&m);
 756                 else
 757                         mce_log(&m);
 758
 759 clear_it:
 760                 /*
 761                  * Clear state for this bank.
 762                  */
 763                 mce_wrmsrl(mca_msr_reg(i, MCA_STATUS), 0);
 764         }
 765
 766         /*
 767          * Don't clear MCG_STATUS here because it's only defined for
 768          * exceptions.
 769          */
 770
 771         sync_core();
 772
 773         return error_seen;
 774 }
 775 EXPORT_SYMBOL_GPL(machine_check_poll);
 776
 777 /*
 778  * During IFU recovery Sandy Bridge -EP4S processors set the RIPV and
 779  * EIPV bits in MCG_STATUS to zero on the affected logical processor (SDM
 780  * Vol 3B Table 15-20). But this confuses both the code that determines
 781  * whether the machine check occurred in kernel or user mode, and also
 782  * the severity assessment code. Pretend that EIPV was set, and take the
 783  * ip/cs values from the pt_regs that mce_gather_info() ignored earlier.
 784  */
 785 static __always_inline void
 786 quirk_sandybridge_ifu(int bank, struct mce *m, struct pt_regs *regs)
 787 {
 788         if (bank != 0)
 789                 return;
 790         if ((m->mcgstatus & (MCG_STATUS_EIPV|MCG_STATUS_RIPV)) != 0)
 791                 return;
 792         if ((m->status & (MCI_STATUS_OVER|MCI_STATUS_UC|
 793                           MCI_STATUS_EN|MCI_STATUS_MISCV|MCI_STATUS_ADDRV|
 794                           MCI_STATUS_PCC|MCI_STATUS_S|MCI_STATUS_AR|
 795                           MCACOD)) !=
 796                          (MCI_STATUS_UC|MCI_STATUS_EN|
 797                           MCI_STATUS_MISCV|MCI_STATUS_ADDRV|MCI_STATUS_S|
 798                           MCI_STATUS_AR|MCACOD_INSTR))
 799                 return;
 800
 801         m->mcgstatus |= MCG_STATUS_EIPV;
 802         m->ip = regs->ip;
 803         m->cs = regs->cs;
 804 }
 805
 806 /*
 807  * Disable fast string copy and return from the MCE handler upon the first SRAR
 808  * MCE on bank 1 due to a CPU erratum on Intel Skylake/Cascade Lake/Cooper Lake
 809  * CPUs.
 810  * The fast string copy instructions ("REP; MOVS*") could consume an
 811  * uncorrectable memory error in the cache line _right after_ the desired region
 812  * to copy and raise an MCE with RIP pointing to the instruction _after_ the
 813  * "REP; MOVS*".
 814  * This mitigation addresses the issue completely with the caveat of performance
 815  * degradation on the CPU affected. This is still better than the OS crashing on
 816  * MCEs raised on an irrelevant process due to "REP; MOVS*" accesses from a
 817  * kernel context (e.g., copy_page).
 818  *
 819  * Returns true when fast string copy on CPU has been disabled.
 820  */
 821 static noinstr bool quirk_skylake_repmov(void)
 822 {
 823         u64 mcgstatus   = mce_rdmsrl(MSR_IA32_MCG_STATUS);
 824         u64 misc_enable = mce_rdmsrl(MSR_IA32_MISC_ENABLE);
 825         u64 mc1_status;
 826
 827         /*
 828          * Apply the quirk only to local machine checks, i.e., no broadcast
 829          * sync is needed.
 830          */
 831         if (!(mcgstatus & MCG_STATUS_LMCES) ||
 832             !(misc_enable & MSR_IA32_MISC_ENABLE_FAST_STRING))
 833                 return false;
 834
 835         mc1_status = mce_rdmsrl(MSR_IA32_MCx_STATUS(1));
 836
 837         /* Check for a software-recoverable data fetch error. */
 838         if ((mc1_status &
 839              (MCI_STATUS_VAL | MCI_STATUS_OVER | MCI_STATUS_UC | MCI_STATUS_EN |
 840               MCI_STATUS_ADDRV | MCI_STATUS_MISCV | MCI_STATUS_PCC |
 841               MCI_STATUS_AR | MCI_STATUS_S)) ==
 842              (MCI_STATUS_VAL |                   MCI_STATUS_UC | MCI_STATUS_EN |
 843               MCI_STATUS_ADDRV | MCI_STATUS_MISCV |
 844               MCI_STATUS_AR | MCI_STATUS_S)) {
 845                 misc_enable &= ~MSR_IA32_MISC_ENABLE_FAST_STRING;
 846                 mce_wrmsrl(MSR_IA32_MISC_ENABLE, misc_enable);
 847                 mce_wrmsrl(MSR_IA32_MCx_STATUS(1), 0);
 848
 849                 instrumentation_begin();
 850                 pr_err_once("Erratum detected, disable fast string copy instructions.\n");
 851                 instrumentation_end();
 852
 853                 return true;
 854         }
 855
 856         return false;
 857 }
 858
 859 /*
 860  * Quick pre-scan: flag every bank holding a valid event in @validp and stop
 861  * with 1 (and @msg set) once one grades at panic severity; else return 0.
 862  */
 863 static __always_inline int mce_no_way_out(struct mce *m, char **msg, unsigned long *validp,
 864                                           struct pt_regs *regs)
 865 {
 866         char *reason = *msg;
 867         int bank;
 868
 869         for (bank = 0; bank < this_cpu_read(mce_num_banks); bank++) {
 870                 m->status = mce_rdmsrl(mca_msr_reg(bank, MCA_STATUS));
 871                 if (m->status & MCI_STATUS_VAL) {
 872                         arch___set_bit(bank, validp);
 873                         if (mce_flags.snb_ifu_quirk)
 874                                 quirk_sandybridge_ifu(bank, m, regs);
 875
 876                         m->bank = bank;
 877                         if (mce_severity(m, regs, &reason, true) < MCE_PANIC_SEVERITY)
 878                                 continue;
 879
 880                         mce_read_aux(m, bank);
 881                         *msg = reason;
 882                         return 1;
 883                 }
 884         }
 885         return 0;
 886 }
 887
 888 /*
 889  * Variable to establish order between CPUs while scanning.
 890  * Each CPU initially spins until mce_executing reaches its own order number.
 891  */
 892 static atomic_t mce_executing;
 893
 894 /*
 895  * Defines the order of CPUs on entry. The first CPU becomes Monarch.
 896  */
 897 static atomic_t mce_callin;
 898
 899 /*
 900  * Track which CPUs entered the MCA broadcast synchronization and which did
 901  * not, in order to print the holdouts.
 902  */
 903 static cpumask_t mce_missing_cpus = CPU_MASK_ALL;
 904
 905 /*
 906  * Charge one SPINUNIT against the spin budget *t and check whether the
 907  * wait for the other CPUs has run out. Returns 1 once the budget is used
 908  * up (after panicking with @msg), 0 otherwise. Nothing is charged when
 909  * mca_cfg.monarch_timeout is 0.
 910  */
 911 static noinstr int mce_timed_out(u64 *t, const char *msg)
 912 {
 913         int expired = 0;
 914
 915         /* Enable instrumentation around calls to external facilities */
 916         instrumentation_begin();
 917
 918         /*
 919          * If the others already did panic for some reason, bail out like
 920          * in a timeout. rmb() because mce_panicked might have been
 921          * modified by someone else.
 922          */
 923         rmb();
 924         if (atomic_read(&mce_panicked))
 925                 wait_for_panic();
 926
 927         if (mca_cfg.monarch_timeout) {
 928                 if ((s64)*t >= SPINUNIT) {
 929                         *t -= SPINUNIT;
 930                 } else {
 931                         if (cpumask_and(&mce_missing_cpus, cpu_online_mask, &mce_missing_cpus))
 932                                 pr_emerg("CPUs not responding to MCE broadcast (may include false positives): %*pbl\n",
 933                                          cpumask_pr_args(&mce_missing_cpus));
 934                         mce_panic(msg, NULL, NULL);
 935                         expired = 1;
 936                 }
 937         }
 938
 939         touch_nmi_watchdog();
 940         instrumentation_end();
 941
 942         return expired;
 943 }
 944
 945 /*
 946  * The Monarch's reign.  The Monarch is the CPU which entered the machine
 947  * check handler first. It waits for the others to raise the exception
 948  * too and then grades them. When any error is fatal, it panics. Only
 949  * then are the others allowed to continue.
 950  *
 951  * The other CPUs entering the MCE handler are controlled by the
 952  * Monarch. They are called Subjects.
 953  *
 954  * This prevents potential data corruption in an unrecoverable case
 955  * and also makes sure that the errors of all CPUs are always examined.
 956  *
 957  * It also detects the case of a machine check event coming from outer
 958  * space (not detected by any CPU). In this case some external agent
 959  * wants us to shut down, so panic too.
 960  *
 961  * The other CPUs might still decide to panic if the handler happens
 962  * in an unrecoverable place, but in this case the system is in a
 963  * semi-stable state and won't corrupt anything by itself. It's ok to let
 964  * the others continue for a bit first.
 965  *
 966  * All the spin loops have timeouts; when a timeout happens a CPU
 967  * typically elects itself to be Monarch.
 968  */
 969 static void mce_reign(void)
 970 {
 971         struct mce *worst = NULL;
 972         char *panic_msg = NULL;
 973         int top_severity = 0;
 974         int cpu;
 975
 976         /*
 977          * This CPU is the Monarch and the other CPUs have run
 978          * through their handlers.
 979          * Pick the most severe record among the mces_seen of all CPUs.
 980          */
 981         for_each_possible_cpu(cpu) {
 982                 struct mce *seen = &per_cpu(mces_seen, cpu);
 983
 984                 if (seen->severity <= top_severity)
 985                         continue;
 986                 top_severity = seen->severity;
 987                 worst = seen;
 988         }
 989
 990         /*
 991          * Cannot recover? Panic here then.
 992          * This dumps all the mces in the log buffer and stops the
 993          * other CPUs.
 994          */
 995         if (worst && top_severity >= MCE_PANIC_SEVERITY) {
 996                 /* call mce_severity() to get "msg" for panic */
 997                 mce_severity(worst, NULL, &panic_msg, true);
 998                 mce_panic("Fatal machine check", worst, panic_msg);
 999         }
1000
1001         /*
1002          * For UC somewhere we let the CPU who detects it handle it.
1003          * The others must also be allowed to continue, otherwise the
1004          * handling CPU could deadlock on a lock.
1005          */
1006
1007         /*
1008          * No machine check event found. Must be some external
1009          * source or one CPU is hung. Panic.
1010          */
1011         if (top_severity <= MCE_KEEP_SEVERITY)
1012                 mce_panic("Fatal machine check from unknown source", NULL, NULL);
1013
1014         /*
1015          * Now clear all the mces_seen so that they don't reappear on
1016          * the next mce.
1017          */
1018         for_each_possible_cpu(cpu)
1019                 memset(&per_cpu(mces_seen, cpu), 0, sizeof(struct mce));
1020 }
1021
1022 static atomic_t global_nwo; /* sum of the no_way_out values the CPUs added in mce_start() */
1023
1024 /*
1025  * Start of Monarch synchronization. Waits until all online CPUs have
1026  * entered the exception handler, then releases them one at a time in
1027  * entry order. On success returns this CPU's entry order (1 == Monarch)
1028  * with *no_way_out set to the global sum; returns -1 on timeout or when
1029  * monarch_timeout is 0. TBD double check parallel CPU hotunplug
1030  */
1031 static noinstr int mce_start(int *no_way_out)
1032 {
1033         u64 timeout = (u64)mca_cfg.monarch_timeout * NSEC_PER_USEC; /* spin budget in ns */
1034         int order, ret = -1;
1035
1036         if (!timeout)
1037                 return ret; /* no rendezvous without a timeout */
1038
1039         arch_atomic_add(*no_way_out, &global_nwo);
1040         /*
1041          * Rely on the implied barrier below, such that global_nwo
1042          * is updated before mce_callin.
1043          */
1044         order = arch_atomic_inc_return(&mce_callin);
1045         arch_cpumask_clear_cpu(smp_processor_id(), &mce_missing_cpus); /* this CPU has called in */
1046
1047         /* Enable instrumentation around calls to external facilities */
1048         instrumentation_begin();
1049
1050         /*
1051          * Wait until every online CPU has called in.
1052          */
1053         while (arch_atomic_read(&mce_callin) != num_online_cpus()) {
1054                 if (mce_timed_out(&timeout,
1055                                   "Timeout: Not all CPUs entered broadcast exception handler")) {
1056                         arch_atomic_set(&global_nwo, 0);
1057                         goto out;
1058                 }
1059                 ndelay(SPINUNIT);
1060         }
1061
1062         /*
1063          * mce_callin should be read before global_nwo
1064          */
1065         smp_rmb();
1066
1067         if (order == 1) {
1068                 /*
1069                  * Monarch: Starts executing now, the others wait.
1070                  */
1071                 arch_atomic_set(&mce_executing, 1);
1072         } else {
1073                 /*
1074                  * Subject: Now start the scanning loop one by one in
1075                  * the original callin order.
1076                  * This way, when there are any shared banks, an event is
1077                  * seen by only one CPU before it is cleared, avoiding duplicates.
1078                  */
1079                 while (arch_atomic_read(&mce_executing) < order) {
1080                         if (mce_timed_out(&timeout,
1081                                           "Timeout: Subject CPUs unable to finish machine check processing")) {
1082                                 arch_atomic_set(&global_nwo, 0);
1083                                 goto out;
1084                         }
1085                         ndelay(SPINUNIT);
1086                 }
1087         }
1088
1089         /*
1090          * Cache the global no_way_out state.
1091          */
1092         *no_way_out = arch_atomic_read(&global_nwo);
1093
1094         ret = order;
1095
1096 out:
1097         instrumentation_end();
1098
1099         return ret;
1100 }
1101
1102 /*
1103  * Synchronize between CPUs after main scanning loop. The Monarch (@order 1)
1104  * runs mce_reign() here. Returns 0 once the rendezvous completed, else -1.
1105  */
1106 static noinstr int mce_end(int order)
1107 {
1108         u64 timeout = (u64)mca_cfg.monarch_timeout * NSEC_PER_USEC; /* spin budget in ns */
1109         int ret = -1;
1110
1111         /* Allow instrumentation around external facilities. */
1112         instrumentation_begin();
1113
1114         if (!timeout)
1115                 goto reset;
1116         if (order < 0) /* mce_start() returns -1 when it did not complete */
1117                 goto reset;
1118
1119         /*
1120          * Allow others to run: pass the scanning turn to the next CPU in order.
1121          */
1122         atomic_inc(&mce_executing);
1123
1124         if (order == 1) {
1125                 /*
1126                  * Monarch: Wait for everyone to go through their scanning
1127                  * loops.
1128                  */
1129                 while (atomic_read(&mce_executing) <= num_online_cpus()) {
1130                         if (mce_timed_out(&timeout,
1131                                           "Timeout: Monarch CPU unable to finish machine check processing"))
1132                                 goto reset;
1133                         ndelay(SPINUNIT);
1134                 }
1135
1136                 mce_reign();
1137                 barrier();
1138                 ret = 0;
1139         } else {
1140                 /*
1141                  * Subject: Wait for Monarch to finish.
1142                  */
1143                 while (atomic_read(&mce_executing) != 0) {
1144                         if (mce_timed_out(&timeout,
1145                                           "Timeout: Monarch CPU did not finish machine check processing"))
1146                                 goto reset;
1147                         ndelay(SPINUNIT);
1148                 }
1149
1150                 /*
1151                  * Don't reset anything. That's done by the Monarch.
1152                  */
1153                 ret = 0;
1154                 goto out;
1155         }
1156
1157         /*
1158          * Reset all global state (also reached on the failure paths, ret still -1).
1159          */
1160 reset:
1161         atomic_set(&global_nwo, 0);
1162         atomic_set(&mce_callin, 0);
1163         cpumask_setall(&mce_missing_cpus);
1164         barrier();
1165
1166         /*
1167          * Let others run again.
1168          */
1169         atomic_set(&mce_executing, 0);
1170
1171 out:
1172         instrumentation_end();
1173
1174         return ret;
1175 }
1176
1177 /* Zero the status MSR of every bank whose bit is set in @toclear. */
1178 static __always_inline void mce_clear_state(unsigned long *toclear)
1179 {
1180         int bank;
1181
1182         for (bank = 0; bank < this_cpu_read(mce_num_banks); bank++)
1183                 if (arch_test_bit(bank, toclear))
1184                         mce_wrmsrl(mca_msr_reg(bank, MCA_STATUS), 0);
1185 }
1186
1187 /*
1188  * Cases where we avoid the rendezvous handler timeout:
1189  * 1) this CPU is offline, or
1190  * 2) crashing_cpu is set to another CPU, e.g. we're entering kdump and
1191  *    this CPU remains looping in the 1st kernel - see crash_nmi_callback().
1192  * Returns true, after clearing MCG_STATUS, when a case applies and RIPV is set.
1193  *
1194  * Note: there still is a small window between kexec-ing and the new,
1195  * kdump kernel establishing a new #MC handler where a broadcasted MCE
1196  * might not get handled properly.
1197  */
1198 static noinstr bool mce_check_crashing_cpu(void)
1199 {
1200         unsigned int cpu = smp_processor_id();
1201         u64 mcg;
1202
1203         if (!arch_cpu_is_offline(cpu) &&
1204             (crashing_cpu == -1 || crashing_cpu == cpu))
1205                 return false;
1206
1207         mcg = __rdmsr(MSR_IA32_MCG_STATUS);
1208
1209         /* On Zhaoxin, a local machine check is not skipped. */
1210         if (boot_cpu_data.x86_vendor == X86_VENDOR_ZHAOXIN &&
1211             (mcg & MCG_STATUS_LMCES))
1212                 return false;
1213
1214         /* Only skip when MCG_STATUS flags a restart IP (RIPV). */
1215         if (!(mcg & MCG_STATUS_RIPV))
1216                 return false;
1217
1218         __wrmsr(MSR_IA32_MCG_STATUS, 0, 0);
1219         return true;
1220 }
1221
1222 static __always_inline int
1223 __mc_scan_banks(struct mce *m, struct pt_regs *regs, struct mce *final,
1224                 unsigned long *toclear, unsigned long *valid_banks, int no_way_out,
1225                 int *worst)
1226 {
1227         struct mce_bank *mce_banks = this_cpu_ptr(mce_banks_array);
1228         struct mca_config *cfg = &mca_cfg;
1229         int severity, i, taint = 0;
1230         /* Grade each bank flagged in @valid_banks; log it and keep the worst one in *final. */
1231         for (i = 0; i < this_cpu_read(mce_num_banks); i++) {
1232                 arch___clear_bit(i, toclear);
1233                 if (!arch_test_bit(i, valid_banks))
1234                         continue;
1235
1236                 if (!mce_banks[i].ctl)
1237                         continue; /* bank has no control bits set */
1238
1239                 m->misc = 0;
1240                 m->addr = 0;
1241                 m->bank = i;
1242
1243                 m->status = mce_rdmsrl(mca_msr_reg(i, MCA_STATUS));
1244                 if (!(m->status & MCI_STATUS_VAL))
1245                         continue;
1246
1247                 /*
1248                  * Corrected or non-signaled errors are handled by
1249                  * machine_check_poll(). Leave them alone, unless this panics.
1250                  */
1251                 if (!(m->status & (cfg->ser ? MCI_STATUS_S : MCI_STATUS_UC)) &&
1252                         !no_way_out)
1253                         continue;
1254
1255                 /* Set taint even when machine check was not enabled. */
1256                 taint++;
1257
1258                 severity = mce_severity(m, regs, NULL, true);
1259
1260                 /*
1261                  * When the machine check is one for the corrected/deferred
1262                  * handler, don't touch it, unless we're panicking.
1263                  */
1264                 if ((severity == MCE_KEEP_SEVERITY ||
1265                      severity == MCE_UCNA_SEVERITY) && !no_way_out)
1266                         continue;
1267
1268                 arch___set_bit(i, toclear);
1269
1270                 /* Machine check event was not enabled. Clear, but ignore. */
1271                 if (severity == MCE_NO_SEVERITY)
1272                         continue;
1273
1274                 mce_read_aux(m, i);
1275
1276                 /* assuming valid severity level != 0 */
1277                 m->severity = severity;
1278
1279                 /*
1280                  * Enable instrumentation around the mce_log() call which is
1281                  * done in #MC context, where instrumentation is disabled.
1282                  */
1283                 instrumentation_begin();
1284                 mce_log(m);
1285                 instrumentation_end();
1286
1287                 if (severity > *worst) {
1288                         *final = *m;
1289                         *worst = severity;
1290                 }
1291         }
1292
1293         /* mce_clear_state will clear *final, save locally for use later */
1294         *m = *final;
1295
1296         return taint; /* number of banks that got as far as being graded */
1297 }
1298
1299 /* Task work callback: reset the task's MCE count and force SIGBUS. */
1300 static void kill_me_now(struct callback_head *ch)
1301 {
1302         /* @ch is the mce_kill_me member embedded in the task_struct. */
1303         container_of(ch, struct task_struct, mce_kill_me)->mce_count = 0;
1304         force_sig(SIGBUS);
1305 }
1306
1307 /* Task work: try to recover the poisoned user page; kill the task if that fails. */
1308 static void kill_me_maybe(struct callback_head *cb)
1309 {
1310         struct task_struct *p = container_of(cb, struct task_struct, mce_kill_me);
1311         int flags = MF_ACTION_REQUIRED;
1312         int ret;
1313
1314         p->mce_count = 0;
1315         pr_err("Uncorrected hardware memory error in user-access at %llx\n", p->mce_addr);
1316
1317         /* mce_ripv was taken from MCG_STATUS_RIPV in queue_task_work(). */
1318         if (!p->mce_ripv)
1319                 flags |= MF_MUST_KILL;
1320
1321         ret = memory_failure(p->mce_addr >> PAGE_SHIFT, flags);
1322         if (!ret) {
1323                 set_mce_nospec(p->mce_addr >> PAGE_SHIFT);
1324                 sync_core();
1325                 return;
1326         }
1327
1328         /*
1329          * -EHWPOISON: memory_failure() already sent SIGBUS to the current
1330          * process with the proper error info; -EOPNOTSUPP: hwpoison_filter()
1331          * filtered the error event. Either way nothing is left to do.
1332          */
1333         if (ret == -EHWPOISON || ret == -EOPNOTSUPP)
1334                 return;
1335
1336         pr_err("Memory error not recovered\n");
1337         kill_me_now(cb);
1338 }
1339
1340 static void kill_me_never(struct callback_head *cb)
1341 {
1342         struct task_struct *task = container_of(cb, struct task_struct, mce_kill_me);
1343         /* Report the poison hit and hand the page to memory_failure(); no signal here. */
1344         task->mce_count = 0;
1345         pr_err("Kernel accessed poison in user space at %llx\n", task->mce_addr);
1346         if (memory_failure(task->mce_addr >> PAGE_SHIFT, 0) == 0)
1347                 set_mce_nospec(task->mce_addr >> PAGE_SHIFT);
1348 }
1349
1350 /* Queue @func as task work for current; repeat calls only sanity-check the event. */
1351 static void queue_task_work(struct mce *m, char *msg, void (*func)(struct callback_head *))
1352 {
1353         struct task_struct *tsk = current;
1354         int nr = ++tsk->mce_count;
1355
1356         if (nr == 1) { /* First call, save all the details */
1357                 tsk->mce_addr = m->addr;
1358                 tsk->mce_kflags = m->kflags;
1359                 tsk->mce_ripv = !!(m->mcgstatus & MCG_STATUS_RIPV);
1360                 tsk->mce_whole_page = whole_page(m);
1361                 tsk->mce_kill_me.func = func;
1362         }
1363
1364         /* Ten is likely overkill. Don't expect more than two faults before task_work() */
1365         if (nr > 10)
1366                 mce_panic("Too many consecutive machine checks while accessing user data", m, msg);
1367
1368         if (nr > 1) {
1369                 /* Must hit the same page as the first call; never add the work twice */
1370                 if ((tsk->mce_addr >> PAGE_SHIFT) != (m->addr >> PAGE_SHIFT))
1371                         mce_panic("Consecutive machine checks to different user pages", m, msg);
1372                 return;
1373         }
1374
1375         task_work_add(tsk, &tsk->mce_kill_me, TWA_RESUME);
1376 }
1377
1378 /* Handle unconfigured int18 (should never happen): log the event, nothing more */
1379 static noinstr void unexpected_machine_check(struct pt_regs *regs)
1380 {
1381         /* Allow instrumentation around the call to the external printk facility */
1382         instrumentation_begin();
1383         pr_err("CPU#%d: Unexpected int18 (Machine Check)\n", smp_processor_id());
1384         instrumentation_end();
1385 }
1386
1387 /*
1388  * The actual machine check handler. This only handles real exceptions when
1389  * something got corrupted coming in through int 18.
1390  *
1391  * This is executed in #MC context not subject to normal locking rules.
1392  * This implies that most kernel services cannot be safely used. Don't even
1393  * think about putting a printk in there!
1394  *
1395  * On Intel systems this is entered on all CPUs in parallel through
1396  * MCE broadcast. However some CPUs might be broken beyond repair,
1397  * so be always careful when synchronizing with others.
1398  *
1399  * Tracing and kprobes are disabled: if we interrupted a kernel context
1400  * with IF=1, we need to minimize stack usage.  There are also recursion
1401  * issues: if the machine check was due to a failure of the memory
1402  * backing the user stack, tracing that reads the user stack will cause
1403  * potentially infinite recursion.
1404  *
1405  * Currently, the #MC handler calls out to a number of external facilities
1406  * and, therefore, allows instrumentation around them. The optimal thing to
1407  * have would be to do the absolutely minimal work required in #MC context
1408  * and have instrumentation disabled only around that. Further processing can
1409  * then happen in process context where instrumentation is allowed. Achieving
1410  * that requires careful auditing and modifications. Until then, the code
1411  * allows instrumentation temporarily, where required.
1412  */
1413 noinstr void do_machine_check(struct pt_regs *regs)
1414 {
1415         int worst = 0, order, no_way_out, kill_current_task, lmce, taint = 0;
1416         DECLARE_BITMAP(valid_banks, MAX_NR_BANKS) = { 0 };
1417         DECLARE_BITMAP(toclear, MAX_NR_BANKS) = { 0 };
1418         struct mce m, *final; /* final: this CPU's mces_seen slot, set below */
1419         char *msg = NULL;
1420
1421         if (unlikely(mce_flags.p5))
1422                 return pentium_machine_check(regs);
1423         else if (unlikely(mce_flags.winchip))
1424                 return winchip_machine_check(regs);
1425         else if (unlikely(!mca_cfg.initialized))
1426                 return unexpected_machine_check(regs);
1427
1428         if (mce_flags.skx_repmov_quirk && quirk_skylake_repmov())
1429                 goto clear; /* the quirk dealt with it; only MCG_STATUS is left to clear */
1430
1431         /*
1432          * Establish sequential order between the CPUs entering the machine
1433          * check handler.
1434          */
1435         order = -1;
1436
1437         /*
1438          * If no_way_out gets set, there is no safe way to recover from this
1439          * MCE.
1440          */
1441         no_way_out = 0;
1442
1443         /*
1444          * If kill_current_task is not set, there might be a way to recover from this
1445          * error.
1446          */
1447         kill_current_task = 0;
1448
1449         /*
1450          * MCEs are always local on AMD. On Intel, the same is determined by
1451          * MCG_STATUS_LMCES.
1452          */
1453         lmce = 1;
1454
1455         this_cpu_inc(mce_exception_count);
1456
1457         mce_gather_info(&m, regs);
1458         m.tsc = rdtsc();
1459
1460         final = this_cpu_ptr(&mces_seen);
1461         *final = m;
1462
1463         no_way_out = mce_no_way_out(&m, &msg, valid_banks, regs);
1464
1465         barrier();
1466
1467         /*
1468          * Without a restart IP we might need to kill or panic.
1469          * Assume the worst for now, but if we find the
1470          * severity is MCE_AR_SEVERITY we have other options.
1471          */
1472         if (!(m.mcgstatus & MCG_STATUS_RIPV))
1473                 kill_current_task = 1;
1474         /*
1475          * Check if this MCE is signaled to only this logical processor,
1476          * on Intel, Zhaoxin only.
1477          */
1478         if (m.cpuvendor == X86_VENDOR_INTEL ||
1479             m.cpuvendor == X86_VENDOR_ZHAOXIN)
1480                 lmce = m.mcgstatus & MCG_STATUS_LMCES;
1481
1482         /*
1483          * Local machine check may already know that we have to panic.
1484          * Broadcast machine check begins rendezvous in mce_start()
1485          * Go through all banks in exclusion of the other CPUs. This way we
1486          * don't report duplicated events on shared banks because the first one
1487          * to see it will clear it.
1488          */
1489         if (lmce) {
1490                 if (no_way_out)
1491                         mce_panic("Fatal local machine check", &m, msg);
1492         } else {
1493                 order = mce_start(&no_way_out);
1494         }
1495
1496         taint = __mc_scan_banks(&m, regs, final, toclear, valid_banks, no_way_out, &worst);
1497
1498         if (!no_way_out)
1499                 mce_clear_state(toclear);
1500
1501         /*
1502          * Do most of the synchronization with other CPUs.
1503          * When there's any problem use only local no_way_out state.
1504          */
1505         if (!lmce) {
1506                 if (mce_end(order) < 0) {
1507                         if (!no_way_out)
1508                                 no_way_out = worst >= MCE_PANIC_SEVERITY;
1509
1510                         if (no_way_out)
1511                                 mce_panic("Fatal machine check on current CPU", &m, msg);
1512                 }
1513         } else {
1514                 /*
1515                  * If there was a fatal machine check we should have
1516                  * already called mce_panic earlier in this function.
1517                  * Since we re-read the banks, we might have found
1518                  * something new. Check again to see if we found a
1519                  * fatal error. We call "mce_severity()" again to
1520                  * make sure we have the right "msg".
1521                  */
1522                 if (worst >= MCE_PANIC_SEVERITY) {
1523                         mce_severity(&m, regs, &msg, true);
1524                         mce_panic("Local fatal machine check!", &m, msg);
1525                 }
1526         }
1527
1528         /*
1529          * Enable instrumentation around the external facilities like task_work_add()
1530          * (via queue_task_work()), fixup_exception() etc. For now, that is. Fixing this
1531          * properly would need a lot more involved reorganization.
1532          */
1533         instrumentation_begin();
1534
1535         if (taint)
1536                 add_taint(TAINT_MACHINE_CHECK, LOCKDEP_NOW_UNRELIABLE);
1537
1538         if (worst != MCE_AR_SEVERITY && !kill_current_task)
1539                 goto out;
1540
1541         /* Fault was in user mode and we need to take some action */
1542         if ((m.cs & 3) == 3) {
1543                 /* If this triggers there is no way to recover. Die hard. */
1544                 BUG_ON(!on_thread_stack() || !user_mode(regs));
1545
1546                 if (kill_current_task)
1547                         queue_task_work(&m, msg, kill_me_now);
1548                 else
1549                         queue_task_work(&m, msg, kill_me_maybe);
1550
1551         } else {
1552                 /*
1553                  * Handle an MCE which has happened in kernel space but from
1554                  * which the kernel can recover: ex_has_fault_handler() has
1555                  * already verified that the rIP at which the error happened is
1556                  * a rIP from which the kernel can recover (by jumping to
1557                  * recovery code specified in _ASM_EXTABLE_FAULT()) and the
1558                  * corresponding exception handler which would do that is the
1559                  * proper one.
1560                  */
1561                 if (m.kflags & MCE_IN_KERNEL_RECOV) {
1562                         if (!fixup_exception(regs, X86_TRAP_MC, 0, 0))
1563                                 mce_panic("Failed kernel mode recovery", &m, msg);
1564                 }
1565
1566                 if (m.kflags & MCE_IN_KERNEL_COPYIN)
1567                         queue_task_work(&m, msg, kill_me_never);
1568         }
1569
1570 out:
1571         instrumentation_end();
1572
1573 clear:
1574         mce_wrmsrl(MSR_IA32_MCG_STATUS, 0);
1575 }
1576 EXPORT_SYMBOL_GPL(do_machine_check);
1577
1578 #ifndef CONFIG_MEMORY_FAILURE
1579 /* Stub for kernels without CONFIG_MEMORY_FAILURE: complain about the page, return 0. */
1580 int memory_failure(unsigned long pfn, int flags)
1581 {
1582         /* mce_severity() should not hand us an ACTION_REQUIRED error */
1583         BUG_ON(flags & MF_ACTION_REQUIRED);
1584
1585         pr_err("Uncorrected memory error in page 0x%lx ignored\n"
1586                "Rebuild kernel with CONFIG_MEMORY_FAILURE=y for smarter handling\n", pfn);
1587         return 0;
1588 }
1589 #endif
1590
1591 /*
1592  * Periodic polling timer for "silent" machine check errors.  If the
1593  * poller finds an MCE, poll 2x faster (down to HZ/100 jiffies).  When the
1594  * poller finds no more errors, poll 2x slower (up to check_interval seconds).
1595  */
1596 static unsigned long check_interval = INITIAL_CHECK_INTERVAL; /* in seconds */
1597
1598 static DEFINE_PER_CPU(unsigned long, mce_next_interval); /* in jiffies */
1599 static DEFINE_PER_CPU(struct timer_list, mce_timer); /* the polling timer, one per CPU */
1600
1601 static unsigned long mce_adjust_timer_default(unsigned long interval)
1602 {
1603         return interval; /* default hook: hand the interval back unchanged */
1604 }
1605 /* Interval hook; mce_timer_fn() applies it when mce_intel_cmci_poll() returns true. */
1606 static unsigned long (*mce_adjust_timer)(unsigned long interval) = mce_adjust_timer_default;
1607
1608 /* Arm @t for @interval jiffies from now, unless it is pending with an earlier expiry. */
1609 static void __start_timer(struct timer_list *t, unsigned long interval)
1610 {
1611         unsigned long deadline = jiffies + interval;
1612         unsigned long irqflags;
1613
1614         local_irq_save(irqflags);
1615         /* A pending timer is only touched when the new deadline comes first. */
1616         if (!timer_pending(t) || time_before(deadline, t->expires))
1617                 mod_timer(t, round_jiffies(deadline));
1618         local_irq_restore(irqflags);
1619 }
1620
1621 /* Poll timer callback: poll the banks, retune the polling interval and re-arm. */
1622 static void mce_timer_fn(struct timer_list *t)
1623 {
1624         struct timer_list *cpu_t = this_cpu_ptr(&mce_timer);
1625         bool cmci_poll = false;
1626         unsigned long next;
1627
1628         WARN_ON(cpu_t != t);
1629
1630         next = __this_cpu_read(mce_next_interval);
1631
1632         if (mce_available(this_cpu_ptr(&cpu_info))) {
1633                 machine_check_poll(0, this_cpu_ptr(&mce_poll_banks));
1634                 cmci_poll = mce_intel_cmci_poll();
1635         }
1636
1637         /*
1638          * Alert userspace if needed. If we logged an MCE, reduce the polling
1639          * interval, otherwise increase the polling interval. Both are skipped
1640          * when mce_intel_cmci_poll() returned true; mce_adjust_timer() decides then.
1641          */
1642         if (cmci_poll)
1643                 next = mce_adjust_timer(next);
1644         else if (mce_notify_irq())
1645                 next = max(next / 2, (unsigned long) HZ/100);
1646         else
1647                 next = min(next * 2, round_jiffies_relative(check_interval * HZ));
1648
1649         __this_cpu_write(mce_next_interval, next);
1650         __start_timer(t, next);
1651 }
1652
1653 /*
1654  * Ensure that this CPU's timer fires in @interval from now, and lower the
1655  * stored polling interval to @interval if that is shorter.
1656  */
1657 void mce_timer_kick(unsigned long interval)
1658 {
1659         struct timer_list *timer = this_cpu_ptr(&mce_timer);
1660         unsigned long stored = __this_cpu_read(mce_next_interval);
1661
1662         __start_timer(timer, interval);
1663         if (stored > interval)
1664                 __this_cpu_write(mce_next_interval, interval);
1665 }
1666
1667 /* Delete every online CPU's timer. Not for IRQ context: del_timer_sync() can deadlock */
1668 static void mce_timer_delete_all(void)
1669 {
1670         int i;
1671
1672         for_each_online_cpu(i)
1673                 del_timer_sync(per_cpu_ptr(&mce_timer, i));
1674 }
1675
1676 /*
1677  * Tell the user(s) that new machine check events have been logged.
1678  * Can be called from interrupt context, but not from machine check/NMI context.
1679  * Returns 1 if a notification was pending (it is consumed here), else 0.
1680  */
1681 int mce_notify_irq(void)
1682 {
1683         /* Not more than two messages every minute */
1684         static DEFINE_RATELIMIT_STATE(ratelimit, 60*HZ, 2);
1685
1686         if (!test_and_clear_bit(0, &mce_need_notify))
1687                 return 0;
1688
1689         mce_work_trigger();
1690
1691         if (__ratelimit(&ratelimit))
1692                 pr_info(HW_ERR "Machine check events logged\n");
1693
1694         return 1;
1695 }
1696 EXPORT_SYMBOL_GPL(mce_notify_irq);
1697
1698 /*
1699  * Give every bank of this CPU its initial software state: all control bits
1700  * set and the init flag on.
1701  *
1702  * __mcheck_cpu_apply_quirks() is going to apply the required vendor quirks
1703  * before __mcheck_cpu_init_clear_banks() does the final bank setup.
1704  */
1705 static void __mcheck_cpu_mce_banks_init(void)
1706 {
1707         struct mce_bank *banks = this_cpu_ptr(mce_banks_array);
1708         u8 count = this_cpu_read(mce_num_banks);
1709         int bank;
1710
1711         for (bank = 0; bank < count; bank++) {
1712                 banks[bank].ctl = -1ULL;
1713                 banks[bank].init = true;
1714         }
1715 }
1716
1717 /*
1718  * Initialize Machine Checks for a CPU.
1719  */
1720 static void __mcheck_cpu_cap_init(void)
1721 {
1722         u64 cap;
1723         u8 b;
1724
1725         rdmsrl(MSR_IA32_MCG_CAP, cap);
1726
1727         b = cap & MCG_BANKCNT_MASK;
1728
1729         if (b > MAX_NR_BANKS) {
1730                 pr_warn("CPU%d: Using only %u machine check banks out of %u\n",
1731                         smp_processor_id(), MAX_NR_BANKS, b);
1732                 b = MAX_NR_BANKS;
1733         }
1734
1735         this_cpu_write(mce_num_banks, b);
1736
1737         __mcheck_cpu_mce_banks_init();
1738
1739         /* Use accurate RIP reporting if available. */
1740         if ((cap & MCG_EXT_P) && MCG_EXT_CNT(cap) >= 9)
1741                 mca_cfg.rip_msr = MSR_IA32_MCG_EIP;
1742
1743         if (cap & MCG_SER_P)
1744                 mca_cfg.ser = 1;
1745 }
1746
1747 static void __mcheck_cpu_init_generic(void)
1748 {
1749         enum mcp_flags m_fl = 0;
1750         mce_banks_t all_banks;
1751         u64 cap;
1752
1753         if (!mca_cfg.bootlog)
1754                 m_fl = MCP_DONTLOG;
1755
1756         /*
1757          * Log the machine checks left over from the previous reset. Log them
1758          * only, do not start processing them. That will happen in mcheck_late_init()
1759          * when all consumers have been registered on the notifier chain.
1760          */
1761         bitmap_fill(all_banks, MAX_NR_BANKS);
1762         machine_check_poll(MCP_UC | MCP_QUEUE_LOG | m_fl, &all_banks);
1763
1764         cr4_set_bits(X86_CR4_MCE);
1765
1766         rdmsrl(MSR_IA32_MCG_CAP, cap);
1767         if (cap & MCG_CTL_P)
1768                 wrmsr(MSR_IA32_MCG_CTL, 0xffffffff, 0xffffffff);
1769 }
1770
1771 static void __mcheck_cpu_init_clear_banks(void)
1772 {
1773         struct mce_bank *mce_banks = this_cpu_ptr(mce_banks_array);
1774         int i;
1775
1776         for (i = 0; i < this_cpu_read(mce_num_banks); i++) {
1777                 struct mce_bank *b = &mce_banks[i];
1778
1779                 if (!b->init)
1780                         continue;
1781                 wrmsrl(mca_msr_reg(i, MCA_CTL), b->ctl);
1782                 wrmsrl(mca_msr_reg(i, MCA_STATUS), 0);
1783         }
1784 }
1785
1786 /*
1787  * Do a final check to see if there are any unused/RAZ banks.
1788  *
1789  * This must be done after the banks have been initialized and any quirks have
1790  * been applied.
1791  *
1792  * Do not call this from any user-initiated flows, e.g. CPU hotplug or sysfs.
1793  * Otherwise, a user who disables a bank will not be able to re-enable it
1794  * without a system reboot.
1795  */
1796 static void __mcheck_cpu_check_banks(void)
1797 {
1798         struct mce_bank *mce_banks = this_cpu_ptr(mce_banks_array);
1799         u64 msrval;
1800         int i;
1801
1802         for (i = 0; i < this_cpu_read(mce_num_banks); i++) {
1803                 struct mce_bank *b = &mce_banks[i];
1804
1805                 if (!b->init)
1806                         continue;
1807
1808                 rdmsrl(mca_msr_reg(i, MCA_CTL), msrval);
1809                 b->init = !!msrval;
1810         }
1811 }
1812
1813 /* Add per CPU specific workarounds here */
1814 static int __mcheck_cpu_apply_quirks(struct cpuinfo_x86 *c)
1815 {
1816         struct mce_bank *mce_banks = this_cpu_ptr(mce_banks_array);
1817         struct mca_config *cfg = &mca_cfg;
1818
1819         if (c->x86_vendor == X86_VENDOR_UNKNOWN) {
1820                 pr_info("unknown CPU type - not enabling MCE support\n");
1821                 return -EOPNOTSUPP;
1822         }
1823
1824         /* This should be disabled by the BIOS, but isn't always */
1825         if (c->x86_vendor == X86_VENDOR_AMD) {
1826                 if (c->x86 == 15 && this_cpu_read(mce_num_banks) > 4) {
1827                         /*
1828                          * disable GART TBL walk error reporting, which
1829                          * trips off incorrectly with the IOMMU & 3ware
1830                          * & Cerberus:
1831                          */
1832                         clear_bit(10, (unsigned long *)&mce_banks[4].ctl);
1833                 }
1834                 if (c->x86 < 0x11 && cfg->bootlog < 0) {
1835                         /*
1836                          * Lots of broken BIOS around that don't clear them
1837                          * by default and leave crap in there. Don't log:
1838                          */
1839                         cfg->bootlog = 0;
1840                 }
1841                 /*
1842                  * Various K7s with broken bank 0 around. Always disable
1843                  * by default.
1844                  */
1845                 if (c->x86 == 6 && this_cpu_read(mce_num_banks) > 0)
1846                         mce_banks[0].ctl = 0;
1847
1848                 /*
1849                  * overflow_recov is supported for F15h Models 00h-0fh
1850                  * even though we don't have a CPUID bit for it.
1851                  */
1852                 if (c->x86 == 0x15 && c->x86_model <= 0xf)
1853                         mce_flags.overflow_recov = 1;
1854
1855         }
1856
1857         if (c->x86_vendor == X86_VENDOR_INTEL) {
1858                 /*
1859                  * SDM documents that on family 6 bank 0 should not be written
1860                  * because it aliases to another special BIOS controlled
1861                  * register.
1862                  * But it's not aliased anymore on model 0x1a+
1863                  * Don't ignore bank 0 completely because there could be a
1864                  * valid event later, merely don't write CTL0.
1865                  */
1866
1867                 if (c->x86 == 6 && c->x86_model < 0x1A && this_cpu_read(mce_num_banks) > 0)
1868                         mce_banks[0].init = false;
1869
1870                 /*
1871                  * All newer Intel systems support MCE broadcasting. Enable
1872                  * synchronization with a one second timeout.
1873                  */
1874                 if ((c->x86 > 6 || (c->x86 == 6 && c->x86_model >= 0xe)) &&
1875                         cfg->monarch_timeout < 0)
1876                         cfg->monarch_timeout = USEC_PER_SEC;
1877
1878                 /*
1879                  * There are also broken BIOSes on some Pentium M and
1880                  * earlier systems:
1881                  */
1882                 if (c->x86 == 6 && c->x86_model <= 13 && cfg->bootlog < 0)
1883                         cfg->bootlog = 0;
1884
1885                 if (c->x86 == 6 && c->x86_model == 45)
1886                         mce_flags.snb_ifu_quirk = 1;
1887
1888                 /*
1889                  * Skylake, Cascacde Lake and Cooper Lake require a quirk on
1890                  * rep movs.
1891                  */
1892                 if (c->x86 == 6 && c->x86_model == INTEL_FAM6_SKYLAKE_X)
1893                         mce_flags.skx_repmov_quirk = 1;
1894         }
1895
1896         if (c->x86_vendor == X86_VENDOR_ZHAOXIN) {
1897                 /*
1898                  * All newer Zhaoxin CPUs support MCE broadcasting. Enable
1899                  * synchronization with a one second timeout.
1900                  */
1901                 if (c->x86 > 6 || (c->x86_model == 0x19 || c->x86_model == 0x1f)) {
1902                         if (cfg->monarch_timeout < 0)
1903                                 cfg->monarch_timeout = USEC_PER_SEC;
1904                 }
1905         }
1906
1907         if (cfg->monarch_timeout < 0)
1908                 cfg->monarch_timeout = 0;
1909         if (cfg->bootlog != 0)
1910                 cfg->panic_timeout = 30;
1911
1912         return 0;
1913 }
1914
1915 static int __mcheck_cpu_ancient_init(struct cpuinfo_x86 *c)
1916 {
1917         if (c->x86 != 5)
1918                 return 0;
1919
1920         switch (c->x86_vendor) {
1921         case X86_VENDOR_INTEL:
1922                 intel_p5_mcheck_init(c);
1923                 mce_flags.p5 = 1;
1924                 return 1;
1925         case X86_VENDOR_CENTAUR:
1926                 winchip_mcheck_init(c);
1927                 mce_flags.winchip = 1;
1928                 return 1;
1929         default:
1930                 return 0;
1931         }
1932
1933         return 0;
1934 }
1935
1936 /*
1937  * Init basic CPU features needed for early decoding of MCEs.
1938  */
1939 static void __mcheck_cpu_init_early(struct cpuinfo_x86 *c)
1940 {
1941         if (c->x86_vendor == X86_VENDOR_AMD || c->x86_vendor == X86_VENDOR_HYGON) {
1942                 mce_flags.overflow_recov = !!cpu_has(c, X86_FEATURE_OVERFLOW_RECOV);
1943                 mce_flags.succor         = !!cpu_has(c, X86_FEATURE_SUCCOR);
1944                 mce_flags.smca           = !!cpu_has(c, X86_FEATURE_SMCA);
1945                 mce_flags.amd_threshold  = 1;
1946         }
1947 }
1948
1949 static void mce_centaur_feature_init(struct cpuinfo_x86 *c)
1950 {
1951         struct mca_config *cfg = &mca_cfg;
1952
1953          /*
1954           * All newer Centaur CPUs support MCE broadcasting. Enable
1955           * synchronization with a one second timeout.
1956           */
1957         if ((c->x86 == 6 && c->x86_model == 0xf && c->x86_stepping >= 0xe) ||
1958              c->x86 > 6) {
1959                 if (cfg->monarch_timeout < 0)
1960                         cfg->monarch_timeout = USEC_PER_SEC;
1961         }
1962 }
1963
1964 static void mce_zhaoxin_feature_init(struct cpuinfo_x86 *c)
1965 {
1966         struct mce_bank *mce_banks = this_cpu_ptr(mce_banks_array);
1967
1968         /*
1969          * These CPUs have MCA bank 8 which reports only one error type called
1970          * SVAD (System View Address Decoder). The reporting of that error is
1971          * controlled by IA32_MC8.CTL.0.
1972          *
1973          * If enabled, prefetching on these CPUs will cause SVAD MCE when
1974          * virtual machines start and result in a system  panic. Always disable
1975          * bank 8 SVAD error by default.
1976          */
1977         if ((c->x86 == 7 && c->x86_model == 0x1b) ||
1978             (c->x86_model == 0x19 || c->x86_model == 0x1f)) {
1979                 if (this_cpu_read(mce_num_banks) > 8)
1980                         mce_banks[8].ctl = 0;
1981         }
1982
1983         intel_init_cmci();
1984         intel_init_lmce();
1985         mce_adjust_timer = cmci_intel_adjust_timer;
1986 }
1987
/* Zhaoxin counterpart of the feature init: only calls intel_clear_lmce(). */
static void mce_zhaoxin_feature_clear(struct cpuinfo_x86 *c)
{
        /* @c is unused here. */
        intel_clear_lmce();
}
1992
1993 static void __mcheck_cpu_init_vendor(struct cpuinfo_x86 *c)
1994 {
1995         switch (c->x86_vendor) {
1996         case X86_VENDOR_INTEL:
1997                 mce_intel_feature_init(c);
1998                 mce_adjust_timer = cmci_intel_adjust_timer;
1999                 break;
2000
2001         case X86_VENDOR_AMD: {
2002                 mce_amd_feature_init(c);
2003                 break;
2004                 }
2005
2006         case X86_VENDOR_HYGON:
2007                 mce_hygon_feature_init(c);
2008                 break;
2009
2010         case X86_VENDOR_CENTAUR:
2011                 mce_centaur_feature_init(c);
2012                 break;
2013
2014         case X86_VENDOR_ZHAOXIN:
2015                 mce_zhaoxin_feature_init(c);
2016                 break;
2017
2018         default:
2019                 break;
2020         }
2021 }
2022
2023 static void __mcheck_cpu_clear_vendor(struct cpuinfo_x86 *c)
2024 {
2025         switch (c->x86_vendor) {
2026         case X86_VENDOR_INTEL:
2027                 mce_intel_feature_clear(c);
2028                 break;
2029
2030         case X86_VENDOR_ZHAOXIN:
2031                 mce_zhaoxin_feature_clear(c);
2032                 break;
2033
2034         default:
2035                 break;
2036         }
2037 }
2038
2039 static void mce_start_timer(struct timer_list *t)
2040 {
2041         unsigned long iv = check_interval * HZ;
2042
2043         if (mca_cfg.ignore_ce || !iv)
2044                 return;
2045
2046         this_cpu_write(mce_next_interval, iv);
2047         __start_timer(t, iv);
2048 }
2049
2050 static void __mcheck_cpu_setup_timer(void)
2051 {
2052         struct timer_list *t = this_cpu_ptr(&mce_timer);
2053
2054         timer_setup(t, mce_timer_fn, TIMER_PINNED);
2055 }
2056
2057 static void __mcheck_cpu_init_timer(void)
2058 {
2059         struct timer_list *t = this_cpu_ptr(&mce_timer);
2060
2061         timer_setup(t, mce_timer_fn, TIMER_PINNED);
2062         mce_start_timer(t);
2063 }
2064
2065 bool filter_mce(struct mce *m)
2066 {
2067         if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD)
2068                 return amd_filter_mce(m);
2069         if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL)
2070                 return intel_filter_mce(m);
2071
2072         return false;
2073 }
2074
2075 static __always_inline void exc_machine_check_kernel(struct pt_regs *regs)
2076 {
2077         irqentry_state_t irq_state;
2078
2079         WARN_ON_ONCE(user_mode(regs));
2080
2081         /*
2082          * Only required when from kernel mode. See
2083          * mce_check_crashing_cpu() for details.
2084          */
2085         if (mca_cfg.initialized && mce_check_crashing_cpu())
2086                 return;
2087
2088         irq_state = irqentry_nmi_enter(regs);
2089
2090         do_machine_check(regs);
2091
2092         irqentry_nmi_exit(regs, irq_state);
2093 }
2094
2095 static __always_inline void exc_machine_check_user(struct pt_regs *regs)
2096 {
2097         irqentry_enter_from_user_mode(regs);
2098
2099         do_machine_check(regs);
2100
2101         irqentry_exit_to_user_mode(regs);
2102 }
2103
2104 #ifdef CONFIG_X86_64
2105 /* MCE hit kernel mode */
2106 DEFINE_IDTENTRY_MCE(exc_machine_check)
2107 {
2108         unsigned long dr7;
2109
2110         dr7 = local_db_save();
2111         exc_machine_check_kernel(regs);
2112         local_db_restore(dr7);
2113 }
2114
2115 /* The user mode variant. */
2116 DEFINE_IDTENTRY_MCE_USER(exc_machine_check)
2117 {
2118         unsigned long dr7;
2119
2120         dr7 = local_db_save();
2121         exc_machine_check_user(regs);
2122         local_db_restore(dr7);
2123 }
2124 #else
2125 /* 32bit unified entry point */
2126 DEFINE_IDTENTRY_RAW(exc_machine_check)
2127 {
2128         unsigned long dr7;
2129
2130         dr7 = local_db_save();
2131         if (user_mode(regs))
2132                 exc_machine_check_user(regs);
2133         else
2134                 exc_machine_check_kernel(regs);
2135         local_db_restore(dr7);
2136 }
2137 #endif
2138
2139 /*
2140  * Called for each booted CPU to set up machine checks.
2141  * Must be called with preempt off:
2142  */
2143 void mcheck_cpu_init(struct cpuinfo_x86 *c)
2144 {
2145         if (mca_cfg.disabled)
2146                 return;
2147
2148         if (__mcheck_cpu_ancient_init(c))
2149                 return;
2150
2151         if (!mce_available(c))
2152                 return;
2153
2154         __mcheck_cpu_cap_init();
2155
2156         if (__mcheck_cpu_apply_quirks(c) < 0) {
2157                 mca_cfg.disabled = 1;
2158                 return;
2159         }
2160
2161         if (mce_gen_pool_init()) {
2162                 mca_cfg.disabled = 1;
2163                 pr_emerg("Couldn't allocate MCE records pool!\n");
2164                 return;
2165         }
2166
2167         mca_cfg.initialized = 1;
2168
2169         __mcheck_cpu_init_early(c);
2170         __mcheck_cpu_init_generic();
2171         __mcheck_cpu_init_vendor(c);
2172         __mcheck_cpu_init_clear_banks();
2173         __mcheck_cpu_check_banks();
2174         __mcheck_cpu_setup_timer();
2175 }
2176
2177 /*
2178  * Called for each booted CPU to clear some machine checks opt-ins
2179  */
2180 void mcheck_cpu_clear(struct cpuinfo_x86 *c)
2181 {
2182         if (mca_cfg.disabled)
2183                 return;
2184
2185         if (!mce_available(c))
2186                 return;
2187
2188         /*
2189          * Possibly to clear general settings generic to x86
2190          * __mcheck_cpu_clear_generic(c);
2191          */
2192         __mcheck_cpu_clear_vendor(c);
2193
2194 }
2195
2196 static void __mce_disable_bank(void *arg)
2197 {
2198         int bank = *((int *)arg);
2199         __clear_bit(bank, this_cpu_ptr(mce_poll_banks));
2200         cmci_disable_bank(bank);
2201 }
2202
2203 void mce_disable_bank(int bank)
2204 {
2205         if (bank >= this_cpu_read(mce_num_banks)) {
2206                 pr_warn(FW_BUG
2207                         "Ignoring request to disable invalid MCA bank %d.\n",
2208                         bank);
2209                 return;
2210         }
2211         set_bit(bank, mce_banks_ce_disabled);
2212         on_each_cpu(__mce_disable_bank, &bank, 1);
2213 }
2214
2215 /*
2216  * mce=off Disables machine check
2217  * mce=no_cmci Disables CMCI
2218  * mce=no_lmce Disables LMCE
2219  * mce=dont_log_ce Clears corrected events silently, no log created for CEs.
2220  * mce=print_all Print all machine check logs to console
2221  * mce=ignore_ce Disables polling and CMCI, corrected events are not cleared.
2222  * mce=TOLERANCELEVEL[,monarchtimeout] (number, see above)
2223  *      monarchtimeout is how long to wait for other CPUs on machine
2224  *      check, or 0 to not wait
2225  * mce=bootlog Log MCEs from before booting. Disabled by default on AMD Fam10h
2226         and older.
2227  * mce=nobootlog Don't log MCEs from before booting.
2228  * mce=bios_cmci_threshold Don't program the CMCI threshold
2229  * mce=recovery force enable copy_mc_fragile()
2230  */
2231 static int __init mcheck_enable(char *str)
2232 {
2233         struct mca_config *cfg = &mca_cfg;
2234
2235         if (*str == 0) {
2236                 enable_p5_mce();
2237                 return 1;
2238         }
2239         if (*str == '=')
2240                 str++;
2241         if (!strcmp(str, "off"))
2242                 cfg->disabled = 1;
2243         else if (!strcmp(str, "no_cmci"))
2244                 cfg->cmci_disabled = true;
2245         else if (!strcmp(str, "no_lmce"))
2246                 cfg->lmce_disabled = 1;
2247         else if (!strcmp(str, "dont_log_ce"))
2248                 cfg->dont_log_ce = true;
2249         else if (!strcmp(str, "print_all"))
2250                 cfg->print_all = true;
2251         else if (!strcmp(str, "ignore_ce"))
2252                 cfg->ignore_ce = true;
2253         else if (!strcmp(str, "bootlog") || !strcmp(str, "nobootlog"))
2254                 cfg->bootlog = (str[0] == 'b');
2255         else if (!strcmp(str, "bios_cmci_threshold"))
2256                 cfg->bios_cmci_threshold = 1;
2257         else if (!strcmp(str, "recovery"))
2258                 cfg->recovery = 1;
2259         else if (isdigit(str[0]))
2260                 get_option(&str, &(cfg->monarch_timeout));
2261         else {
2262                 pr_info("mce argument %s ignored. Please use /sys\n", str);
2263                 return 0;
2264         }
2265         return 1;
2266 }
2267 __setup("mce", mcheck_enable);
2268
2269 int __init mcheck_init(void)
2270 {
2271         mce_register_decode_chain(&early_nb);
2272         mce_register_decode_chain(&mce_uc_nb);
2273         mce_register_decode_chain(&mce_default_nb);
2274
2275         INIT_WORK(&mce_work, mce_gen_pool_process);
2276         init_irq_work(&mce_irq_work, mce_irq_work_cb);
2277
2278         return 0;
2279 }
2280
2281 /*
2282  * mce_syscore: PM support
2283  */
2284
2285 /*
2286  * Disable machine checks on suspend and shutdown. We can't really handle
2287  * them later.
2288  */
2289 static void mce_disable_error_reporting(void)
2290 {
2291         struct mce_bank *mce_banks = this_cpu_ptr(mce_banks_array);
2292         int i;
2293
2294         for (i = 0; i < this_cpu_read(mce_num_banks); i++) {
2295                 struct mce_bank *b = &mce_banks[i];
2296
2297                 if (b->init)
2298                         wrmsrl(mca_msr_reg(i, MCA_CTL), 0);
2299         }
2300         return;
2301 }
2302
2303 static void vendor_disable_error_reporting(void)
2304 {
2305         /*
2306          * Don't clear on Intel or AMD or Hygon or Zhaoxin CPUs. Some of these
2307          * MSRs are socket-wide. Disabling them for just a single offlined CPU
2308          * is bad, since it will inhibit reporting for all shared resources on
2309          * the socket like the last level cache (LLC), the integrated memory
2310          * controller (iMC), etc.
2311          */
2312         if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL ||
2313             boot_cpu_data.x86_vendor == X86_VENDOR_HYGON ||
2314             boot_cpu_data.x86_vendor == X86_VENDOR_AMD ||
2315             boot_cpu_data.x86_vendor == X86_VENDOR_ZHAOXIN)
2316                 return;
2317
2318         mce_disable_error_reporting();
2319 }
2320
/* syscore ->suspend hook: disable error reporting; always returns 0. */
static int mce_syscore_suspend(void)
{
        vendor_disable_error_reporting();

        return 0;
}
2326
/* syscore ->shutdown hook: same action as on suspend. */
static void mce_syscore_shutdown(void)
{
        vendor_disable_error_reporting();
}
2331
2332 /*
2333  * On resume clear all MCE state. Don't want to see leftovers from the BIOS.
2334  * Only one CPU is active at this time, the others get re-added later using
2335  * CPU hotplug:
2336  */
2337 static void mce_syscore_resume(void)
2338 {
2339         __mcheck_cpu_init_generic();
2340         __mcheck_cpu_init_vendor(raw_cpu_ptr(&cpu_info));
2341         __mcheck_cpu_init_clear_banks();
2342 }
2343
2344 static struct syscore_ops mce_syscore_ops = {
2345         .suspend        = mce_syscore_suspend,
2346         .shutdown       = mce_syscore_shutdown,
2347         .resume         = mce_syscore_resume,
2348 };
2349
2350 /*
2351  * mce_device: Sysfs support
2352  */
2353
2354 static void mce_cpu_restart(void *data)
2355 {
2356         if (!mce_available(raw_cpu_ptr(&cpu_info)))
2357                 return;
2358         __mcheck_cpu_init_generic();
2359         __mcheck_cpu_init_clear_banks();
2360         __mcheck_cpu_init_timer();
2361 }
2362
2363 /* Reinit MCEs after user configuration changes */
2364 static void mce_restart(void)
2365 {
2366         mce_timer_delete_all();
2367         on_each_cpu(mce_cpu_restart, NULL, 1);
2368 }
2369
2370 /* Toggle features for corrected errors */
2371 static void mce_disable_cmci(void *data)
2372 {
2373         if (!mce_available(raw_cpu_ptr(&cpu_info)))
2374                 return;
2375         cmci_clear();
2376 }
2377
2378 static void mce_enable_ce(void *all)
2379 {
2380         if (!mce_available(raw_cpu_ptr(&cpu_info)))
2381                 return;
2382         cmci_reenable();
2383         cmci_recheck();
2384         if (all)
2385                 __mcheck_cpu_init_timer();
2386 }
2387
2388 static struct bus_type mce_subsys = {
2389         .name           = "machinecheck",
2390         .dev_name       = "machinecheck",
2391 };
2392
2393 DEFINE_PER_CPU(struct device *, mce_device);
2394
2395 static inline struct mce_bank_dev *attr_to_bank(struct device_attribute *attr)
2396 {
2397         return container_of(attr, struct mce_bank_dev, attr);
2398 }
2399
2400 static ssize_t show_bank(struct device *s, struct device_attribute *attr,
2401                          char *buf)
2402 {
2403         u8 bank = attr_to_bank(attr)->bank;
2404         struct mce_bank *b;
2405
2406         if (bank >= per_cpu(mce_num_banks, s->id))
2407                 return -EINVAL;
2408
2409         b = &per_cpu(mce_banks_array, s->id)[bank];
2410
2411         if (!b->init)
2412                 return -ENODEV;
2413
2414         return sprintf(buf, "%llx\n", b->ctl);
2415 }
2416
2417 static ssize_t set_bank(struct device *s, struct device_attribute *attr,
2418                         const char *buf, size_t size)
2419 {
2420         u8 bank = attr_to_bank(attr)->bank;
2421         struct mce_bank *b;
2422         u64 new;
2423
2424         if (kstrtou64(buf, 0, &new) < 0)
2425                 return -EINVAL;
2426
2427         if (bank >= per_cpu(mce_num_banks, s->id))
2428                 return -EINVAL;
2429
2430         b = &per_cpu(mce_banks_array, s->id)[bank];
2431
2432         if (!b->init)
2433                 return -ENODEV;
2434
2435         b->ctl = new;
2436         mce_restart();
2437
2438         return size;
2439 }
2440
2441 static ssize_t set_ignore_ce(struct device *s,
2442                              struct device_attribute *attr,
2443                              const char *buf, size_t size)
2444 {
2445         u64 new;
2446
2447         if (kstrtou64(buf, 0, &new) < 0)
2448                 return -EINVAL;
2449
2450         mutex_lock(&mce_sysfs_mutex);
2451         if (mca_cfg.ignore_ce ^ !!new) {
2452                 if (new) {
2453                         /* disable ce features */
2454                         mce_timer_delete_all();
2455                         on_each_cpu(mce_disable_cmci, NULL, 1);
2456                         mca_cfg.ignore_ce = true;
2457                 } else {
2458                         /* enable ce features */
2459                         mca_cfg.ignore_ce = false;
2460                         on_each_cpu(mce_enable_ce, (void *)1, 1);
2461                 }
2462         }
2463         mutex_unlock(&mce_sysfs_mutex);
2464
2465         return size;
2466 }
2467
2468 static ssize_t set_cmci_disabled(struct device *s,
2469                                  struct device_attribute *attr,
2470                                  const char *buf, size_t size)
2471 {
2472         u64 new;
2473
2474         if (kstrtou64(buf, 0, &new) < 0)
2475                 return -EINVAL;
2476
2477         mutex_lock(&mce_sysfs_mutex);
2478         if (mca_cfg.cmci_disabled ^ !!new) {
2479                 if (new) {
2480                         /* disable cmci */
2481                         on_each_cpu(mce_disable_cmci, NULL, 1);
2482                         mca_cfg.cmci_disabled = true;
2483                 } else {
2484                         /* enable cmci */
2485                         mca_cfg.cmci_disabled = false;
2486                         on_each_cpu(mce_enable_ce, NULL, 1);
2487                 }
2488         }
2489         mutex_unlock(&mce_sysfs_mutex);
2490
2491         return size;
2492 }
2493
2494 static ssize_t store_int_with_restart(struct device *s,
2495                                       struct device_attribute *attr,
2496                                       const char *buf, size_t size)
2497 {
2498         unsigned long old_check_interval = check_interval;
2499         ssize_t ret = device_store_ulong(s, attr, buf, size);
2500
2501         if (check_interval == old_check_interval)
2502                 return ret;
2503
2504         mutex_lock(&mce_sysfs_mutex);
2505         mce_restart();
2506         mutex_unlock(&mce_sysfs_mutex);
2507
2508         return ret;
2509 }
2510
2511 static DEVICE_INT_ATTR(monarch_timeout, 0644, mca_cfg.monarch_timeout);
2512 static DEVICE_BOOL_ATTR(dont_log_ce, 0644, mca_cfg.dont_log_ce);
2513 static DEVICE_BOOL_ATTR(print_all, 0644, mca_cfg.print_all);
2514
2515 static struct dev_ext_attribute dev_attr_check_interval = {
2516         __ATTR(check_interval, 0644, device_show_int, store_int_with_restart),
2517         &check_interval
2518 };
2519
2520 static struct dev_ext_attribute dev_attr_ignore_ce = {
2521         __ATTR(ignore_ce, 0644, device_show_bool, set_ignore_ce),
2522         &mca_cfg.ignore_ce
2523 };
2524
2525 static struct dev_ext_attribute dev_attr_cmci_disabled = {
2526         __ATTR(cmci_disabled, 0644, device_show_bool, set_cmci_disabled),
2527         &mca_cfg.cmci_disabled
2528 };
2529
2530 static struct device_attribute *mce_device_attrs[] = {
2531         &dev_attr_check_interval.attr,
2532 #ifdef CONFIG_X86_MCELOG_LEGACY
2533         &dev_attr_trigger,
2534 #endif
2535         &dev_attr_monarch_timeout.attr,
2536         &dev_attr_dont_log_ce.attr,
2537         &dev_attr_print_all.attr,
2538         &dev_attr_ignore_ce.attr,
2539         &dev_attr_cmci_disabled.attr,
2540         NULL
2541 };
2542
2543 static cpumask_var_t mce_device_initialized;
2544
/* Device release callback: frees the device allocated in mce_device_create(). */
static void mce_device_release(struct device *dev)
{
        kfree(dev);
}
2549
2550 /* Per CPU device init. All of the CPUs still share the same bank device: */
2551 static int mce_device_create(unsigned int cpu)
2552 {
2553         struct device *dev;
2554         int err;
2555         int i, j;
2556
2557         if (!mce_available(&boot_cpu_data))
2558                 return -EIO;
2559
2560         dev = per_cpu(mce_device, cpu);
2561         if (dev)
2562                 return 0;
2563
2564         dev = kzalloc(sizeof(*dev), GFP_KERNEL);
2565         if (!dev)
2566                 return -ENOMEM;
2567         dev->id  = cpu;
2568         dev->bus = &mce_subsys;
2569         dev->release = &mce_device_release;
2570
2571         err = device_register(dev);
2572         if (err) {
2573                 put_device(dev);
2574                 return err;
2575         }
2576
2577         for (i = 0; mce_device_attrs[i]; i++) {
2578                 err = device_create_file(dev, mce_device_attrs[i]);
2579                 if (err)
2580                         goto error;
2581         }
2582         for (j = 0; j < per_cpu(mce_num_banks, cpu); j++) {
2583                 err = device_create_file(dev, &mce_bank_devs[j].attr);
2584                 if (err)
2585                         goto error2;
2586         }
2587         cpumask_set_cpu(cpu, mce_device_initialized);
2588         per_cpu(mce_device, cpu) = dev;
2589
2590         return 0;
2591 error2:
2592         while (--j >= 0)
2593                 device_remove_file(dev, &mce_bank_devs[j].attr);
2594 error:
2595         while (--i >= 0)
2596                 device_remove_file(dev, mce_device_attrs[i]);
2597
2598         device_unregister(dev);
2599
2600         return err;
2601 }
2602
2603 static void mce_device_remove(unsigned int cpu)
2604 {
2605         struct device *dev = per_cpu(mce_device, cpu);
2606         int i;
2607
2608         if (!cpumask_test_cpu(cpu, mce_device_initialized))
2609                 return;
2610
2611         for (i = 0; mce_device_attrs[i]; i++)
2612                 device_remove_file(dev, mce_device_attrs[i]);
2613
2614         for (i = 0; i < per_cpu(mce_num_banks, cpu); i++)
2615                 device_remove_file(dev, &mce_bank_devs[i].attr);
2616
2617         device_unregister(dev);
2618         cpumask_clear_cpu(cpu, mce_device_initialized);
2619         per_cpu(mce_device, cpu) = NULL;
2620 }
2621
2622 /* Make sure there are no machine checks on offlined CPUs. */
2623 static void mce_disable_cpu(void)
2624 {
2625         if (!mce_available(raw_cpu_ptr(&cpu_info)))
2626                 return;
2627
2628         if (!cpuhp_tasks_frozen)
2629                 cmci_clear();
2630
2631         vendor_disable_error_reporting();
2632 }
2633
2634 static void mce_reenable_cpu(void)
2635 {
2636         struct mce_bank *mce_banks = this_cpu_ptr(mce_banks_array);
2637         int i;
2638
2639         if (!mce_available(raw_cpu_ptr(&cpu_info)))
2640                 return;
2641
2642         if (!cpuhp_tasks_frozen)
2643                 cmci_reenable();
2644         for (i = 0; i < this_cpu_read(mce_num_banks); i++) {
2645                 struct mce_bank *b = &mce_banks[i];
2646
2647                 if (b->init)
2648                         wrmsrl(mca_msr_reg(i, MCA_CTL), b->ctl);
2649         }
2650 }
2651
2652 static int mce_cpu_dead(unsigned int cpu)
2653 {
2654         mce_intel_hcpu_update(cpu);
2655
2656         /* intentionally ignoring frozen here */
2657         if (!cpuhp_tasks_frozen)
2658                 cmci_rediscover();
2659         return 0;
2660 }
2661
2662 static int mce_cpu_online(unsigned int cpu)
2663 {
2664         struct timer_list *t = this_cpu_ptr(&mce_timer);
2665         int ret;
2666
2667         mce_device_create(cpu);
2668
2669         ret = mce_threshold_create_device(cpu);
2670         if (ret) {
2671                 mce_device_remove(cpu);
2672                 return ret;
2673         }
2674         mce_reenable_cpu();
2675         mce_start_timer(t);
2676         return 0;
2677 }
2678
2679 static int mce_cpu_pre_down(unsigned int cpu)
2680 {
2681         struct timer_list *t = this_cpu_ptr(&mce_timer);
2682
2683         mce_disable_cpu();
2684         del_timer_sync(t);
2685         mce_threshold_remove_device(cpu);
2686         mce_device_remove(cpu);
2687         return 0;
2688 }
2689
2690 static __init void mce_init_banks(void)
2691 {
2692         int i;
2693
2694         for (i = 0; i < MAX_NR_BANKS; i++) {
2695                 struct mce_bank_dev *b = &mce_bank_devs[i];
2696                 struct device_attribute *a = &b->attr;
2697
2698                 b->bank = i;
2699
2700                 sysfs_attr_init(&a->attr);
2701                 a->attr.name    = b->attrname;
2702                 snprintf(b->attrname, ATTR_LEN, "bank%d", i);
2703
2704                 a->attr.mode    = 0644;
2705                 a->show         = show_bank;
2706                 a->store        = set_bank;
2707         }
2708 }
2709
2710 /*
2711  * When running on XEN, this initcall is ordered against the XEN mcelog
2712  * initcall:
2713  *
2714  *   device_initcall(xen_late_init_mcelog);
2715  *   device_initcall_sync(mcheck_init_device);
2716  */
2717 static __init int mcheck_init_device(void)
2718 {
2719         int err;
2720
2721         /*
2722          * Check if we have a spare virtual bit. This will only become
2723          * a problem if/when we move beyond 5-level page tables.
2724          */
2725         MAYBE_BUILD_BUG_ON(__VIRTUAL_MASK_SHIFT >= 63);
2726
2727         if (!mce_available(&boot_cpu_data)) {
2728                 err = -EIO;
2729                 goto err_out;
2730         }
2731
2732         if (!zalloc_cpumask_var(&mce_device_initialized, GFP_KERNEL)) {
2733                 err = -ENOMEM;
2734                 goto err_out;
2735         }
2736
2737         mce_init_banks();
2738
2739         err = subsys_system_register(&mce_subsys, NULL);
2740         if (err)
2741                 goto err_out_mem;
2742
2743         err = cpuhp_setup_state(CPUHP_X86_MCE_DEAD, "x86/mce:dead", NULL,
2744                                 mce_cpu_dead);
2745         if (err)
2746                 goto err_out_mem;
2747
2748         /*
2749          * Invokes mce_cpu_online() on all CPUs which are online when
2750          * the state is installed.
2751          */
2752         err = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "x86/mce:online",
2753                                 mce_cpu_online, mce_cpu_pre_down);
2754         if (err < 0)
2755                 goto err_out_online;
2756
2757         register_syscore_ops(&mce_syscore_ops);
2758
2759         return 0;
2760
2761 err_out_online:
2762         cpuhp_remove_state(CPUHP_X86_MCE_DEAD);
2763
2764 err_out_mem:
2765         free_cpumask_var(mce_device_initialized);
2766
2767 err_out:
2768         pr_err("Unable to init MCE device (rc: %d)\n", err);
2769
2770         return err;
2771 }
2772 device_initcall_sync(mcheck_init_device);
2773
/*
 * Old style boot options parsing. Only for compatibility.
 *
 * Handler registered for the "nomce" boot parameter: sets
 * mca_cfg.disabled and ignores @str. Always returns 1.
 */
static int __init mcheck_disable(char *str)
{
        mca_cfg.disabled = 1;
        return 1;
}
__setup("nomce", mcheck_disable);
2783
2784 #ifdef CONFIG_DEBUG_FS
2785 struct dentry *mce_get_debugfs_dir(void)
2786 {
2787         static struct dentry *dmce;
2788
2789         if (!dmce)
2790                 dmce = debugfs_create_dir("mce", NULL);
2791
2792         return dmce;
2793 }
2794
/*
 * Reset the MCE bookkeeping state: zero the mce_fake_panicked,
 * mce_executing, mce_callin and global_nwo atomics and set every bit in
 * mce_missing_cpus.
 */
static void mce_reset(void)
{
        atomic_set(&mce_fake_panicked, 0);
        atomic_set(&mce_executing, 0);
        atomic_set(&mce_callin, 0);
        atomic_set(&global_nwo, 0);
        cpumask_setall(&mce_missing_cpus);
}
2803
/*
 * Getter for the fake_panic debugfs attribute: stores the current value
 * of fake_panic in *@val. @data is unused. Always returns 0.
 */
static int fake_panic_get(void *data, u64 *val)
{
        *val = fake_panic;
        return 0;
}
2809
/*
 * Setter for the fake_panic debugfs attribute: calls mce_reset() and
 * then stores @val in fake_panic. @data is unused. Always returns 0.
 */
static int fake_panic_set(void *data, u64 val)
{
        /* Reset the MCE state before the new value takes effect. */
        mce_reset();
        fake_panic = val;
        return 0;
}
2816
/*
 * Define fake_panic_fops: reads go through fake_panic_get(), writes
 * through fake_panic_set(), and the value is formatted with "%llu\n".
 */
DEFINE_DEBUGFS_ATTRIBUTE(fake_panic_fops, fake_panic_get, fake_panic_set,
                         "%llu\n");
2819
2820 static void __init mcheck_debugfs_init(void)
2821 {
2822         struct dentry *dmce;
2823
2824         dmce = mce_get_debugfs_dir();
2825         debugfs_create_file_unsafe("fake_panic", 0444, dmce, NULL,
2826                                    &fake_panic_fops);
2827 }
2828 #else
/* CONFIG_DEBUG_FS is not set: there are no debugfs entries to create. */
static void __init mcheck_debugfs_init(void) { }
2830 #endif
2831
/*
 * Late initcall for the MCE code: calls enable_copy_mc_fragile() when
 * mca_cfg.recovery is set, creates the debugfs entries and schedules
 * the MCE work. Always returns 0.
 */
static int __init mcheck_late_init(void)
{
        if (mca_cfg.recovery)
                enable_copy_mc_fragile();

        mcheck_debugfs_init();

        /*
         * Flush out everything that has been logged during early boot, now that
         * everything has been initialized (workqueues, decoders, ...).
         */
        mce_schedule_work();

        return 0;
}
late_initcall(mcheck_late_init);