GNU Linux-libre 5.10.217-gnu1
arch/x86/events/core.c
1 /*
2  * Performance events x86 architecture code
3  *
4  *  Copyright (C) 2008 Thomas Gleixner <tglx@linutronix.de>
5  *  Copyright (C) 2008-2009 Red Hat, Inc., Ingo Molnar
6  *  Copyright (C) 2009 Jaswinder Singh Rajput
7  *  Copyright (C) 2009 Advanced Micro Devices, Inc., Robert Richter
8  *  Copyright (C) 2008-2009 Red Hat, Inc., Peter Zijlstra
9  *  Copyright (C) 2009 Intel Corporation, <markus.t.metzger@intel.com>
10  *  Copyright (C) 2009 Google, Inc., Stephane Eranian
11  *
12  *  For licensing details see kernel-base/COPYING
13  */
14
15 #include <linux/perf_event.h>
16 #include <linux/capability.h>
17 #include <linux/notifier.h>
18 #include <linux/hardirq.h>
19 #include <linux/kprobes.h>
20 #include <linux/export.h>
21 #include <linux/init.h>
22 #include <linux/kdebug.h>
23 #include <linux/sched/mm.h>
24 #include <linux/sched/clock.h>
25 #include <linux/uaccess.h>
26 #include <linux/slab.h>
27 #include <linux/cpu.h>
28 #include <linux/bitops.h>
29 #include <linux/device.h>
30 #include <linux/nospec.h>
31 #include <linux/static_call.h>
32
33 #include <asm/apic.h>
34 #include <asm/stacktrace.h>
35 #include <asm/nmi.h>
36 #include <asm/smp.h>
37 #include <asm/alternative.h>
38 #include <asm/mmu_context.h>
39 #include <asm/tlbflush.h>
40 #include <asm/timer.h>
41 #include <asm/desc.h>
42 #include <asm/ldt.h>
43 #include <asm/unwind.h>
44
45 #include "perf_event.h"
46
47 struct x86_pmu x86_pmu __read_mostly;
48 static struct pmu pmu;
49
50 DEFINE_PER_CPU(struct cpu_hw_events, cpu_hw_events) = {
51         .enabled = 1,
52         .pmu = &pmu,
53 };
54
55 DEFINE_STATIC_KEY_FALSE(rdpmc_never_available_key);
56 DEFINE_STATIC_KEY_FALSE(rdpmc_always_available_key);
57
58 /*
59  * DEFINE_STATIC_CALL_NULL() is used here to get a static_call defined
60  * from just a typename, as opposed to an actual function.
61  */
62 DEFINE_STATIC_CALL_NULL(x86_pmu_handle_irq,  *x86_pmu.handle_irq);
63 DEFINE_STATIC_CALL_NULL(x86_pmu_disable_all, *x86_pmu.disable_all);
64 DEFINE_STATIC_CALL_NULL(x86_pmu_enable_all,  *x86_pmu.enable_all);
65 DEFINE_STATIC_CALL_NULL(x86_pmu_enable,      *x86_pmu.enable);
66 DEFINE_STATIC_CALL_NULL(x86_pmu_disable,     *x86_pmu.disable);
67
68 DEFINE_STATIC_CALL_NULL(x86_pmu_add,  *x86_pmu.add);
69 DEFINE_STATIC_CALL_NULL(x86_pmu_del,  *x86_pmu.del);
70 DEFINE_STATIC_CALL_NULL(x86_pmu_read, *x86_pmu.read);
71
72 DEFINE_STATIC_CALL_NULL(x86_pmu_schedule_events,       *x86_pmu.schedule_events);
73 DEFINE_STATIC_CALL_NULL(x86_pmu_get_event_constraints, *x86_pmu.get_event_constraints);
74 DEFINE_STATIC_CALL_NULL(x86_pmu_put_event_constraints, *x86_pmu.put_event_constraints);
75
76 DEFINE_STATIC_CALL_NULL(x86_pmu_start_scheduling,  *x86_pmu.start_scheduling);
77 DEFINE_STATIC_CALL_NULL(x86_pmu_commit_scheduling, *x86_pmu.commit_scheduling);
78 DEFINE_STATIC_CALL_NULL(x86_pmu_stop_scheduling,   *x86_pmu.stop_scheduling);
79
80 DEFINE_STATIC_CALL_NULL(x86_pmu_sched_task,    *x86_pmu.sched_task);
81 DEFINE_STATIC_CALL_NULL(x86_pmu_swap_task_ctx, *x86_pmu.swap_task_ctx);
82
83 DEFINE_STATIC_CALL_NULL(x86_pmu_drain_pebs,   *x86_pmu.drain_pebs);
84 DEFINE_STATIC_CALL_NULL(x86_pmu_pebs_aliases, *x86_pmu.pebs_aliases);
85
86 u64 __read_mostly hw_cache_event_ids
87                                 [PERF_COUNT_HW_CACHE_MAX]
88                                 [PERF_COUNT_HW_CACHE_OP_MAX]
89                                 [PERF_COUNT_HW_CACHE_RESULT_MAX];
90 u64 __read_mostly hw_cache_extra_regs
91                                 [PERF_COUNT_HW_CACHE_MAX]
92                                 [PERF_COUNT_HW_CACHE_OP_MAX]
93                                 [PERF_COUNT_HW_CACHE_RESULT_MAX];
94
95 /*
96  * Propagate event elapsed time into the generic event.
97  * Can only be executed on the CPU where the event is active.
98  * Returns the new raw counter value.
99  */
100 u64 x86_perf_event_update(struct perf_event *event)
101 {
102         struct hw_perf_event *hwc = &event->hw;
103         int shift = 64 - x86_pmu.cntval_bits;
104         u64 prev_raw_count, new_raw_count;
105         u64 delta;
106
107         if (unlikely(!hwc->event_base))
108                 return 0;
109
110         if (unlikely(is_topdown_count(event)) && x86_pmu.update_topdown_event)
111                 return x86_pmu.update_topdown_event(event);
112
113         /*
114          * Careful: an NMI might modify the previous event value.
115          *
116          * Our tactic to handle this is to first atomically read and
117          * exchange a new raw count - then add that new-prev delta
118          * count to the generic event atomically:
119          */
120 again:
121         prev_raw_count = local64_read(&hwc->prev_count);
122         rdpmcl(hwc->event_base_rdpmc, new_raw_count);
123
124         if (local64_cmpxchg(&hwc->prev_count, prev_raw_count,
125                                         new_raw_count) != prev_raw_count)
126                 goto again;
127
128         /*
129          * Now we have the new raw value and have updated the prev
130          * timestamp already. We can now calculate the elapsed delta
131          * (event-)time and add that to the generic event.
132          *
133          * Careful, not all hw sign-extends above the physical width
134          * of the count.
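         * Concretely, with 48-bit counters shift is 16: shifting both raw
         * values up by 16 discards any bits above the counter width, so the
         * subtraction wraps correctly either way, and shifting back down
         * yields the delta modulo 2^48.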
135          */
136         delta = (new_raw_count << shift) - (prev_raw_count << shift);
137         delta >>= shift;
138
139         local64_add(delta, &event->count);
140         local64_sub(delta, &hwc->period_left);
141
142         return new_raw_count;
143 }
144
145 /*
146  * Find and validate any extra registers to set up.
147  */
148 static int x86_pmu_extra_regs(u64 config, struct perf_event *event)
149 {
150         struct hw_perf_event_extra *reg;
151         struct extra_reg *er;
152
153         reg = &event->hw.extra_reg;
154
155         if (!x86_pmu.extra_regs)
156                 return 0;
157
158         for (er = x86_pmu.extra_regs; er->msr; er++) {
159                 if (er->event != (config & er->config_mask))
160                         continue;
161                 if (event->attr.config1 & ~er->valid_mask)
162                         return -EINVAL;
163                 /* Check if the extra MSRs can be safely accessed */
164                 if (!er->extra_msr_access)
165                         return -ENXIO;
166
167                 reg->idx = er->idx;
168                 reg->config = event->attr.config1;
169                 reg->reg = er->msr;
170                 break;
171         }
172         return 0;
173 }
174
175 static atomic_t active_events;
176 static atomic_t pmc_refcount;
177 static DEFINE_MUTEX(pmc_reserve_mutex);
178
179 #ifdef CONFIG_X86_LOCAL_APIC
180
181 static bool reserve_pmc_hardware(void)
182 {
183         int i;
184
185         for (i = 0; i < x86_pmu.num_counters; i++) {
186                 if (!reserve_perfctr_nmi(x86_pmu_event_addr(i)))
187                         goto perfctr_fail;
188         }
189
190         for (i = 0; i < x86_pmu.num_counters; i++) {
191                 if (!reserve_evntsel_nmi(x86_pmu_config_addr(i)))
192                         goto eventsel_fail;
193         }
194
195         return true;
196
197 eventsel_fail:
198         for (i--; i >= 0; i--)
199                 release_evntsel_nmi(x86_pmu_config_addr(i));
200
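        /*
         * All perfctrs were successfully reserved above; reset i so the
         * fall-through below releases every one of them.
         */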
201         i = x86_pmu.num_counters;
202
203 perfctr_fail:
204         for (i--; i >= 0; i--)
205                 release_perfctr_nmi(x86_pmu_event_addr(i));
206
207         return false;
208 }
209
210 static void release_pmc_hardware(void)
211 {
212         int i;
213
214         for (i = 0; i < x86_pmu.num_counters; i++) {
215                 release_perfctr_nmi(x86_pmu_event_addr(i));
216                 release_evntsel_nmi(x86_pmu_config_addr(i));
217         }
218 }
219
220 #else
221
222 static bool reserve_pmc_hardware(void) { return true; }
223 static void release_pmc_hardware(void) {}
224
225 #endif
226
227 static bool check_hw_exists(void)
228 {
229         u64 val, val_fail = -1, val_new = ~0;
230         int i, reg, reg_fail = -1, ret = 0;
231         int bios_fail = 0;
232         int reg_safe = -1;
233
234         /*
235          * Check to see if the BIOS enabled any of the counters; if so,
236          * complain and bail.
237          */
238         for (i = 0; i < x86_pmu.num_counters; i++) {
239                 reg = x86_pmu_config_addr(i);
240                 ret = rdmsrl_safe(reg, &val);
241                 if (ret)
242                         goto msr_fail;
243                 if (val & ARCH_PERFMON_EVENTSEL_ENABLE) {
244                         bios_fail = 1;
245                         val_fail = val;
246                         reg_fail = reg;
247                 } else {
248                         reg_safe = i;
249                 }
250         }
251
252         if (x86_pmu.num_counters_fixed) {
253                 reg = MSR_ARCH_PERFMON_FIXED_CTR_CTRL;
254                 ret = rdmsrl_safe(reg, &val);
255                 if (ret)
256                         goto msr_fail;
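                /*
                 * Each fixed counter has a 4-bit field in FIXED_CTR_CTRL;
                 * bits 0-1 are the OS/USR enable bits, so any non-zero value
                 * there means the BIOS left the counter enabled.
                 */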
257                 for (i = 0; i < x86_pmu.num_counters_fixed; i++) {
258                         if (val & (0x03 << i*4)) {
259                                 bios_fail = 1;
260                                 val_fail = val;
261                                 reg_fail = reg;
262                         }
263                 }
264         }
265
266         /*
267          * If all the counters are enabled, the below test will always
268          * fail.  The tools will also become useless in this scenario.
269          * Just fail and disable the hardware counters.
270          */
271
272         if (reg_safe == -1) {
273                 reg = reg_safe;
274                 goto msr_fail;
275         }
276
277         /*
278          * Read the current value, change it and read it back to see if it
279          * matches; this is needed to detect certain hardware emulators
280          * (qemu/kvm) that don't trap on the MSR access and always return 0s.
281          */
282         reg = x86_pmu_event_addr(reg_safe);
283         if (rdmsrl_safe(reg, &val))
284                 goto msr_fail;
285         val ^= 0xffffUL;
286         ret = wrmsrl_safe(reg, val);
287         ret |= rdmsrl_safe(reg, &val_new);
288         if (ret || val != val_new)
289                 goto msr_fail;
290
291         /*
292          * We still allow the PMU driver to operate:
293          */
294         if (bios_fail) {
295                 pr_cont("Broken BIOS detected, complain to your hardware vendor.\n");
296                 pr_err(FW_BUG "the BIOS has corrupted hw-PMU resources (MSR %x is %Lx)\n",
297                               reg_fail, val_fail);
298         }
299
300         return true;
301
302 msr_fail:
303         if (boot_cpu_has(X86_FEATURE_HYPERVISOR)) {
304                 pr_cont("PMU not available due to virtualization, using software events only.\n");
305         } else {
306                 pr_cont("Broken PMU hardware detected, using software events only.\n");
307                 pr_err("Failed to access perfctr msr (MSR %x is %Lx)\n",
308                        reg, val_new);
309         }
310
311         return false;
312 }
313
314 static void hw_perf_event_destroy(struct perf_event *event)
315 {
316         x86_release_hardware();
317         atomic_dec(&active_events);
318 }
319
320 void hw_perf_lbr_event_destroy(struct perf_event *event)
321 {
322         hw_perf_event_destroy(event);
323
324         /* undo the lbr/bts event accounting */
325         x86_del_exclusive(x86_lbr_exclusive_lbr);
326 }
327
328 static inline int x86_pmu_initialized(void)
329 {
330         return x86_pmu.handle_irq != NULL;
331 }
332
333 static inline int
334 set_ext_hw_attr(struct hw_perf_event *hwc, struct perf_event *event)
335 {
336         struct perf_event_attr *attr = &event->attr;
337         unsigned int cache_type, cache_op, cache_result;
338         u64 config, val;
339
340         config = attr->config;
341
342         cache_type = (config >> 0) & 0xff;
343         if (cache_type >= PERF_COUNT_HW_CACHE_MAX)
344                 return -EINVAL;
345         cache_type = array_index_nospec(cache_type, PERF_COUNT_HW_CACHE_MAX);
346
347         cache_op = (config >>  8) & 0xff;
348         if (cache_op >= PERF_COUNT_HW_CACHE_OP_MAX)
349                 return -EINVAL;
350         cache_op = array_index_nospec(cache_op, PERF_COUNT_HW_CACHE_OP_MAX);
351
352         cache_result = (config >> 16) & 0xff;
353         if (cache_result >= PERF_COUNT_HW_CACHE_RESULT_MAX)
354                 return -EINVAL;
355         cache_result = array_index_nospec(cache_result, PERF_COUNT_HW_CACHE_RESULT_MAX);
356
357         val = hw_cache_event_ids[cache_type][cache_op][cache_result];
358
359         if (val == 0)
360                 return -ENOENT;
361
362         if (val == -1)
363                 return -EINVAL;
364
365         hwc->config |= val;
366         attr->config1 = hw_cache_extra_regs[cache_type][cache_op][cache_result];
367         return x86_pmu_extra_regs(val, event);
368 }
369
370 int x86_reserve_hardware(void)
371 {
372         int err = 0;
373
374         if (!atomic_inc_not_zero(&pmc_refcount)) {
375                 mutex_lock(&pmc_reserve_mutex);
376                 if (atomic_read(&pmc_refcount) == 0) {
377                         if (!reserve_pmc_hardware()) {
378                                 err = -EBUSY;
379                         } else {
380                                 reserve_ds_buffers();
381                                 reserve_lbr_buffers();
382                         }
383                 }
384                 if (!err)
385                         atomic_inc(&pmc_refcount);
386                 mutex_unlock(&pmc_reserve_mutex);
387         }
388
389         return err;
390 }
391
392 void x86_release_hardware(void)
393 {
394         if (atomic_dec_and_mutex_lock(&pmc_refcount, &pmc_reserve_mutex)) {
395                 release_pmc_hardware();
396                 release_ds_buffers();
397                 release_lbr_buffers();
398                 mutex_unlock(&pmc_reserve_mutex);
399         }
400 }
401
402 /*
403  * Check if we can create an event of a certain type (i.e. that no
404  * conflicting events are present).
405  */
406 int x86_add_exclusive(unsigned int what)
407 {
408         int i;
409
410         /*
411          * When lbr_pt_coexist is set, we allow PT to coexist with either
412          * LBR or BTS. LBR and BTS are still mutually exclusive.
413          */
414         if (x86_pmu.lbr_pt_coexist && what == x86_lbr_exclusive_pt)
415                 goto out;
416
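        /*
         * Fast path: if events of this type are already active, the counter
         * is non-zero and no conflicting type can be active, so just bump it.
         */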
417         if (!atomic_inc_not_zero(&x86_pmu.lbr_exclusive[what])) {
418                 mutex_lock(&pmc_reserve_mutex);
419                 for (i = 0; i < ARRAY_SIZE(x86_pmu.lbr_exclusive); i++) {
420                         if (i != what && atomic_read(&x86_pmu.lbr_exclusive[i]))
421                                 goto fail_unlock;
422                 }
423                 atomic_inc(&x86_pmu.lbr_exclusive[what]);
424                 mutex_unlock(&pmc_reserve_mutex);
425         }
426
427 out:
428         atomic_inc(&active_events);
429         return 0;
430
431 fail_unlock:
432         mutex_unlock(&pmc_reserve_mutex);
433         return -EBUSY;
434 }
435
436 void x86_del_exclusive(unsigned int what)
437 {
438         atomic_dec(&active_events);
439
440         /*
441          * See the comment in x86_add_exclusive().
442          */
443         if (x86_pmu.lbr_pt_coexist && what == x86_lbr_exclusive_pt)
444                 return;
445
446         atomic_dec(&x86_pmu.lbr_exclusive[what]);
447 }
448
449 int x86_setup_perfctr(struct perf_event *event)
450 {
451         struct perf_event_attr *attr = &event->attr;
452         struct hw_perf_event *hwc = &event->hw;
453         u64 config;
454
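        /*
         * Counting (non-sampling) events still get a period so the counter
         * is reprogrammed before it can silently wrap: use the maximum.
         */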
455         if (!is_sampling_event(event)) {
456                 hwc->sample_period = x86_pmu.max_period;
457                 hwc->last_period = hwc->sample_period;
458                 local64_set(&hwc->period_left, hwc->sample_period);
459         }
460
461         if (attr->type == PERF_TYPE_RAW)
462                 return x86_pmu_extra_regs(event->attr.config, event);
463
464         if (attr->type == PERF_TYPE_HW_CACHE)
465                 return set_ext_hw_attr(hwc, event);
466
467         if (attr->config >= x86_pmu.max_events)
468                 return -EINVAL;
469
470         attr->config = array_index_nospec((unsigned long)attr->config, x86_pmu.max_events);
471
472         /*
473          * The generic map:
474          */
475         config = x86_pmu.event_map(attr->config);
476
477         if (config == 0)
478                 return -ENOENT;
479
480         if (config == -1LL)
481                 return -EINVAL;
482
483         hwc->config |= config;
484
485         return 0;
486 }
487
488 /*
489  * Check that branch_sample_type is compatible with the
490  * settings needed for precise_ip > 1, which implies
491  * using the LBR to capture ALL taken branches at the
492  * priv levels of the measurement.
493  */
494 static inline int precise_br_compat(struct perf_event *event)
495 {
496         u64 m = event->attr.branch_sample_type;
497         u64 b = 0;
498
499         /* must capture all branches */
500         if (!(m & PERF_SAMPLE_BRANCH_ANY))
501                 return 0;
502
503         m &= PERF_SAMPLE_BRANCH_KERNEL | PERF_SAMPLE_BRANCH_USER;
504
505         if (!event->attr.exclude_user)
506                 b |= PERF_SAMPLE_BRANCH_USER;
507
508         if (!event->attr.exclude_kernel)
509                 b |= PERF_SAMPLE_BRANCH_KERNEL;
510
511         /*
512          * ignore PERF_SAMPLE_BRANCH_HV, not supported on x86
513          */
514
515         return m == b;
516 }
517
518 int x86_pmu_max_precise(void)
519 {
520         int precise = 0;
521
522         /* Support for constant skid */
523         if (x86_pmu.pebs_active && !x86_pmu.pebs_broken) {
524                 precise++;
525
526                 /* Support for IP fixup */
527                 if (x86_pmu.lbr_nr || x86_pmu.intel_cap.pebs_format >= 2)
528                         precise++;
529
530                 if (x86_pmu.pebs_prec_dist)
531                         precise++;
532         }
533         return precise;
534 }
535
536 int x86_pmu_hw_config(struct perf_event *event)
537 {
538         if (event->attr.precise_ip) {
539                 int precise = x86_pmu_max_precise();
540
541                 if (event->attr.precise_ip > precise)
542                         return -EOPNOTSUPP;
543
544                 /* There's no sense in having PEBS for non-sampling events: */
545                 if (!is_sampling_event(event))
546                         return -EINVAL;
547         }
548         /*
549          * Check that the PEBS LBR correction does not conflict with
550          * whatever the user is requesting via attr->branch_sample_type.
551          */
552         if (event->attr.precise_ip > 1 && x86_pmu.intel_cap.pebs_format < 2) {
553                 u64 *br_type = &event->attr.branch_sample_type;
554
555                 if (has_branch_stack(event)) {
556                         if (!precise_br_compat(event))
557                                 return -EOPNOTSUPP;
558
559                         /* branch_sample_type is compatible */
560
561                 } else {
562                         /*
563                          * The user did not specify branch_sample_type.
564                          *
565                          * For PEBS fixups, we capture all
566                          * the branches at the priv level of the
567                          * event.
568                          */
569                         *br_type = PERF_SAMPLE_BRANCH_ANY;
570
571                         if (!event->attr.exclude_user)
572                                 *br_type |= PERF_SAMPLE_BRANCH_USER;
573
574                         if (!event->attr.exclude_kernel)
575                                 *br_type |= PERF_SAMPLE_BRANCH_KERNEL;
576                 }
577         }
578
579         if (event->attr.branch_sample_type & PERF_SAMPLE_BRANCH_CALL_STACK)
580                 event->attach_state |= PERF_ATTACH_TASK_DATA;
581
582         /*
583          * Generate PMC IRQs:
584          * (keep 'enabled' bit clear for now)
585          */
586         event->hw.config = ARCH_PERFMON_EVENTSEL_INT;
587
588         /*
589          * Count user and OS events unless requested not to
590          */
591         if (!event->attr.exclude_user)
592                 event->hw.config |= ARCH_PERFMON_EVENTSEL_USR;
593         if (!event->attr.exclude_kernel)
594                 event->hw.config |= ARCH_PERFMON_EVENTSEL_OS;
595
596         if (event->attr.type == PERF_TYPE_RAW)
597                 event->hw.config |= event->attr.config & X86_RAW_EVENT_MASK;
598
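        /*
         * If the driver would have to raise the period above what the user
         * asked for, refuse the event rather than silently sampling less
         * often than requested.
         */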
599         if (event->attr.sample_period && x86_pmu.limit_period) {
600                 if (x86_pmu.limit_period(event, event->attr.sample_period) >
601                                 event->attr.sample_period)
602                         return -EINVAL;
603         }
604
605         /* sample_regs_user never supports XMM registers */
606         if (unlikely(event->attr.sample_regs_user & PERF_REG_EXTENDED_MASK))
607                 return -EINVAL;
608         /*
609          * Besides the general-purpose registers, XMM registers may
610          * be collected in PEBS on some platforms, e.g. Icelake.
611          */
612         if (unlikely(event->attr.sample_regs_intr & PERF_REG_EXTENDED_MASK)) {
613                 if (!(event->pmu->capabilities & PERF_PMU_CAP_EXTENDED_REGS))
614                         return -EINVAL;
615
616                 if (!event->attr.precise_ip)
617                         return -EINVAL;
618         }
619
620         return x86_setup_perfctr(event);
621 }
622
623 /*
624  * Setup the hardware configuration for a given attr_type
625  */
626 static int __x86_pmu_event_init(struct perf_event *event)
627 {
628         int err;
629
630         if (!x86_pmu_initialized())
631                 return -ENODEV;
632
633         err = x86_reserve_hardware();
634         if (err)
635                 return err;
636
637         atomic_inc(&active_events);
638         event->destroy = hw_perf_event_destroy;
639
640         event->hw.idx = -1;
641         event->hw.last_cpu = -1;
642         event->hw.last_tag = ~0ULL;
643
644         /* mark unused */
645         event->hw.extra_reg.idx = EXTRA_REG_NONE;
646         event->hw.branch_reg.idx = EXTRA_REG_NONE;
647
648         return x86_pmu.hw_config(event);
649 }
650
651 void x86_pmu_disable_all(void)
652 {
653         struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
654         int idx;
655
656         for (idx = 0; idx < x86_pmu.num_counters; idx++) {
657                 struct hw_perf_event *hwc = &cpuc->events[idx]->hw;
658                 u64 val;
659
660                 if (!test_bit(idx, cpuc->active_mask))
661                         continue;
662                 rdmsrl(x86_pmu_config_addr(idx), val);
663                 if (!(val & ARCH_PERFMON_EVENTSEL_ENABLE))
664                         continue;
665                 val &= ~ARCH_PERFMON_EVENTSEL_ENABLE;
666                 wrmsrl(x86_pmu_config_addr(idx), val);
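                /*
                 * Large increment events use two adjacent counters; also
                 * clear the control of the paired Merge counter that follows.
                 */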
667                 if (is_counter_pair(hwc))
668                         wrmsrl(x86_pmu_config_addr(idx + 1), 0);
669         }
670 }
671
672 /*
673  * A PMI may land after enabled=0; it could hit either before or
674  * after disable_all.
675  *
676  * If the PMI hits before disable_all, the PMU will be disabled in the NMI
677  * handler. It will not be re-enabled in the NMI handler, because enabled=0.
678  * After handling the NMI, disable_all will be called, which does not change
679  * the state either. If the PMI hits after disable_all, the PMU is already
680  * disabled before entering the NMI handler, and the NMI handler will not
681  * change the state either.
682  *
683  * So either situation is harmless.
684  */
685 static void x86_pmu_disable(struct pmu *pmu)
686 {
687         struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
688
689         if (!x86_pmu_initialized())
690                 return;
691
692         if (!cpuc->enabled)
693                 return;
694
695         cpuc->n_added = 0;
696         cpuc->enabled = 0;
697         barrier();
698
699         static_call(x86_pmu_disable_all)();
700 }
701
702 void x86_pmu_enable_all(int added)
703 {
704         struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
705         int idx;
706
707         for (idx = 0; idx < x86_pmu.num_counters; idx++) {
708                 struct hw_perf_event *hwc = &cpuc->events[idx]->hw;
709
710                 if (!test_bit(idx, cpuc->active_mask))
711                         continue;
712
713                 __x86_pmu_enable_event(hwc, ARCH_PERFMON_EVENTSEL_ENABLE);
714         }
715 }
716
717 static inline int is_x86_event(struct perf_event *event)
718 {
719         return event->pmu == &pmu;
720 }
721
722 struct pmu *x86_get_pmu(unsigned int cpu)
723 {
724         struct cpu_hw_events *cpuc = &per_cpu(cpu_hw_events, cpu);
725
726         /*
727          * All CPUs of this hybrid type are offline;
728          * x86_get_pmu() should not be invoked.
729          */
730         if (WARN_ON_ONCE(!cpuc->pmu))
731                 return &pmu;
732
733         return cpuc->pmu;
734 }
735 /*
736  * Event scheduler state:
737  *
738  * Assign events by iterating over all events and counters, beginning
739  * with the events of least weight. Keep the current iterator
740  * state in struct sched_state.
741  */
742 struct sched_state {
743         int     weight;
744         int     event;          /* event index */
745         int     counter;        /* counter index */
746         int     unassigned;     /* number of events to be assigned left */
747         int     nr_gp;          /* number of GP counters used */
748         u64     used;           /* bitmask of used counters */
749 };
750
751 /* Total max is X86_PMC_IDX_MAX, but we are O(n!) limited */
752 #define SCHED_STATES_MAX        2
753
754 struct perf_sched {
755         int                     max_weight;
756         int                     max_events;
757         int                     max_gp;
758         int                     saved_states;
759         struct event_constraint **constraints;
760         struct sched_state      state;
761         struct sched_state      saved[SCHED_STATES_MAX];
762 };
763
764 /*
765  * Initialize the iterator that runs through all events and counters.
766  */
767 static void perf_sched_init(struct perf_sched *sched, struct event_constraint **constraints,
768                             int num, int wmin, int wmax, int gpmax)
769 {
770         int idx;
771
772         memset(sched, 0, sizeof(*sched));
773         sched->max_events       = num;
774         sched->max_weight       = wmax;
775         sched->max_gp           = gpmax;
776         sched->constraints      = constraints;
777
778         for (idx = 0; idx < num; idx++) {
779                 if (constraints[idx]->weight == wmin)
780                         break;
781         }
782
783         sched->state.event      = idx;          /* start with min weight */
784         sched->state.weight     = wmin;
785         sched->state.unassigned = num;
786 }
787
788 static void perf_sched_save_state(struct perf_sched *sched)
789 {
790         if (WARN_ON_ONCE(sched->saved_states >= SCHED_STATES_MAX))
791                 return;
792
793         sched->saved[sched->saved_states] = sched->state;
794         sched->saved_states++;
795 }
796
797 static bool perf_sched_restore_state(struct perf_sched *sched)
798 {
799         if (!sched->saved_states)
800                 return false;
801
802         sched->saved_states--;
803         sched->state = sched->saved[sched->saved_states];
804
805         /* this assignment didn't work out */
806         /* XXX broken vs EVENT_PAIR */
807         sched->state.used &= ~BIT_ULL(sched->state.counter);
808
809         /* try the next one */
810         sched->state.counter++;
811
812         return true;
813 }
814
815 /*
816  * Select a counter for the current event to schedule. Return true on
817  * success.
818  */
819 static bool __perf_sched_find_counter(struct perf_sched *sched)
820 {
821         struct event_constraint *c;
822         int idx;
823
824         if (!sched->state.unassigned)
825                 return false;
826
827         if (sched->state.event >= sched->max_events)
828                 return false;
829
830         c = sched->constraints[sched->state.event];
831         /* Prefer fixed-purpose counters */
832         if (c->idxmsk64 & (~0ULL << INTEL_PMC_IDX_FIXED)) {
833                 idx = INTEL_PMC_IDX_FIXED;
834                 for_each_set_bit_from(idx, c->idxmsk, X86_PMC_IDX_MAX) {
835                         u64 mask = BIT_ULL(idx);
836
837                         if (sched->state.used & mask)
838                                 continue;
839
840                         sched->state.used |= mask;
841                         goto done;
842                 }
843         }
844
845         /* Grab the first unused counter starting with idx */
846         idx = sched->state.counter;
847         for_each_set_bit_from(idx, c->idxmsk, INTEL_PMC_IDX_FIXED) {
848                 u64 mask = BIT_ULL(idx);
849
850                 if (c->flags & PERF_X86_EVENT_PAIR)
851                         mask |= mask << 1;
852
853                 if (sched->state.used & mask)
854                         continue;
855
856                 if (sched->state.nr_gp++ >= sched->max_gp)
857                         return false;
858
859                 sched->state.used |= mask;
860                 goto done;
861         }
862
863         return false;
864
865 done:
866         sched->state.counter = idx;
867
868         if (c->overlap)
869                 perf_sched_save_state(sched);
870
871         return true;
872 }
873
874 static bool perf_sched_find_counter(struct perf_sched *sched)
875 {
876         while (!__perf_sched_find_counter(sched)) {
877                 if (!perf_sched_restore_state(sched))
878                         return false;
879         }
880
881         return true;
882 }
883
884 /*
885  * Go through all unassigned events and find the next one to schedule.
886  * Take events with the least weight first. Return true on success.
887  */
888 static bool perf_sched_next_event(struct perf_sched *sched)
889 {
890         struct event_constraint *c;
891
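        /* Done if there was nothing to assign, or the last event was just placed. */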
892         if (!sched->state.unassigned || !--sched->state.unassigned)
893                 return false;
894
895         do {
896                 /* next event */
897                 sched->state.event++;
898                 if (sched->state.event >= sched->max_events) {
899                         /* next weight */
900                         sched->state.event = 0;
901                         sched->state.weight++;
902                         if (sched->state.weight > sched->max_weight)
903                                 return false;
904                 }
905                 c = sched->constraints[sched->state.event];
906         } while (c->weight != sched->state.weight);
907
908         sched->state.counter = 0;       /* start with first counter */
909
910         return true;
911 }
912
913 /*
914  * Assign a counter for each event.
915  */
916 int perf_assign_events(struct event_constraint **constraints, int n,
917                         int wmin, int wmax, int gpmax, int *assign)
918 {
919         struct perf_sched sched;
920
921         perf_sched_init(&sched, constraints, n, wmin, wmax, gpmax);
922
923         do {
924                 if (!perf_sched_find_counter(&sched))
925                         break;  /* failed */
926                 if (assign)
927                         assign[sched.state.event] = sched.state.counter;
928         } while (perf_sched_next_event(&sched));
929
930         return sched.state.unassigned;
931 }
932 EXPORT_SYMBOL_GPL(perf_assign_events);
933
934 int x86_schedule_events(struct cpu_hw_events *cpuc, int n, int *assign)
935 {
936         struct event_constraint *c;
937         struct perf_event *e;
938         int n0, i, wmin, wmax, unsched = 0;
939         struct hw_perf_event *hwc;
940         u64 used_mask = 0;
941
942         /*
943          * Compute the number of events already present; see x86_pmu_add(),
944          * validate_group() and x86_pmu_commit_txn(). For the former two
945          * cpuc->n_events hasn't been updated yet, while for the latter
946          * cpuc->n_txn contains the number of events added in the current
947          * transaction.
948          */
949         n0 = cpuc->n_events;
950         if (cpuc->txn_flags & PERF_PMU_TXN_ADD)
951                 n0 -= cpuc->n_txn;
952
953         static_call_cond(x86_pmu_start_scheduling)(cpuc);
954
955         for (i = 0, wmin = X86_PMC_IDX_MAX, wmax = 0; i < n; i++) {
956                 c = cpuc->event_constraint[i];
957
958                 /*
959                  * Previously scheduled events should have a cached constraint,
960                  * while new events should not have one.
961                  */
962                 WARN_ON_ONCE((c && i >= n0) || (!c && i < n0));
963
964                 /*
965                  * Request constraints for new events; or for those events that
966                  * have a dynamic constraint -- for those the constraint can
967                  * change due to external factors (sibling state, allow_tfa).
968                  */
969                 if (!c || (c->flags & PERF_X86_EVENT_DYNAMIC)) {
970                         c = static_call(x86_pmu_get_event_constraints)(cpuc, i, cpuc->event_list[i]);
971                         cpuc->event_constraint[i] = c;
972                 }
973
974                 wmin = min(wmin, c->weight);
975                 wmax = max(wmax, c->weight);
976         }
977
978         /*
979          * fastpath, try to reuse previous register
980          */
981         for (i = 0; i < n; i++) {
982                 u64 mask;
983
984                 hwc = &cpuc->event_list[i]->hw;
985                 c = cpuc->event_constraint[i];
986
987                 /* never assigned */
988                 if (hwc->idx == -1)
989                         break;
990
991                 /* constraint still honored */
992                 if (!test_bit(hwc->idx, c->idxmsk))
993                         break;
994
995                 mask = BIT_ULL(hwc->idx);
996                 if (is_counter_pair(hwc))
997                         mask |= mask << 1;
998
999                 /* not already used */
1000                 if (used_mask & mask)
1001                         break;
1002
1003                 used_mask |= mask;
1004
1005                 if (assign)
1006                         assign[i] = hwc->idx;
1007         }
1008
1009         /* slow path */
1010         if (i != n) {
1011                 int gpmax = x86_pmu.num_counters;
1012
1013                 /*
1014                  * Do not allow scheduling of more than half the available
1015                  * generic counters.
1016                  *
1017                  * This helps avoid counter starvation of the sibling thread by
1018                  * ensuring at most half the counters cannot be in exclusive
1019                  * mode. There are no designated counters for the limit; any
1020                  * N/2 counters can be used. This helps with events with
1021                  * specific counter constraints.
1022                  */
1023                 if (is_ht_workaround_enabled() && !cpuc->is_fake &&
1024                     READ_ONCE(cpuc->excl_cntrs->exclusive_present))
1025                         gpmax /= 2;
1026
1027                 /*
1028                  * Reduce the amount of available counters to allow fitting
1029                  * the extra Merge events needed by large increment events.
1030                  */
1031                 if (x86_pmu.flags & PMU_FL_PAIR) {
1032                         gpmax = x86_pmu.num_counters - cpuc->n_pair;
1033                         WARN_ON(gpmax <= 0);
1034                 }
1035
1036                 unsched = perf_assign_events(cpuc->event_constraint, n, wmin,
1037                                              wmax, gpmax, assign);
1038         }
1039
1040         /*
1041          * In case of success (unsched = 0), mark events as committed,
1042          * so we do not put_constraint() in case new events are added
1043          * and fail to be scheduled.
1044          *
1045          * We invoke the lower-level commit callback to lock the resource.
1046          *
1047          * We do not need to do all of this in case we are called to
1048          * validate an event group (assign == NULL)
1049          */
1050         if (!unsched && assign) {
1051                 for (i = 0; i < n; i++) {
1052                         e = cpuc->event_list[i];
1053                         static_call_cond(x86_pmu_commit_scheduling)(cpuc, i, assign[i]);
1054                 }
1055         } else {
1056                 for (i = n0; i < n; i++) {
1057                         e = cpuc->event_list[i];
1058
1059                         /*
1060                          * release events that failed scheduling
1061                          */
1062                         static_call_cond(x86_pmu_put_event_constraints)(cpuc, e);
1063
1064                         cpuc->event_constraint[i] = NULL;
1065                 }
1066         }
1067
1068         static_call_cond(x86_pmu_stop_scheduling)(cpuc);
1069
1070         return unsched ? -EINVAL : 0;
1071 }
1072
1073 static int add_nr_metric_event(struct cpu_hw_events *cpuc,
1074                                struct perf_event *event)
1075 {
1076         if (is_metric_event(event)) {
1077                 if (cpuc->n_metric == INTEL_TD_METRIC_NUM)
1078                         return -EINVAL;
1079                 cpuc->n_metric++;
1080                 cpuc->n_txn_metric++;
1081         }
1082
1083         return 0;
1084 }
1085
1086 static void del_nr_metric_event(struct cpu_hw_events *cpuc,
1087                                 struct perf_event *event)
1088 {
1089         if (is_metric_event(event))
1090                 cpuc->n_metric--;
1091 }
1092
1093 static int collect_event(struct cpu_hw_events *cpuc, struct perf_event *event,
1094                          int max_count, int n)
1095 {
1096
1097         if (x86_pmu.intel_cap.perf_metrics && add_nr_metric_event(cpuc, event))
1098                 return -EINVAL;
1099
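        /*
         * Metric events occupy a slot in the event list but are all mapped
         * onto fixed counter 3, so widen the limit by n_metric.
         */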
1100         if (n >= max_count + cpuc->n_metric)
1101                 return -EINVAL;
1102
1103         cpuc->event_list[n] = event;
1104         if (is_counter_pair(&event->hw)) {
1105                 cpuc->n_pair++;
1106                 cpuc->n_txn_pair++;
1107         }
1108
1109         return 0;
1110 }
1111
1112 /*
1113  * dogrp: true if sibling events (the group) must be collected too
1114  * Returns the total number of events, or an error code.
1115  */
1116 static int collect_events(struct cpu_hw_events *cpuc, struct perf_event *leader, bool dogrp)
1117 {
1118         struct perf_event *event;
1119         int n, max_count;
1120
1121         max_count = x86_pmu.num_counters + x86_pmu.num_counters_fixed;
1122
1123         /* current number of events already accepted */
1124         n = cpuc->n_events;
1125         if (!cpuc->n_events)
1126                 cpuc->pebs_output = 0;
1127
1128         if (!cpuc->is_fake && leader->attr.precise_ip) {
1129                 /*
1130                  * For PEBS->PT, if !aux_event, the group leader (PT) went
1131                  * away, the group was broken up, and this singleton event
1132                  * can no longer be scheduled.
1133                  */
1134                 if (is_pebs_pt(leader) && !leader->aux_event)
1135                         return -EINVAL;
1136
1137                 /*
1138                  * pebs_output: 0: no PEBS so far, 1: PT, 2: DS
1139                  */
1140                 if (cpuc->pebs_output &&
1141                     cpuc->pebs_output != is_pebs_pt(leader) + 1)
1142                         return -EINVAL;
1143
1144                 cpuc->pebs_output = is_pebs_pt(leader) + 1;
1145         }
1146
1147         if (is_x86_event(leader)) {
1148                 if (collect_event(cpuc, leader, max_count, n))
1149                         return -EINVAL;
1150                 n++;
1151         }
1152
1153         if (!dogrp)
1154                 return n;
1155
1156         for_each_sibling_event(event, leader) {
1157                 if (!is_x86_event(event) || event->state <= PERF_EVENT_STATE_OFF)
1158                         continue;
1159
1160                 if (collect_event(cpuc, event, max_count, n))
1161                         return -EINVAL;
1162
1163                 n++;
1164         }
1165         return n;
1166 }
1167
1168 static inline void x86_assign_hw_event(struct perf_event *event,
1169                                 struct cpu_hw_events *cpuc, int i)
1170 {
1171         struct hw_perf_event *hwc = &event->hw;
1172         int idx;
1173
1174         idx = hwc->idx = cpuc->assign[i];
1175         hwc->last_cpu = smp_processor_id();
1176         hwc->last_tag = ++cpuc->tags[i];
1177
1178         switch (hwc->idx) {
1179         case INTEL_PMC_IDX_FIXED_BTS:
1180         case INTEL_PMC_IDX_FIXED_VLBR:
1181                 hwc->config_base = 0;
1182                 hwc->event_base = 0;
1183                 break;
1184
1185         case INTEL_PMC_IDX_METRIC_BASE ... INTEL_PMC_IDX_METRIC_END:
1186                 /* All the metric events are mapped onto the fixed counter 3. */
1187                 idx = INTEL_PMC_IDX_FIXED_SLOTS;
1188                 /* fall through */
1189         case INTEL_PMC_IDX_FIXED ... INTEL_PMC_IDX_FIXED_BTS-1:
1190                 hwc->config_base = MSR_ARCH_PERFMON_FIXED_CTR_CTRL;
1191                 hwc->event_base = MSR_ARCH_PERFMON_FIXED_CTR0 +
1192                                 (idx - INTEL_PMC_IDX_FIXED);
1193                 hwc->event_base_rdpmc = (idx - INTEL_PMC_IDX_FIXED) |
1194                                         INTEL_PMC_FIXED_RDPMC_BASE;
1195                 break;
1196
1197         default:
1198                 hwc->config_base = x86_pmu_config_addr(hwc->idx);
1199                 hwc->event_base  = x86_pmu_event_addr(hwc->idx);
1200                 hwc->event_base_rdpmc = x86_pmu_rdpmc_index(hwc->idx);
1201                 break;
1202         }
1203 }
1204
1205 /**
1206  * x86_perf_rdpmc_index - Return PMC counter used for event
1207  * @event: the perf_event to which the PMC counter was assigned
1208  *
1209  * The counter assigned to this performance event may change if interrupts
1210  * are enabled. This counter should thus never be used while interrupts are
1211  * enabled. Before this function is used to obtain the assigned counter the
1212  * event should be checked for validity using, for example,
1213  * perf_event_read_local(), within the same interrupt disabled section in
1214  * which this counter is planned to be used.
1215  *
1216  * Return: The index of the performance monitoring counter assigned to
1217  * @event.
1218  */
1219 int x86_perf_rdpmc_index(struct perf_event *event)
1220 {
1221         lockdep_assert_irqs_disabled();
1222
1223         return event->hw.event_base_rdpmc;
1224 }
1225
1226 static inline int match_prev_assignment(struct hw_perf_event *hwc,
1227                                         struct cpu_hw_events *cpuc,
1228                                         int i)
1229 {
1230         return hwc->idx == cpuc->assign[i] &&
1231                 hwc->last_cpu == smp_processor_id() &&
1232                 hwc->last_tag == cpuc->tags[i];
1233 }
1234
1235 static void x86_pmu_start(struct perf_event *event, int flags);
1236
1237 static void x86_pmu_enable(struct pmu *pmu)
1238 {
1239         struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
1240         struct perf_event *event;
1241         struct hw_perf_event *hwc;
1242         int i, added = cpuc->n_added;
1243
1244         if (!x86_pmu_initialized())
1245                 return;
1246
1247         if (cpuc->enabled)
1248                 return;
1249
1250         if (cpuc->n_added) {
1251                 int n_running = cpuc->n_events - cpuc->n_added;
1252                 /*
1253                  * apply assignment obtained either from
1254                  * hw_perf_group_sched_in() or x86_pmu_enable()
1255                  *
1256                  * step1: save events moving to new counters
1257                  */
1258                 for (i = 0; i < n_running; i++) {
1259                         event = cpuc->event_list[i];
1260                         hwc = &event->hw;
1261
1262                         /*
1263                          * we can avoid reprogramming counter if:
1264                          * - assigned same counter as last time
1265                          * - running on same CPU as last time
1266                          * - no other event has used the counter since
1267                          */
1268                         if (hwc->idx == -1 ||
1269                             match_prev_assignment(hwc, cpuc, i))
1270                                 continue;
1271
1272                         /*
1273                          * Ensure we don't accidentally enable a stopped
1274                          * counter simply because we rescheduled.
1275                          */
1276                         if (hwc->state & PERF_HES_STOPPED)
1277                                 hwc->state |= PERF_HES_ARCH;
1278
1279                         x86_pmu_stop(event, PERF_EF_UPDATE);
1280                 }
1281
1282                 /*
1283                  * step2: reprogram moved events into new counters
1284                  */
1285                 for (i = 0; i < cpuc->n_events; i++) {
1286                         event = cpuc->event_list[i];
1287                         hwc = &event->hw;
1288
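                        /*
                         * Reprogram events that moved to a new counter; events
                         * that kept their counter and were already running
                         * (i < n_running) can be left alone.
                         */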
1289                         if (!match_prev_assignment(hwc, cpuc, i))
1290                                 x86_assign_hw_event(event, cpuc, i);
1291                         else if (i < n_running)
1292                                 continue;
1293
1294                         if (hwc->state & PERF_HES_ARCH)
1295                                 continue;
1296
1297                         x86_pmu_start(event, PERF_EF_RELOAD);
1298                 }
1299                 cpuc->n_added = 0;
1300                 perf_events_lapic_init();
1301         }
1302
1303         cpuc->enabled = 1;
1304         barrier();
1305
1306         static_call(x86_pmu_enable_all)(added);
1307 }
1308
1309 static DEFINE_PER_CPU(u64 [X86_PMC_IDX_MAX], pmc_prev_left);
1310
1311 /*
1312  * Set the next IRQ period, based on the hwc->period_left value.
1313  * To be called with the event disabled in hw:
1314  */
1315 int x86_perf_event_set_period(struct perf_event *event)
1316 {
1317         struct hw_perf_event *hwc = &event->hw;
1318         s64 left = local64_read(&hwc->period_left);
1319         s64 period = hwc->sample_period;
1320         int ret = 0, idx = hwc->idx;
1321
1322         if (unlikely(!hwc->event_base))
1323                 return 0;
1324
1325         if (unlikely(is_topdown_count(event)) &&
1326             x86_pmu.set_topdown_event_period)
1327                 return x86_pmu.set_topdown_event_period(event);
1328
1329         /*
1330          * If we are way outside a reasonable range then just skip forward:
1331          */
1332         if (unlikely(left <= -period)) {
1333                 left = period;
1334                 local64_set(&hwc->period_left, left);
1335                 hwc->last_period = period;
1336                 ret = 1;
1337         }
1338
1339         if (unlikely(left <= 0)) {
1340                 left += period;
1341                 local64_set(&hwc->period_left, left);
1342                 hwc->last_period = period;
1343                 ret = 1;
1344         }
1345         /*
1346          * Quirk: certain CPUs don't like it if just 1 hw_event is left:
1347          */
1348         if (unlikely(left < 2))
1349                 left = 2;
1350
1351         if (left > x86_pmu.max_period)
1352                 left = x86_pmu.max_period;
1353
1354         if (x86_pmu.limit_period)
1355                 left = x86_pmu.limit_period(event, left);
1356
1357         per_cpu(pmc_prev_left[idx], smp_processor_id()) = left;
1358
1359         /*
1360          * The hw event starts counting from this event offset;
1361          * mark it so we are able to extract future deltas:
1362          */
1363         local64_set(&hwc->prev_count, (u64)-left);
1364
1365         wrmsrl(hwc->event_base, (u64)(-left) & x86_pmu.cntval_mask);
1366
1367         /*
1368          * Sign extend the Merge event counter's upper 16 bits since
1369          * we currently declare a 48-bit counter width
1370          */
1371         if (is_counter_pair(hwc))
1372                 wrmsrl(x86_pmu_event_addr(idx + 1), 0xffff);
1373
1374         /*
1375          * Due to an erratum on certain CPUs, we need
1376          * a second write to be sure the register
1377          * is updated properly.
1378          */
1379         if (x86_pmu.perfctr_second_write) {
1380                 wrmsrl(hwc->event_base,
1381                         (u64)(-left) & x86_pmu.cntval_mask);
1382         }
1383
1384         perf_event_update_userpage(event);
1385
1386         return ret;
1387 }
1388
1389 void x86_pmu_enable_event(struct perf_event *event)
1390 {
1391         if (__this_cpu_read(cpu_hw_events.enabled))
1392                 __x86_pmu_enable_event(&event->hw,
1393                                        ARCH_PERFMON_EVENTSEL_ENABLE);
1394 }
1395
1396 /*
1397  * Add a single event to the PMU.
1398  *
1399  * The event is added to the group of enabled events
1400  * but only if it can be scheduled with existing events.
1401  */
1402 static int x86_pmu_add(struct perf_event *event, int flags)
1403 {
1404         struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
1405         struct hw_perf_event *hwc;
1406         int assign[X86_PMC_IDX_MAX];
1407         int n, n0, ret;
1408
1409         hwc = &event->hw;
1410
1411         n0 = cpuc->n_events;
1412         ret = n = collect_events(cpuc, event, false);
1413         if (ret < 0)
1414                 goto out;
1415
1416         hwc->state = PERF_HES_UPTODATE | PERF_HES_STOPPED;
1417         if (!(flags & PERF_EF_START))
1418                 hwc->state |= PERF_HES_ARCH;
1419
1420         /*
1421          * If a group event scheduling transaction was started,
1422          * skip the schedulability test here; it will be performed
1423          * at commit time (->commit_txn) as a whole.
1424          *
1425          * If commit fails, we'll call ->del() on all events
1426          * for which ->add() was called.
1427          */
1428         if (cpuc->txn_flags & PERF_PMU_TXN_ADD)
1429                 goto done_collect;
1430
1431         ret = static_call(x86_pmu_schedule_events)(cpuc, n, assign);
1432         if (ret)
1433                 goto out;
1434         /*
1435          * Copy the new assignment now that we know it is possible;
1436          * it will be used by hw_perf_enable().
1437          */
1438         memcpy(cpuc->assign, assign, n*sizeof(int));
1439
1440 done_collect:
1441         /*
1442          * Commit the collect_events() state. See x86_pmu_del() and
1443          * x86_pmu_*_txn().
1444          */
1445         cpuc->n_events = n;
1446         cpuc->n_added += n - n0;
1447         cpuc->n_txn += n - n0;
1448
1449         /*
1450          * This is before x86_pmu_enable() calls x86_pmu_start(),
1451          * so we enable LBRs before any event needs them, etc.
1452          */
1453         static_call_cond(x86_pmu_add)(event);
1454
1455         ret = 0;
1456 out:
1457         return ret;
1458 }
1459
1460 static void x86_pmu_start(struct perf_event *event, int flags)
1461 {
1462         struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
1463         int idx = event->hw.idx;
1464
1465         if (WARN_ON_ONCE(!(event->hw.state & PERF_HES_STOPPED)))
1466                 return;
1467
1468         if (WARN_ON_ONCE(idx == -1))
1469                 return;
1470
1471         if (flags & PERF_EF_RELOAD) {
1472                 WARN_ON_ONCE(!(event->hw.state & PERF_HES_UPTODATE));
1473                 x86_perf_event_set_period(event);
1474         }
1475
1476         event->hw.state = 0;
1477
1478         cpuc->events[idx] = event;
1479         __set_bit(idx, cpuc->active_mask);
1480         __set_bit(idx, cpuc->running);
1481         static_call(x86_pmu_enable)(event);
1482         perf_event_update_userpage(event);
1483 }
1484
1485 void perf_event_print_debug(void)
1486 {
1487         u64 ctrl, status, overflow, pmc_ctrl, pmc_count, prev_left, fixed;
1488         u64 pebs, debugctl;
1489         struct cpu_hw_events *cpuc;
1490         unsigned long flags;
1491         int cpu, idx;
1492
1493         if (!x86_pmu.num_counters)
1494                 return;
1495
1496         local_irq_save(flags);
1497
1498         cpu = smp_processor_id();
1499         cpuc = &per_cpu(cpu_hw_events, cpu);
1500
1501         if (x86_pmu.version >= 2) {
1502                 rdmsrl(MSR_CORE_PERF_GLOBAL_CTRL, ctrl);
1503                 rdmsrl(MSR_CORE_PERF_GLOBAL_STATUS, status);
1504                 rdmsrl(MSR_CORE_PERF_GLOBAL_OVF_CTRL, overflow);
1505                 rdmsrl(MSR_ARCH_PERFMON_FIXED_CTR_CTRL, fixed);
1506
1507                 pr_info("\n");
1508                 pr_info("CPU#%d: ctrl:       %016llx\n", cpu, ctrl);
1509                 pr_info("CPU#%d: status:     %016llx\n", cpu, status);
1510                 pr_info("CPU#%d: overflow:   %016llx\n", cpu, overflow);
1511                 pr_info("CPU#%d: fixed:      %016llx\n", cpu, fixed);
1512                 if (x86_pmu.pebs_constraints) {
1513                         rdmsrl(MSR_IA32_PEBS_ENABLE, pebs);
1514                         pr_info("CPU#%d: pebs:       %016llx\n", cpu, pebs);
1515                 }
1516                 if (x86_pmu.lbr_nr) {
1517                         rdmsrl(MSR_IA32_DEBUGCTLMSR, debugctl);
1518                         pr_info("CPU#%d: debugctl:   %016llx\n", cpu, debugctl);
1519                 }
1520         }
1521         pr_info("CPU#%d: active:     %016llx\n", cpu, *(u64 *)cpuc->active_mask);
1522
1523         for (idx = 0; idx < x86_pmu.num_counters; idx++) {
1524                 rdmsrl(x86_pmu_config_addr(idx), pmc_ctrl);
1525                 rdmsrl(x86_pmu_event_addr(idx), pmc_count);
1526
1527                 prev_left = per_cpu(pmc_prev_left[idx], cpu);
1528
1529                 pr_info("CPU#%d:   gen-PMC%d ctrl:  %016llx\n",
1530                         cpu, idx, pmc_ctrl);
1531                 pr_info("CPU#%d:   gen-PMC%d count: %016llx\n",
1532                         cpu, idx, pmc_count);
1533                 pr_info("CPU#%d:   gen-PMC%d left:  %016llx\n",
1534                         cpu, idx, prev_left);
1535         }
1536         for (idx = 0; idx < x86_pmu.num_counters_fixed; idx++) {
1537                 rdmsrl(MSR_ARCH_PERFMON_FIXED_CTR0 + idx, pmc_count);
1538
1539                 pr_info("CPU#%d: fixed-PMC%d count: %016llx\n",
1540                         cpu, idx, pmc_count);
1541         }
1542         local_irq_restore(flags);
1543 }
1544
1545 void x86_pmu_stop(struct perf_event *event, int flags)
1546 {
1547         struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
1548         struct hw_perf_event *hwc = &event->hw;
1549
1550         if (test_bit(hwc->idx, cpuc->active_mask)) {
1551                 static_call(x86_pmu_disable)(event);
1552                 __clear_bit(hwc->idx, cpuc->active_mask);
1553                 cpuc->events[hwc->idx] = NULL;
1554                 WARN_ON_ONCE(hwc->state & PERF_HES_STOPPED);
1555                 hwc->state |= PERF_HES_STOPPED;
1556         }
1557
1558         if ((flags & PERF_EF_UPDATE) && !(hwc->state & PERF_HES_UPTODATE)) {
1559                 /*
1560                  * Drain the remaining delta count out of an event
1561                  * that we are disabling:
1562                  */
1563                 x86_perf_event_update(event);
1564                 hwc->state |= PERF_HES_UPTODATE;
1565         }
1566 }
1567
1568 static void x86_pmu_del(struct perf_event *event, int flags)
1569 {
1570         struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
1571         int i;
1572
1573         /*
1574          * If we're called during a txn, we only need to undo x86_pmu.add.
1575          * The events never got scheduled and ->cancel_txn will truncate
1576          * the event_list.
1577          *
1578          * XXX assumes any ->del() called during a TXN will only be on
1579          * an event added during that same TXN.
1580          */
1581         if (cpuc->txn_flags & PERF_PMU_TXN_ADD)
1582                 goto do_del;
1583
1584         /*
1585          * Not a TXN, therefore cleanup properly.
1586          */
1587         x86_pmu_stop(event, PERF_EF_UPDATE);
1588
1589         for (i = 0; i < cpuc->n_events; i++) {
1590                 if (event == cpuc->event_list[i])
1591                         break;
1592         }
1593
1594         if (WARN_ON_ONCE(i == cpuc->n_events)) /* called ->del() without ->add() ? */
1595                 return;
1596
1597         /* If we have a newly added event, make sure to decrease n_added. */
1598         if (i >= cpuc->n_events - cpuc->n_added)
1599                 --cpuc->n_added;
1600
1601         static_call_cond(x86_pmu_put_event_constraints)(cpuc, event);
1602
1603         /* Delete the array entry. */
1604         while (++i < cpuc->n_events) {
1605                 cpuc->event_list[i-1] = cpuc->event_list[i];
1606                 cpuc->event_constraint[i-1] = cpuc->event_constraint[i];
1607         }
1608         cpuc->event_constraint[i-1] = NULL;
1609         --cpuc->n_events;
1610         if (x86_pmu.intel_cap.perf_metrics)
1611                 del_nr_metric_event(cpuc, event);
1612
1613         perf_event_update_userpage(event);
1614
1615 do_del:
1616
1617         /*
1618          * This is after x86_pmu_stop(); so we disable the LBRs only
1619          * once no event can still need them.
1620          */
1621         static_call_cond(x86_pmu_del)(event);
1622 }
1623
1624 int x86_pmu_handle_irq(struct pt_regs *regs)
1625 {
1626         struct perf_sample_data data;
1627         struct cpu_hw_events *cpuc;
1628         struct perf_event *event;
1629         int idx, handled = 0;
1630         u64 val;
1631
1632         cpuc = this_cpu_ptr(&cpu_hw_events);
1633
1634         /*
1635          * Some chipsets need to unmask the LVTPC in a particular spot
1636          * inside the NMI handler.  As a result, the unmasking was pushed
1637          * into all the NMI handlers.
1638          *
1639          * This generic handler doesn't seem to have any issues with where
1640          * the unmasking occurs, so it was left at the top.
1641          */
1642         apic_write(APIC_LVTPC, APIC_DM_NMI);
1643
1644         for (idx = 0; idx < x86_pmu.num_counters; idx++) {
1645                 if (!test_bit(idx, cpuc->active_mask))
1646                         continue;
1647
1648                 event = cpuc->events[idx];
1649
1650                 val = x86_perf_event_update(event);
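                /*
                 * The counter was programmed with -(sample period), so as
                 * long as the sign bit of the cntval_bits-wide value is
                 * still set it has not wrapped yet and there is no
                 * overflow to handle for this index.
                 */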
1651                 if (val & (1ULL << (x86_pmu.cntval_bits - 1)))
1652                         continue;
1653
1654                 /*
1655                  * event overflow
1656                  */
1657                 handled++;
1658                 perf_sample_data_init(&data, 0, event->hw.last_period);
1659
1660                 if (!x86_perf_event_set_period(event))
1661                         continue;
1662
1663                 if (perf_event_overflow(event, &data, regs))
1664                         x86_pmu_stop(event, 0);
1665         }
1666
1667         if (handled)
1668                 inc_irq_stat(apic_perf_irqs);
1669
1670         return handled;
1671 }
1672
1673 void perf_events_lapic_init(void)
1674 {
1675         if (!x86_pmu.apic || !x86_pmu_initialized())
1676                 return;
1677
1678         /*
1679          * Always use NMI for PMU
1680          */
1681         apic_write(APIC_LVTPC, APIC_DM_NMI);
1682 }
1683
1684 static int
1685 perf_event_nmi_handler(unsigned int cmd, struct pt_regs *regs)
1686 {
1687         u64 start_clock;
1688         u64 finish_clock;
1689         int ret;
1690
1691         /*
1692          * All PMUs/events that share this PMI handler should make sure to
1693          * increment active_events for their events.
1694          */
1695         if (!atomic_read(&active_events))
1696                 return NMI_DONE;
1697
1698         start_clock = sched_clock();
1699         ret = static_call(x86_pmu_handle_irq)(regs);
1700         finish_clock = sched_clock();
1701
1702         perf_sample_event_took(finish_clock - start_clock);
1703
1704         return ret;
1705 }
1706 NOKPROBE_SYMBOL(perf_event_nmi_handler);
1707
1708 struct event_constraint emptyconstraint;
1709 struct event_constraint unconstrained;
1710
1711 static int x86_pmu_prepare_cpu(unsigned int cpu)
1712 {
1713         struct cpu_hw_events *cpuc = &per_cpu(cpu_hw_events, cpu);
1714         int i;
1715
1716         for (i = 0 ; i < X86_PERF_KFREE_MAX; i++)
1717                 cpuc->kfree_on_online[i] = NULL;
1718         if (x86_pmu.cpu_prepare)
1719                 return x86_pmu.cpu_prepare(cpu);
1720         return 0;
1721 }
1722
1723 static int x86_pmu_dead_cpu(unsigned int cpu)
1724 {
1725         if (x86_pmu.cpu_dead)
1726                 x86_pmu.cpu_dead(cpu);
1727         return 0;
1728 }
1729
1730 static int x86_pmu_online_cpu(unsigned int cpu)
1731 {
1732         struct cpu_hw_events *cpuc = &per_cpu(cpu_hw_events, cpu);
1733         int i;
1734
1735         for (i = 0 ; i < X86_PERF_KFREE_MAX; i++) {
1736                 kfree(cpuc->kfree_on_online[i]);
1737                 cpuc->kfree_on_online[i] = NULL;
1738         }
1739         return 0;
1740 }
1741
1742 static int x86_pmu_starting_cpu(unsigned int cpu)
1743 {
1744         if (x86_pmu.cpu_starting)
1745                 x86_pmu.cpu_starting(cpu);
1746         return 0;
1747 }
1748
1749 static int x86_pmu_dying_cpu(unsigned int cpu)
1750 {
1751         if (x86_pmu.cpu_dying)
1752                 x86_pmu.cpu_dying(cpu);
1753         return 0;
1754 }
1755
1756 static void __init pmu_check_apic(void)
1757 {
1758         if (boot_cpu_has(X86_FEATURE_APIC))
1759                 return;
1760
1761         x86_pmu.apic = 0;
1762         pr_info("no APIC, boot with the \"lapic\" boot parameter to force-enable it.\n");
1763         pr_info("no hardware sampling interrupt available.\n");
1764
1765         /*
1766          * If we have a PMU initialized but no APIC
1767          * interrupts, we cannot sample hardware
1768          * events (user-space has to fall back and
1769          * sample via an hrtimer-based software event).
1770          */
1771         pmu.capabilities |= PERF_PMU_CAP_NO_INTERRUPT;
1772
1773 }
1774
1775 static struct attribute_group x86_pmu_format_group __ro_after_init = {
1776         .name = "format",
1777         .attrs = NULL,
1778 };
1779
1780 ssize_t events_sysfs_show(struct device *dev, struct device_attribute *attr, char *page)
1781 {
1782         struct perf_pmu_events_attr *pmu_attr =
1783                 container_of(attr, struct perf_pmu_events_attr, attr);
1784         u64 config = 0;
1785
1786         if (pmu_attr->id < x86_pmu.max_events)
1787                 config = x86_pmu.event_map(pmu_attr->id);
1788
1789         /* string trumps id */
1790         if (pmu_attr->event_str)
1791                 return sprintf(page, "%s", pmu_attr->event_str);
1792
1793         return x86_pmu.events_sysfs_show(page, config);
1794 }
1795 EXPORT_SYMBOL_GPL(events_sysfs_show);
1796
1797 ssize_t events_ht_sysfs_show(struct device *dev, struct device_attribute *attr,
1798                           char *page)
1799 {
1800         struct perf_pmu_events_ht_attr *pmu_attr =
1801                 container_of(attr, struct perf_pmu_events_ht_attr, attr);
1802
1803         /*
1804          * Report conditional events depending on Hyper-Threading.
1805          *
1806          * This is overly conservative as usually the HT special
1807          * handling is not needed if the other CPU thread is idle.
1808          *
1809          * Note this does not (and cannot) handle the case when thread
1810          * siblings are invisible, for example with virtualization
1811          * if they are owned by some other guest.  The user tool
1812          * has to re-read when a thread sibling gets onlined later.
1813          */
1814         return sprintf(page, "%s",
1815                         topology_max_smt_threads() > 1 ?
1816                         pmu_attr->event_str_ht :
1817                         pmu_attr->event_str_noht);
1818 }
1819
1820 EVENT_ATTR(cpu-cycles,                  CPU_CYCLES              );
1821 EVENT_ATTR(instructions,                INSTRUCTIONS            );
1822 EVENT_ATTR(cache-references,            CACHE_REFERENCES        );
1823 EVENT_ATTR(cache-misses,                CACHE_MISSES            );
1824 EVENT_ATTR(branch-instructions,         BRANCH_INSTRUCTIONS     );
1825 EVENT_ATTR(branch-misses,               BRANCH_MISSES           );
1826 EVENT_ATTR(bus-cycles,                  BUS_CYCLES              );
1827 EVENT_ATTR(stalled-cycles-frontend,     STALLED_CYCLES_FRONTEND );
1828 EVENT_ATTR(stalled-cycles-backend,      STALLED_CYCLES_BACKEND  );
1829 EVENT_ATTR(ref-cycles,                  REF_CPU_CYCLES          );
1830
1831 static struct attribute *empty_attrs;
1832
1833 static struct attribute *events_attr[] = {
1834         EVENT_PTR(CPU_CYCLES),
1835         EVENT_PTR(INSTRUCTIONS),
1836         EVENT_PTR(CACHE_REFERENCES),
1837         EVENT_PTR(CACHE_MISSES),
1838         EVENT_PTR(BRANCH_INSTRUCTIONS),
1839         EVENT_PTR(BRANCH_MISSES),
1840         EVENT_PTR(BUS_CYCLES),
1841         EVENT_PTR(STALLED_CYCLES_FRONTEND),
1842         EVENT_PTR(STALLED_CYCLES_BACKEND),
1843         EVENT_PTR(REF_CPU_CYCLES),
1844         NULL,
1845 };
1846
1847 /*
1848  * Remove all undefined events (x86_pmu.event_map(id) == 0)
1849  * from the events_attr attributes.
1850  */
1851 static umode_t
1852 is_visible(struct kobject *kobj, struct attribute *attr, int idx)
1853 {
1854         struct perf_pmu_events_attr *pmu_attr;
1855
1856         if (idx >= x86_pmu.max_events)
1857                 return 0;
1858
1859         pmu_attr = container_of(attr, struct perf_pmu_events_attr, attr.attr);
1860         /* str trumps id */
1861         return pmu_attr->event_str || x86_pmu.event_map(idx) ? attr->mode : 0;
1862 }
1863
1864 static struct attribute_group x86_pmu_events_group __ro_after_init = {
1865         .name = "events",
1866         .attrs = events_attr,
1867         .is_visible = is_visible,
1868 };
1869
1870 ssize_t x86_event_sysfs_show(char *page, u64 config, u64 event)
1871 {
1872         u64 umask  = (config & ARCH_PERFMON_EVENTSEL_UMASK) >> 8;
1873         u64 cmask  = (config & ARCH_PERFMON_EVENTSEL_CMASK) >> 24;
1874         bool edge  = (config & ARCH_PERFMON_EVENTSEL_EDGE);
1875         bool pc    = (config & ARCH_PERFMON_EVENTSEL_PIN_CONTROL);
1876         bool any   = (config & ARCH_PERFMON_EVENTSEL_ANY);
1877         bool inv   = (config & ARCH_PERFMON_EVENTSEL_INV);
1878         ssize_t ret;
1879
1880         /*
1881          * We have a whole page to spend and only a little data
1882          * to write, so we can safely use sprintf.
1883          */
1884         ret = sprintf(page, "event=0x%02llx", event);
1885
1886         if (umask)
1887                 ret += sprintf(page + ret, ",umask=0x%02llx", umask);
1888
1889         if (edge)
1890                 ret += sprintf(page + ret, ",edge");
1891
1892         if (pc)
1893                 ret += sprintf(page + ret, ",pc");
1894
1895         if (any)
1896                 ret += sprintf(page + ret, ",any");
1897
1898         if (inv)
1899                 ret += sprintf(page + ret, ",inv");
1900
1901         if (cmask)
1902                 ret += sprintf(page + ret, ",cmask=0x%02llx", cmask);
1903
1904         ret += sprintf(page + ret, "\n");
1905
1906         return ret;
1907 }
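
/*
 * For illustration only: for the architectural cpu-cycles event (event
 * code 0x3c, umask 0) the function above would emit
 *
 *	event=0x3c
 *
 * appending ",umask=", ",edge", ",cmask=" etc. only for the bits actually
 * set in the config.  This is the string visible under e.g.
 * /sys/bus/event_source/devices/cpu/events/cpu-cycles.
 */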
1908
1909 static struct attribute_group x86_pmu_attr_group;
1910 static struct attribute_group x86_pmu_caps_group;
1911
1912 static void x86_pmu_static_call_update(void)
1913 {
1914         static_call_update(x86_pmu_handle_irq, x86_pmu.handle_irq);
1915         static_call_update(x86_pmu_disable_all, x86_pmu.disable_all);
1916         static_call_update(x86_pmu_enable_all, x86_pmu.enable_all);
1917         static_call_update(x86_pmu_enable, x86_pmu.enable);
1918         static_call_update(x86_pmu_disable, x86_pmu.disable);
1919
1920         static_call_update(x86_pmu_add, x86_pmu.add);
1921         static_call_update(x86_pmu_del, x86_pmu.del);
1922         static_call_update(x86_pmu_read, x86_pmu.read);
1923
1924         static_call_update(x86_pmu_schedule_events, x86_pmu.schedule_events);
1925         static_call_update(x86_pmu_get_event_constraints, x86_pmu.get_event_constraints);
1926         static_call_update(x86_pmu_put_event_constraints, x86_pmu.put_event_constraints);
1927
1928         static_call_update(x86_pmu_start_scheduling, x86_pmu.start_scheduling);
1929         static_call_update(x86_pmu_commit_scheduling, x86_pmu.commit_scheduling);
1930         static_call_update(x86_pmu_stop_scheduling, x86_pmu.stop_scheduling);
1931
1932         static_call_update(x86_pmu_sched_task, x86_pmu.sched_task);
1933         static_call_update(x86_pmu_swap_task_ctx, x86_pmu.swap_task_ctx);
1934
1935         static_call_update(x86_pmu_drain_pebs, x86_pmu.drain_pebs);
1936         static_call_update(x86_pmu_pebs_aliases, x86_pmu.pebs_aliases);
1937 }
1938
1939 static void _x86_pmu_read(struct perf_event *event)
1940 {
1941         x86_perf_event_update(event);
1942 }
1943
1944 static int __init init_hw_perf_events(void)
1945 {
1946         struct x86_pmu_quirk *quirk;
1947         int err;
1948
1949         pr_info("Performance Events: ");
1950
1951         switch (boot_cpu_data.x86_vendor) {
1952         case X86_VENDOR_INTEL:
1953                 err = intel_pmu_init();
1954                 break;
1955         case X86_VENDOR_AMD:
1956                 err = amd_pmu_init();
1957                 break;
1958         case X86_VENDOR_HYGON:
1959                 err = amd_pmu_init();
1960                 x86_pmu.name = "HYGON";
1961                 break;
1962         case X86_VENDOR_ZHAOXIN:
1963         case X86_VENDOR_CENTAUR:
1964                 err = zhaoxin_pmu_init();
1965                 break;
1966         default:
1967                 err = -ENOTSUPP;
1968         }
1969         if (err != 0) {
1970                 pr_cont("no PMU driver, software events only.\n");
1971                 return 0;
1972         }
1973
1974         pmu_check_apic();
1975
1976         /* sanity check that the hardware exists or is emulated */
1977         if (!check_hw_exists())
1978                 return 0;
1979
1980         pr_cont("%s PMU driver.\n", x86_pmu.name);
1981
1982         x86_pmu.attr_rdpmc = 1; /* enable userspace RDPMC usage by default */
1983
1984         for (quirk = x86_pmu.quirks; quirk; quirk = quirk->next)
1985                 quirk->func();
1986
1987         if (!x86_pmu.intel_ctrl)
1988                 x86_pmu.intel_ctrl = (1 << x86_pmu.num_counters) - 1;
1989
1990         perf_events_lapic_init();
1991         register_nmi_handler(NMI_LOCAL, perf_event_nmi_handler, 0, "PMI");
1992
1993         unconstrained = (struct event_constraint)
1994                 __EVENT_CONSTRAINT(0, (1ULL << x86_pmu.num_counters) - 1,
1995                                    0, x86_pmu.num_counters, 0, 0);
1996
1997         x86_pmu_format_group.attrs = x86_pmu.format_attrs;
1998
1999         if (!x86_pmu.events_sysfs_show)
2000                 x86_pmu_events_group.attrs = &empty_attrs;
2001
2002         pmu.attr_update = x86_pmu.attr_update;
2003
2004         pr_info("... version:                %d\n",     x86_pmu.version);
2005         pr_info("... bit width:              %d\n",     x86_pmu.cntval_bits);
2006         pr_info("... generic registers:      %d\n",     x86_pmu.num_counters);
2007         pr_info("... value mask:             %016Lx\n", x86_pmu.cntval_mask);
2008         pr_info("... max period:             %016Lx\n", x86_pmu.max_period);
2009         pr_info("... fixed-purpose events:   %d\n",     x86_pmu.num_counters_fixed);
2010         pr_info("... event mask:             %016Lx\n", x86_pmu.intel_ctrl);
2011
2012         if (!x86_pmu.read)
2013                 x86_pmu.read = _x86_pmu_read;
2014
2015         x86_pmu_static_call_update();
2016
2017         /*
2018          * Install callbacks. Core will call them for each online
2019          * cpu.
2020          */
2021         err = cpuhp_setup_state(CPUHP_PERF_X86_PREPARE, "perf/x86:prepare",
2022                                 x86_pmu_prepare_cpu, x86_pmu_dead_cpu);
2023         if (err)
2024                 return err;
2025
2026         err = cpuhp_setup_state(CPUHP_AP_PERF_X86_STARTING,
2027                                 "perf/x86:starting", x86_pmu_starting_cpu,
2028                                 x86_pmu_dying_cpu);
2029         if (err)
2030                 goto out;
2031
2032         err = cpuhp_setup_state(CPUHP_AP_PERF_X86_ONLINE, "perf/x86:online",
2033                                 x86_pmu_online_cpu, NULL);
2034         if (err)
2035                 goto out1;
2036
2037         err = perf_pmu_register(&pmu, "cpu", PERF_TYPE_RAW);
2038         if (err)
2039                 goto out2;
2040
2041         return 0;
2042
2043 out2:
2044         cpuhp_remove_state(CPUHP_AP_PERF_X86_ONLINE);
2045 out1:
2046         cpuhp_remove_state(CPUHP_AP_PERF_X86_STARTING);
2047 out:
2048         cpuhp_remove_state(CPUHP_PERF_X86_PREPARE);
2049         return err;
2050 }
2051 early_initcall(init_hw_perf_events);
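
/*
 * For reference, the pr_info()/pr_cont() calls above produce a boot log of
 * roughly this shape (the values below are illustrative placeholders, not
 * taken from any particular CPU):
 *
 *	Performance Events: <driver probe details>, Intel PMU driver.
 *	... version:                4
 *	... bit width:              48
 *	... generic registers:      4
 *	... value mask:             0000ffffffffffff
 *	... max period:             00007fffffffffff
 *	... fixed-purpose events:   3
 *	... event mask:             000000070000000f
 */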
2052
2053 static void x86_pmu_read(struct perf_event *event)
2054 {
2055         static_call(x86_pmu_read)(event);
2056 }
2057
2058 /*
2059  * Start group events scheduling transaction
2060  * Set the flag to make pmu::enable() not perform the
2061  * schedulability test; it will be performed at commit time.
2062  *
2063  * We only support PERF_PMU_TXN_ADD transactions. Save the
2064  * transaction flags but otherwise ignore non-PERF_PMU_TXN_ADD
2065  * transactions.
2066  */
2067 static void x86_pmu_start_txn(struct pmu *pmu, unsigned int txn_flags)
2068 {
2069         struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
2070
2071         WARN_ON_ONCE(cpuc->txn_flags);          /* txn already in flight */
2072
2073         cpuc->txn_flags = txn_flags;
2074         if (txn_flags & ~PERF_PMU_TXN_ADD)
2075                 return;
2076
2077         perf_pmu_disable(pmu);
2078         __this_cpu_write(cpu_hw_events.n_txn, 0);
2079         __this_cpu_write(cpu_hw_events.n_txn_pair, 0);
2080         __this_cpu_write(cpu_hw_events.n_txn_metric, 0);
2081 }
2082
2083 /*
2084  * Stop group events scheduling transaction
2085  * Clear the flag and pmu::enable() will perform the
2086  * schedulability test.
2087  */
2088 static void x86_pmu_cancel_txn(struct pmu *pmu)
2089 {
2090         unsigned int txn_flags;
2091         struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
2092
2093         WARN_ON_ONCE(!cpuc->txn_flags); /* no txn in flight */
2094
2095         txn_flags = cpuc->txn_flags;
2096         cpuc->txn_flags = 0;
2097         if (txn_flags & ~PERF_PMU_TXN_ADD)
2098                 return;
2099
2100         /*
2101          * Truncate the collected array by the number of events added in this
2102          * transaction. See x86_pmu_add() and x86_pmu_*_txn().
2103          */
2104         __this_cpu_sub(cpu_hw_events.n_added, __this_cpu_read(cpu_hw_events.n_txn));
2105         __this_cpu_sub(cpu_hw_events.n_events, __this_cpu_read(cpu_hw_events.n_txn));
2106         __this_cpu_sub(cpu_hw_events.n_pair, __this_cpu_read(cpu_hw_events.n_txn_pair));
2107         __this_cpu_sub(cpu_hw_events.n_metric, __this_cpu_read(cpu_hw_events.n_txn_metric));
2108         perf_pmu_enable(pmu);
2109 }
2110
2111 /*
2112  * Commit group events scheduling transaction
2113  * Perform the group schedulability test as a whole
2114  * Return 0 on success.
2115  *
2116  * Does not cancel the transaction on failure; expects the caller to do this.
2117  */
2118 static int x86_pmu_commit_txn(struct pmu *pmu)
2119 {
2120         struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
2121         int assign[X86_PMC_IDX_MAX];
2122         int n, ret;
2123
2124         WARN_ON_ONCE(!cpuc->txn_flags); /* no txn in flight */
2125
2126         if (cpuc->txn_flags & ~PERF_PMU_TXN_ADD) {
2127                 cpuc->txn_flags = 0;
2128                 return 0;
2129         }
2130
2131         n = cpuc->n_events;
2132
2133         if (!x86_pmu_initialized())
2134                 return -EAGAIN;
2135
2136         ret = static_call(x86_pmu_schedule_events)(cpuc, n, assign);
2137         if (ret)
2138                 return ret;
2139
2140         /*
2141          * Copy the new assignment now that we know it is possible;
2142          * it will be used by hw_perf_enable().
2143          */
2144         memcpy(cpuc->assign, assign, n*sizeof(int));
2145
2146         cpuc->txn_flags = 0;
2147         perf_pmu_enable(pmu);
2148         return 0;
2149 }
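
/*
 * The three callbacks above implement perf's group-scheduling transaction
 * protocol.  A rough sketch of how the core uses them when scheduling an
 * event group (simplified; not literal code from kernel/events/core.c):
 *
 *	pmu->start_txn(pmu, PERF_PMU_TXN_ADD);
 *	// ->add() the leader and each sibling; no schedulability test yet
 *	if (pmu->commit_txn(pmu))	// whole-group test happens here
 *		pmu->cancel_txn(pmu);	// caller rolls the transaction back
 */
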
2150 /*
2151  * A fake_cpuc is used to validate event groups. Due to
2152  * the extra reg logic, we also need to allocate a fake
2153  * per_core and per_cpu structure. Otherwise, group events
2154  * using extra reg may conflict without the kernel being
2155  * able to catch this when the last event gets added to
2156  * the group.
2157  */
2158 static void free_fake_cpuc(struct cpu_hw_events *cpuc)
2159 {
2160         intel_cpuc_finish(cpuc);
2161         kfree(cpuc);
2162 }
2163
2164 static struct cpu_hw_events *allocate_fake_cpuc(void)
2165 {
2166         struct cpu_hw_events *cpuc;
2167         int cpu = raw_smp_processor_id();
2168
2169         cpuc = kzalloc(sizeof(*cpuc), GFP_KERNEL);
2170         if (!cpuc)
2171                 return ERR_PTR(-ENOMEM);
2172         cpuc->is_fake = 1;
2173
2174         if (intel_cpuc_prepare(cpuc, cpu))
2175                 goto error;
2176
2177         return cpuc;
2178 error:
2179         free_fake_cpuc(cpuc);
2180         return ERR_PTR(-ENOMEM);
2181 }
2182
2183 /*
2184  * validate that we can schedule this event
2185  */
2186 static int validate_event(struct perf_event *event)
2187 {
2188         struct cpu_hw_events *fake_cpuc;
2189         struct event_constraint *c;
2190         int ret = 0;
2191
2192         fake_cpuc = allocate_fake_cpuc();
2193         if (IS_ERR(fake_cpuc))
2194                 return PTR_ERR(fake_cpuc);
2195
2196         c = x86_pmu.get_event_constraints(fake_cpuc, 0, event);
2197
2198         if (!c || !c->weight)
2199                 ret = -EINVAL;
2200
2201         if (x86_pmu.put_event_constraints)
2202                 x86_pmu.put_event_constraints(fake_cpuc, event);
2203
2204         free_fake_cpuc(fake_cpuc);
2205
2206         return ret;
2207 }
2208
2209 /*
2210  * validate a single event group
2211  *
2212  * validation includes:
2213  *      - check that events are compatible with each other
2214  *      - events do not compete for the same counter
2215  *      - number of events <= number of counters
2216  *
2217  * validation ensures the group can be loaded onto the
2218  * PMU if it was the only group available.
2219  */
2220 static int validate_group(struct perf_event *event)
2221 {
2222         struct perf_event *leader = event->group_leader;
2223         struct cpu_hw_events *fake_cpuc;
2224         int ret = -EINVAL, n;
2225
2226         fake_cpuc = allocate_fake_cpuc();
2227         if (IS_ERR(fake_cpuc))
2228                 return PTR_ERR(fake_cpuc);
2229         /*
2230          * The event is not yet connected with its
2231          * siblings, therefore we must first collect
2232          * the existing siblings, then add the new event
2233          * before we can simulate the scheduling.
2234          */
2235         n = collect_events(fake_cpuc, leader, true);
2236         if (n < 0)
2237                 goto out;
2238
2239         fake_cpuc->n_events = n;
2240         n = collect_events(fake_cpuc, event, false);
2241         if (n < 0)
2242                 goto out;
2243
2244         fake_cpuc->n_events = 0;
2245         ret = x86_pmu.schedule_events(fake_cpuc, n, NULL);
2246
2247 out:
2248         free_fake_cpuc(fake_cpuc);
2249         return ret;
2250 }
2251
2252 static int x86_pmu_event_init(struct perf_event *event)
2253 {
2254         struct pmu *tmp;
2255         int err;
2256
2257         switch (event->attr.type) {
2258         case PERF_TYPE_RAW:
2259         case PERF_TYPE_HARDWARE:
2260         case PERF_TYPE_HW_CACHE:
2261                 break;
2262
2263         default:
2264                 return -ENOENT;
2265         }
2266
2267         err = __x86_pmu_event_init(event);
2268         if (!err) {
2269                 /*
2270                  * We temporarily connect the event to its pmu
2271                  * such that validate_group() can classify
2272                  * it as an x86 event using is_x86_event()
2273                  */
2274                 tmp = event->pmu;
2275                 event->pmu = &pmu;
2276
2277                 if (event->group_leader != event)
2278                         err = validate_group(event);
2279                 else
2280                         err = validate_event(event);
2281
2282                 event->pmu = tmp;
2283         }
2284         if (err) {
2285                 if (event->destroy)
2286                         event->destroy(event);
2287                 event->destroy = NULL;
2288         }
2289
2290         if (READ_ONCE(x86_pmu.attr_rdpmc) &&
2291             !(event->hw.flags & PERF_X86_EVENT_LARGE_PEBS))
2292                 event->hw.flags |= PERF_X86_EVENT_RDPMC_ALLOWED;
2293
2294         return err;
2295 }
2296
2297 static void x86_pmu_event_mapped(struct perf_event *event, struct mm_struct *mm)
2298 {
2299         if (!(event->hw.flags & PERF_X86_EVENT_RDPMC_ALLOWED))
2300                 return;
2301
2302         /*
2303          * This function relies on not being called concurrently in two
2304          * tasks in the same mm.  Otherwise one task could observe
2305          * perf_rdpmc_allowed > 1 and return all the way back to
2306          * userspace with CR4.PCE clear while another task is still
2307          * doing on_each_cpu_mask() to propagate CR4.PCE.
2308          *
2309          * For now, this can't happen because all callers hold mmap_lock
2310          * for write.  If this changes, we'll need a different solution.
2311          */
2312         mmap_assert_write_locked(mm);
2313
2314         if (atomic_inc_return(&mm->context.perf_rdpmc_allowed) == 1)
2315                 on_each_cpu_mask(mm_cpumask(mm), cr4_update_pce, NULL, 1);
2316 }
2317
2318 static void x86_pmu_event_unmapped(struct perf_event *event, struct mm_struct *mm)
2319 {
2320
2321         if (!(event->hw.flags & PERF_X86_EVENT_RDPMC_ALLOWED))
2322                 return;
2323
2324         if (atomic_dec_and_test(&mm->context.perf_rdpmc_allowed))
2325                 on_each_cpu_mask(mm_cpumask(mm), cr4_update_pce, NULL, 1);
2326 }
2327
2328 static int x86_pmu_event_idx(struct perf_event *event)
2329 {
2330         struct hw_perf_event *hwc = &event->hw;
2331
2332         if (!(hwc->flags & PERF_X86_EVENT_RDPMC_ALLOWED))
2333                 return 0;
2334
2335         if (is_metric_idx(hwc->idx))
2336                 return INTEL_PMC_FIXED_RDPMC_METRICS + 1;
2337         else
2338                 return hwc->event_base_rdpmc + 1;
2339 }
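
/*
 * The returned index is 1-based because 0 in perf_event_mmap_page::index
 * means "RDPMC not available".  A userspace reader (sketch, assuming "pc"
 * points at the mmap()ed perf_event_mmap_page and cap_user_rdpmc is set;
 * rdpmc() stands for the RDPMC instruction, e.g. __builtin_ia32_rdpmc())
 * would do roughly:
 *
 *	u32 idx = pc->index;
 *	u64 count = pc->offset;
 *	if (idx)
 *		count += rdpmc(idx - 1);
 *
 * inside the usual pc->lock seqcount retry loop documented in
 * include/uapi/linux/perf_event.h.
 */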
2340
2341 static ssize_t get_attr_rdpmc(struct device *cdev,
2342                               struct device_attribute *attr,
2343                               char *buf)
2344 {
2345         return snprintf(buf, 40, "%d\n", x86_pmu.attr_rdpmc);
2346 }
2347
2348 static ssize_t set_attr_rdpmc(struct device *cdev,
2349                               struct device_attribute *attr,
2350                               const char *buf, size_t count)
2351 {
2352         unsigned long val;
2353         ssize_t ret;
2354
2355         ret = kstrtoul(buf, 0, &val);
2356         if (ret)
2357                 return ret;
2358
2359         if (val > 2)
2360                 return -EINVAL;
2361
2362         if (x86_pmu.attr_rdpmc_broken)
2363                 return -ENOTSUPP;
2364
2365         if (val != x86_pmu.attr_rdpmc) {
2366                 /*
2367                  * Changing into or out of never available or always available,
2368                  * aka perf-event-bypassing mode. This path is extremely slow,
2369                  * but only root can trigger it, so it's okay.
2370                  */
2371                 if (val == 0)
2372                         static_branch_inc(&rdpmc_never_available_key);
2373                 else if (x86_pmu.attr_rdpmc == 0)
2374                         static_branch_dec(&rdpmc_never_available_key);
2375
2376                 if (val == 2)
2377                         static_branch_inc(&rdpmc_always_available_key);
2378                 else if (x86_pmu.attr_rdpmc == 2)
2379                         static_branch_dec(&rdpmc_always_available_key);
2380
2381                 on_each_cpu(cr4_update_pce, NULL, 1);
2382                 x86_pmu.attr_rdpmc = val;
2383         }
2384
2385         return count;
2386 }
2387
2388 static DEVICE_ATTR(rdpmc, S_IRUSR | S_IWUSR, get_attr_rdpmc, set_attr_rdpmc);
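
/*
 * The attribute above is the sysfs knob typically found at
 * /sys/bus/event_source/devices/cpu/rdpmc (root-writable), e.g.:
 *
 *	echo 0 > rdpmc		# never allow userspace RDPMC
 *	echo 1 > rdpmc		# allow it while an rdpmc-capable event is mapped (default)
 *	echo 2 > rdpmc		# always allow it, even with no perf event mapped
 */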
2389
2390 static struct attribute *x86_pmu_attrs[] = {
2391         &dev_attr_rdpmc.attr,
2392         NULL,
2393 };
2394
2395 static struct attribute_group x86_pmu_attr_group __ro_after_init = {
2396         .attrs = x86_pmu_attrs,
2397 };
2398
2399 static ssize_t max_precise_show(struct device *cdev,
2400                                   struct device_attribute *attr,
2401                                   char *buf)
2402 {
2403         return snprintf(buf, PAGE_SIZE, "%d\n", x86_pmu_max_precise());
2404 }
2405
2406 static DEVICE_ATTR_RO(max_precise);
2407
2408 static struct attribute *x86_pmu_caps_attrs[] = {
2409         &dev_attr_max_precise.attr,
2410         NULL
2411 };
2412
2413 static struct attribute_group x86_pmu_caps_group __ro_after_init = {
2414         .name = "caps",
2415         .attrs = x86_pmu_caps_attrs,
2416 };
2417
2418 static const struct attribute_group *x86_pmu_attr_groups[] = {
2419         &x86_pmu_attr_group,
2420         &x86_pmu_format_group,
2421         &x86_pmu_events_group,
2422         &x86_pmu_caps_group,
2423         NULL,
2424 };
2425
2426 static void x86_pmu_sched_task(struct perf_event_context *ctx, bool sched_in)
2427 {
2428         static_call_cond(x86_pmu_sched_task)(ctx, sched_in);
2429 }
2430
2431 static void x86_pmu_swap_task_ctx(struct perf_event_context *prev,
2432                                   struct perf_event_context *next)
2433 {
2434         static_call_cond(x86_pmu_swap_task_ctx)(prev, next);
2435 }
2436
2437 void perf_check_microcode(void)
2438 {
2439         if (x86_pmu.check_microcode)
2440                 x86_pmu.check_microcode();
2441 }
2442
2443 static int x86_pmu_check_period(struct perf_event *event, u64 value)
2444 {
2445         if (x86_pmu.check_period && x86_pmu.check_period(event, value))
2446                 return -EINVAL;
2447
2448         if (value && x86_pmu.limit_period) {
2449                 if (x86_pmu.limit_period(event, value) > value)
2450                         return -EINVAL;
2451         }
2452
2453         return 0;
2454 }
2455
2456 static int x86_pmu_aux_output_match(struct perf_event *event)
2457 {
2458         if (!(pmu.capabilities & PERF_PMU_CAP_AUX_OUTPUT))
2459                 return 0;
2460
2461         if (x86_pmu.aux_output_match)
2462                 return x86_pmu.aux_output_match(event);
2463
2464         return 0;
2465 }
2466
2467 static struct pmu pmu = {
2468         .pmu_enable             = x86_pmu_enable,
2469         .pmu_disable            = x86_pmu_disable,
2470
2471         .attr_groups            = x86_pmu_attr_groups,
2472
2473         .event_init             = x86_pmu_event_init,
2474
2475         .event_mapped           = x86_pmu_event_mapped,
2476         .event_unmapped         = x86_pmu_event_unmapped,
2477
2478         .add                    = x86_pmu_add,
2479         .del                    = x86_pmu_del,
2480         .start                  = x86_pmu_start,
2481         .stop                   = x86_pmu_stop,
2482         .read                   = x86_pmu_read,
2483
2484         .start_txn              = x86_pmu_start_txn,
2485         .cancel_txn             = x86_pmu_cancel_txn,
2486         .commit_txn             = x86_pmu_commit_txn,
2487
2488         .event_idx              = x86_pmu_event_idx,
2489         .sched_task             = x86_pmu_sched_task,
2490         .swap_task_ctx          = x86_pmu_swap_task_ctx,
2491         .check_period           = x86_pmu_check_period,
2492
2493         .aux_output_match       = x86_pmu_aux_output_match,
2494 };
2495
2496 void arch_perf_update_userpage(struct perf_event *event,
2497                                struct perf_event_mmap_page *userpg, u64 now)
2498 {
2499         struct cyc2ns_data data;
2500         u64 offset;
2501
2502         userpg->cap_user_time = 0;
2503         userpg->cap_user_time_zero = 0;
2504         userpg->cap_user_rdpmc =
2505                 !!(event->hw.flags & PERF_X86_EVENT_RDPMC_ALLOWED);
2506         userpg->pmc_width = x86_pmu.cntval_bits;
2507
2508         if (!using_native_sched_clock() || !sched_clock_stable())
2509                 return;
2510
2511         cyc2ns_read_begin(&data);
2512
2513         offset = data.cyc2ns_offset + __sched_clock_offset;
2514
2515         /*
2516          * Internal timekeeping for enabled/running/stopped times
2517          * is always in the local_clock domain.
2518          */
2519         userpg->cap_user_time = 1;
2520         userpg->time_mult = data.cyc2ns_mul;
2521         userpg->time_shift = data.cyc2ns_shift;
2522         userpg->time_offset = offset - now;
2523
2524         /*
2525          * cap_user_time_zero doesn't make sense when we're using a different
2526          * time base for the records.
2527          */
2528         if (!event->attr.use_clockid) {
2529                 userpg->cap_user_time_zero = 1;
2530                 userpg->time_zero = offset;
2531         }
2532
2533         cyc2ns_read_end();
2534 }
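
/*
 * With the capability bits and time_* fields filled in above, userspace can
 * convert a raw TSC reading into the perf time domain.  In outline, per the
 * perf_event_mmap_page documentation in include/uapi/linux/perf_event.h:
 *
 *	quot  = cyc >> time_shift;
 *	rem   = cyc & (((u64)1 << time_shift) - 1);
 *	delta = time_offset + quot * time_mult +
 *		((rem * time_mult) >> time_shift);
 *
 * and, when cap_user_time_zero is set, time_zero anchors that conversion to
 * the perf clock value corresponding to TSC == 0.
 */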
2535
2536 /*
2537  * Determine whether the regs were taken from an irq/exception handler rather
2538  * than from perf_arch_fetch_caller_regs().
2539  */
2540 static bool perf_hw_regs(struct pt_regs *regs)
2541 {
2542         return regs->flags & X86_EFLAGS_FIXED;
2543 }
2544
2545 void
2546 perf_callchain_kernel(struct perf_callchain_entry_ctx *entry, struct pt_regs *regs)
2547 {
2548         struct perf_guest_info_callbacks *guest_cbs = perf_get_guest_cbs();
2549         struct unwind_state state;
2550         unsigned long addr;
2551
2552         if (guest_cbs && guest_cbs->is_in_guest()) {
2553                 /* TODO: We don't support guest OS callchains yet */
2554                 return;
2555         }
2556
2557         if (perf_callchain_store(entry, regs->ip))
2558                 return;
2559
2560         if (perf_hw_regs(regs))
2561                 unwind_start(&state, current, regs, NULL);
2562         else
2563                 unwind_start(&state, current, NULL, (void *)regs->sp);
2564
2565         for (; !unwind_done(&state); unwind_next_frame(&state)) {
2566                 addr = unwind_get_return_address(&state);
2567                 if (!addr || perf_callchain_store(entry, addr))
2568                         return;
2569         }
2570 }
2571
2572 static inline int
2573 valid_user_frame(const void __user *fp, unsigned long size)
2574 {
2575         return (__range_not_ok(fp, size, TASK_SIZE) == 0);
2576 }
2577
2578 static unsigned long get_segment_base(unsigned int segment)
2579 {
2580         struct desc_struct *desc;
2581         unsigned int idx = segment >> 3;
2582
2583         if ((segment & SEGMENT_TI_MASK) == SEGMENT_LDT) {
2584 #ifdef CONFIG_MODIFY_LDT_SYSCALL
2585                 struct ldt_struct *ldt;
2586
2587                 /* IRQs are off, so this synchronizes with smp_store_release */
2588                 ldt = READ_ONCE(current->active_mm->context.ldt);
2589                 if (!ldt || idx >= ldt->nr_entries)
2590                         return 0;
2591
2592                 desc = &ldt->entries[idx];
2593 #else
2594                 return 0;
2595 #endif
2596         } else {
2597                 if (idx >= GDT_ENTRIES)
2598                         return 0;
2599
2600                 desc = raw_cpu_ptr(gdt_page.gdt) + idx;
2601         }
2602
2603         return get_desc_base(desc);
2604 }
2605
2606 #ifdef CONFIG_IA32_EMULATION
2607
2608 #include <linux/compat.h>
2609
2610 static inline int
2611 perf_callchain_user32(struct pt_regs *regs, struct perf_callchain_entry_ctx *entry)
2612 {
2613         /* 32-bit process in 64-bit kernel. */
2614         unsigned long ss_base, cs_base;
2615         struct stack_frame_ia32 frame;
2616         const struct stack_frame_ia32 __user *fp;
2617
2618         if (!test_thread_flag(TIF_IA32))
2619                 return 0;
2620
2621         cs_base = get_segment_base(regs->cs);
2622         ss_base = get_segment_base(regs->ss);
2623
2624         fp = compat_ptr(ss_base + regs->bp);
2625         pagefault_disable();
2626         while (entry->nr < entry->max_stack) {
2627                 if (!valid_user_frame(fp, sizeof(frame)))
2628                         break;
2629
2630                 if (__get_user(frame.next_frame, &fp->next_frame))
2631                         break;
2632                 if (__get_user(frame.return_address, &fp->return_address))
2633                         break;
2634
2635                 perf_callchain_store(entry, cs_base + frame.return_address);
2636                 fp = compat_ptr(ss_base + frame.next_frame);
2637         }
2638         pagefault_enable();
2639         return 1;
2640 }
2641 #else
2642 static inline int
2643 perf_callchain_user32(struct pt_regs *regs, struct perf_callchain_entry_ctx *entry)
2644 {
2645     return 0;
2646 }
2647 #endif
2648
2649 void
2650 perf_callchain_user(struct perf_callchain_entry_ctx *entry, struct pt_regs *regs)
2651 {
2652         struct perf_guest_info_callbacks *guest_cbs = perf_get_guest_cbs();
2653         struct stack_frame frame;
2654         const struct stack_frame __user *fp;
2655
2656         if (guest_cbs && guest_cbs->is_in_guest()) {
2657                 /* TODO: We don't support guest OS callchains yet */
2658                 return;
2659         }
2660
2661         /*
2662          * We don't know what to do with VM86 stacks; ignore them for now.
2663          */
2664         if (regs->flags & (X86_VM_MASK | PERF_EFLAGS_VM))
2665                 return;
2666
2667         fp = (void __user *)regs->bp;
2668
2669         perf_callchain_store(entry, regs->ip);
2670
2671         if (!nmi_uaccess_okay())
2672                 return;
2673
2674         if (perf_callchain_user32(regs, entry))
2675                 return;
2676
2677         pagefault_disable();
2678         while (entry->nr < entry->max_stack) {
2679                 if (!valid_user_frame(fp, sizeof(frame)))
2680                         break;
2681
2682                 if (__get_user(frame.next_frame, &fp->next_frame))
2683                         break;
2684                 if (__get_user(frame.return_address, &fp->return_address))
2685                         break;
2686
2687                 perf_callchain_store(entry, frame.return_address);
2688                 fp = (void __user *)frame.next_frame;
2689         }
2690         pagefault_enable();
2691 }
2692
2693 /*
2694  * Deal with code segment offsets for the various execution modes:
2695  *
2696  *   VM86 - the good olde 16 bit days, where the linear address is
2697  *          20 bits and we use regs->ip + 0x10 * regs->cs.
2698  *
2699  *   IA32 - Where we need to look at GDT/LDT segment descriptor tables
2700  *          to figure out what the 32bit base address is.
2701  *
2702  *    X32 - has TIF_X32 set, but is running in x86_64
2703  *
2704  * X86_64 - CS,DS,SS,ES are all zero based.
2705  */
2706 static unsigned long code_segment_base(struct pt_regs *regs)
2707 {
2708         /*
2709          * For IA32 we look at the GDT/LDT segment base to convert the
2710          * effective IP to a linear address.
2711          */
2712
2713 #ifdef CONFIG_X86_32
2714         /*
2715          * If we are in VM86 mode, add the segment offset to convert to a
2716          * linear address.
2717          */
2718         if (regs->flags & X86_VM_MASK)
2719                 return 0x10 * regs->cs;
2720
2721         if (user_mode(regs) && regs->cs != __USER_CS)
2722                 return get_segment_base(regs->cs);
2723 #else
2724         if (user_mode(regs) && !user_64bit_mode(regs) &&
2725             regs->cs != __USER32_CS)
2726                 return get_segment_base(regs->cs);
2727 #endif
2728         return 0;
2729 }
2730
2731 unsigned long perf_instruction_pointer(struct pt_regs *regs)
2732 {
2733         struct perf_guest_info_callbacks *guest_cbs = perf_get_guest_cbs();
2734
2735         if (guest_cbs && guest_cbs->is_in_guest())
2736                 return guest_cbs->get_guest_ip();
2737
2738         return regs->ip + code_segment_base(regs);
2739 }
2740
2741 unsigned long perf_misc_flags(struct pt_regs *regs)
2742 {
2743         struct perf_guest_info_callbacks *guest_cbs = perf_get_guest_cbs();
2744         int misc = 0;
2745
2746         if (guest_cbs && guest_cbs->is_in_guest()) {
2747                 if (guest_cbs->is_user_mode())
2748                         misc |= PERF_RECORD_MISC_GUEST_USER;
2749                 else
2750                         misc |= PERF_RECORD_MISC_GUEST_KERNEL;
2751         } else {
2752                 if (user_mode(regs))
2753                         misc |= PERF_RECORD_MISC_USER;
2754                 else
2755                         misc |= PERF_RECORD_MISC_KERNEL;
2756         }
2757
2758         if (regs->flags & PERF_EFLAGS_EXACT)
2759                 misc |= PERF_RECORD_MISC_EXACT_IP;
2760
2761         return misc;
2762 }
2763
2764 void perf_get_x86_pmu_capability(struct x86_pmu_capability *cap)
2765 {
2766         cap->version            = x86_pmu.version;
2767         cap->num_counters_gp    = x86_pmu.num_counters;
2768         cap->num_counters_fixed = x86_pmu.num_counters_fixed;
2769         cap->bit_width_gp       = x86_pmu.cntval_bits;
2770         cap->bit_width_fixed    = x86_pmu.cntval_bits;
2771         cap->events_mask        = (unsigned int)x86_pmu.events_maskl;
2772         cap->events_mask_len    = x86_pmu.events_mask_len;
2773 }
2774 EXPORT_SYMBOL_GPL(perf_get_x86_pmu_capability);