1 // SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (c) 2023 Rivos Inc
 *
 * Authors:
 *     Atish Patra <atishp@rivosinc.com>
 */
9 #define pr_fmt(fmt) "riscv-kvm-pmu: " fmt
10 #include <linux/errno.h>
11 #include <linux/err.h>
12 #include <linux/kvm_host.h>
13 #include <linux/perf/riscv_pmu.h>
15 #include <asm/kvm_vcpu_sbi.h>
16 #include <asm/kvm_vcpu_pmu.h>
17 #include <linux/bitops.h>
/* Total number of counters exposed to the guest: hardware + firmware counters */
#define kvm_pmu_num_counters(pmu) ((pmu)->num_hw_ctrs + (pmu)->num_fw_ctrs)
/* Extract the SBI event type field from an event index (masked, shifted down by 16) */
#define get_event_type(x) (((x) & SBI_PMU_EVENT_IDX_TYPE_MASK) >> 16)
/* Extract the SBI event code field (low bits) from an event index */
#define get_event_code(x) ((x) & SBI_PMU_EVENT_IDX_CODE_MASK)
23 static enum perf_hw_id hw_event_perf_map[SBI_PMU_HW_GENERAL_MAX] = {
24 [SBI_PMU_HW_CPU_CYCLES] = PERF_COUNT_HW_CPU_CYCLES,
25 [SBI_PMU_HW_INSTRUCTIONS] = PERF_COUNT_HW_INSTRUCTIONS,
26 [SBI_PMU_HW_CACHE_REFERENCES] = PERF_COUNT_HW_CACHE_REFERENCES,
27 [SBI_PMU_HW_CACHE_MISSES] = PERF_COUNT_HW_CACHE_MISSES,
28 [SBI_PMU_HW_BRANCH_INSTRUCTIONS] = PERF_COUNT_HW_BRANCH_INSTRUCTIONS,
29 [SBI_PMU_HW_BRANCH_MISSES] = PERF_COUNT_HW_BRANCH_MISSES,
30 [SBI_PMU_HW_BUS_CYCLES] = PERF_COUNT_HW_BUS_CYCLES,
31 [SBI_PMU_HW_STALLED_CYCLES_FRONTEND] = PERF_COUNT_HW_STALLED_CYCLES_FRONTEND,
32 [SBI_PMU_HW_STALLED_CYCLES_BACKEND] = PERF_COUNT_HW_STALLED_CYCLES_BACKEND,
33 [SBI_PMU_HW_REF_CPU_CYCLES] = PERF_COUNT_HW_REF_CPU_CYCLES,
36 static u64 kvm_pmu_get_sample_period(struct kvm_pmc *pmc)
38 u64 counter_val_mask = GENMASK(pmc->cinfo.width, 0);
41 if (!pmc->counter_val)
42 sample_period = counter_val_mask + 1;
44 sample_period = (-pmc->counter_val) & counter_val_mask;
49 static u32 kvm_pmu_get_perf_event_type(unsigned long eidx)
51 enum sbi_pmu_event_type etype = get_event_type(eidx);
52 u32 type = PERF_TYPE_MAX;
55 case SBI_PMU_EVENT_TYPE_HW:
56 type = PERF_TYPE_HARDWARE;
58 case SBI_PMU_EVENT_TYPE_CACHE:
59 type = PERF_TYPE_HW_CACHE;
61 case SBI_PMU_EVENT_TYPE_RAW:
62 case SBI_PMU_EVENT_TYPE_FW:
72 static bool kvm_pmu_is_fw_event(unsigned long eidx)
74 return get_event_type(eidx) == SBI_PMU_EVENT_TYPE_FW;
77 static void kvm_pmu_release_perf_event(struct kvm_pmc *pmc)
79 if (pmc->perf_event) {
80 perf_event_disable(pmc->perf_event);
81 perf_event_release_kernel(pmc->perf_event);
82 pmc->perf_event = NULL;
86 static u64 kvm_pmu_get_perf_event_hw_config(u32 sbi_event_code)
88 return hw_event_perf_map[sbi_event_code];
91 static u64 kvm_pmu_get_perf_event_cache_config(u32 sbi_event_code)
94 unsigned int cache_type, cache_op, cache_result;
96 /* All the cache event masks lie within 0xFF. No separate masking is necessary */
97 cache_type = (sbi_event_code & SBI_PMU_EVENT_CACHE_ID_CODE_MASK) >>
98 SBI_PMU_EVENT_CACHE_ID_SHIFT;
99 cache_op = (sbi_event_code & SBI_PMU_EVENT_CACHE_OP_ID_CODE_MASK) >>
100 SBI_PMU_EVENT_CACHE_OP_SHIFT;
101 cache_result = sbi_event_code & SBI_PMU_EVENT_CACHE_RESULT_ID_CODE_MASK;
103 if (cache_type >= PERF_COUNT_HW_CACHE_MAX ||
104 cache_op >= PERF_COUNT_HW_CACHE_OP_MAX ||
105 cache_result >= PERF_COUNT_HW_CACHE_RESULT_MAX)
108 config = cache_type | (cache_op << 8) | (cache_result << 16);
113 static u64 kvm_pmu_get_perf_event_config(unsigned long eidx, uint64_t evt_data)
115 enum sbi_pmu_event_type etype = get_event_type(eidx);
116 u32 ecode = get_event_code(eidx);
117 u64 config = U64_MAX;
120 case SBI_PMU_EVENT_TYPE_HW:
121 if (ecode < SBI_PMU_HW_GENERAL_MAX)
122 config = kvm_pmu_get_perf_event_hw_config(ecode);
124 case SBI_PMU_EVENT_TYPE_CACHE:
125 config = kvm_pmu_get_perf_event_cache_config(ecode);
127 case SBI_PMU_EVENT_TYPE_RAW:
128 config = evt_data & RISCV_PMU_RAW_EVENT_MASK;
130 case SBI_PMU_EVENT_TYPE_FW:
131 if (ecode < SBI_PMU_FW_MAX)
132 config = (1ULL << 63) | ecode;
141 static int kvm_pmu_get_fixed_pmc_index(unsigned long eidx)
143 u32 etype = kvm_pmu_get_perf_event_type(eidx);
144 u32 ecode = get_event_code(eidx);
146 if (etype != SBI_PMU_EVENT_TYPE_HW)
149 if (ecode == SBI_PMU_HW_CPU_CYCLES)
151 else if (ecode == SBI_PMU_HW_INSTRUCTIONS)
157 static int kvm_pmu_get_programmable_pmc_index(struct kvm_pmu *kvpmu, unsigned long eidx,
158 unsigned long cbase, unsigned long cmask)
164 if (kvm_pmu_is_fw_event(eidx)) {
165 /* Firmware counters are mapped 1:1 starting from num_hw_ctrs for simplicity */
166 min = kvpmu->num_hw_ctrs;
167 max = min + kvpmu->num_fw_ctrs;
169 /* First 3 counters are reserved for fixed counters */
171 max = kvpmu->num_hw_ctrs;
174 for_each_set_bit(i, &cmask, BITS_PER_LONG) {
176 if ((pmc_idx >= min && pmc_idx < max) &&
177 !test_bit(pmc_idx, kvpmu->pmc_in_use)) {
/*
 * Resolve an event index to a counter index: fixed counters first (they
 * have a dedicated mapping and width), programmable counters otherwise.
 */
static int pmu_get_pmc_index(struct kvm_pmu *pmu, unsigned long eidx,
			     unsigned long cbase, unsigned long cmask)
{
	int ret;

	/* Fixed counters need to have a fixed mapping as they have different width */
	ret = kvm_pmu_get_fixed_pmc_index(eidx);
	if (ret >= 0)
		return ret;

	return kvm_pmu_get_programmable_pmc_index(pmu, eidx, cbase, cmask);
}
199 static int pmu_ctr_read(struct kvm_vcpu *vcpu, unsigned long cidx,
200 unsigned long *out_val)
202 struct kvm_pmu *kvpmu = vcpu_to_pmu(vcpu);
204 u64 enabled, running;
207 pmc = &kvpmu->pmc[cidx];
209 if (pmc->cinfo.type == SBI_PMU_CTR_TYPE_FW) {
210 fevent_code = get_event_code(pmc->event_idx);
211 pmc->counter_val = kvpmu->fw_event[fevent_code].value;
212 } else if (pmc->perf_event) {
213 pmc->counter_val += perf_event_read_value(pmc->perf_event, &enabled, &running);
217 *out_val = pmc->counter_val;
222 static int kvm_pmu_validate_counter_mask(struct kvm_pmu *kvpmu, unsigned long ctr_base,
223 unsigned long ctr_mask)
225 /* Make sure the we have a valid counter mask requested from the caller */
226 if (!ctr_mask || (ctr_base + __fls(ctr_mask) >= kvm_pmu_num_counters(kvpmu)))
232 static int kvm_pmu_create_perf_event(struct kvm_pmc *pmc, struct perf_event_attr *attr,
233 unsigned long flags, unsigned long eidx, unsigned long evtdata)
235 struct perf_event *event;
237 kvm_pmu_release_perf_event(pmc);
238 attr->config = kvm_pmu_get_perf_event_config(eidx, evtdata);
239 if (flags & SBI_PMU_CFG_FLAG_CLEAR_VALUE) {
240 //TODO: Do we really want to clear the value in hardware counter
241 pmc->counter_val = 0;
245 * Set the default sample_period for now. The guest specified value
246 * will be updated in the start call.
248 attr->sample_period = kvm_pmu_get_sample_period(pmc);
250 event = perf_event_create_kernel_counter(attr, -1, current, NULL, pmc);
252 pr_err("kvm pmu event creation failed for eidx %lx: %ld\n", eidx, PTR_ERR(event));
253 return PTR_ERR(event);
256 pmc->perf_event = event;
257 if (flags & SBI_PMU_CFG_FLAG_AUTO_START)
258 perf_event_enable(pmc->perf_event);
263 int kvm_riscv_vcpu_pmu_incr_fw(struct kvm_vcpu *vcpu, unsigned long fid)
265 struct kvm_pmu *kvpmu = vcpu_to_pmu(vcpu);
266 struct kvm_fw_event *fevent;
268 if (!kvpmu || fid >= SBI_PMU_FW_MAX)
271 fevent = &kvpmu->fw_event[fid];
278 int kvm_riscv_vcpu_pmu_read_hpm(struct kvm_vcpu *vcpu, unsigned int csr_num,
279 unsigned long *val, unsigned long new_val,
280 unsigned long wr_mask)
282 struct kvm_pmu *kvpmu = vcpu_to_pmu(vcpu);
283 int cidx, ret = KVM_INSN_CONTINUE_NEXT_SEPC;
285 if (!kvpmu || !kvpmu->init_done) {
287 * In absence of sscofpmf in the platform, the guest OS may use
288 * the legacy PMU driver to read cycle/instret. In that case,
289 * just return 0 to avoid any illegal trap. However, any other
290 * hpmcounter access should result in illegal trap as they must
291 * be access through SBI PMU only.
293 if (csr_num == CSR_CYCLE || csr_num == CSR_INSTRET) {
297 return KVM_INSN_ILLEGAL_TRAP;
301 /* The counter CSR are read only. Thus, any write should result in illegal traps */
303 return KVM_INSN_ILLEGAL_TRAP;
305 cidx = csr_num - CSR_CYCLE;
307 if (pmu_ctr_read(vcpu, cidx, val) < 0)
308 return KVM_INSN_ILLEGAL_TRAP;
313 int kvm_riscv_vcpu_pmu_num_ctrs(struct kvm_vcpu *vcpu,
314 struct kvm_vcpu_sbi_return *retdata)
316 struct kvm_pmu *kvpmu = vcpu_to_pmu(vcpu);
318 retdata->out_val = kvm_pmu_num_counters(kvpmu);
323 int kvm_riscv_vcpu_pmu_ctr_info(struct kvm_vcpu *vcpu, unsigned long cidx,
324 struct kvm_vcpu_sbi_return *retdata)
326 struct kvm_pmu *kvpmu = vcpu_to_pmu(vcpu);
328 if (cidx > RISCV_KVM_MAX_COUNTERS || cidx == 1) {
329 retdata->err_val = SBI_ERR_INVALID_PARAM;
333 retdata->out_val = kvpmu->pmc[cidx].cinfo.value;
338 int kvm_riscv_vcpu_pmu_ctr_start(struct kvm_vcpu *vcpu, unsigned long ctr_base,
339 unsigned long ctr_mask, unsigned long flags, u64 ival,
340 struct kvm_vcpu_sbi_return *retdata)
342 struct kvm_pmu *kvpmu = vcpu_to_pmu(vcpu);
343 int i, pmc_index, sbiret = 0;
347 if (kvm_pmu_validate_counter_mask(kvpmu, ctr_base, ctr_mask) < 0) {
348 sbiret = SBI_ERR_INVALID_PARAM;
352 /* Start the counters that have been configured and requested by the guest */
353 for_each_set_bit(i, &ctr_mask, RISCV_MAX_COUNTERS) {
354 pmc_index = i + ctr_base;
355 if (!test_bit(pmc_index, kvpmu->pmc_in_use))
357 pmc = &kvpmu->pmc[pmc_index];
358 if (flags & SBI_PMU_START_FLAG_SET_INIT_VALUE)
359 pmc->counter_val = ival;
360 if (pmc->cinfo.type == SBI_PMU_CTR_TYPE_FW) {
361 fevent_code = get_event_code(pmc->event_idx);
362 if (fevent_code >= SBI_PMU_FW_MAX) {
363 sbiret = SBI_ERR_INVALID_PARAM;
367 /* Check if the counter was already started for some reason */
368 if (kvpmu->fw_event[fevent_code].started) {
369 sbiret = SBI_ERR_ALREADY_STARTED;
373 kvpmu->fw_event[fevent_code].started = true;
374 kvpmu->fw_event[fevent_code].value = pmc->counter_val;
375 } else if (pmc->perf_event) {
376 if (unlikely(pmc->started)) {
377 sbiret = SBI_ERR_ALREADY_STARTED;
380 perf_event_period(pmc->perf_event, kvm_pmu_get_sample_period(pmc));
381 perf_event_enable(pmc->perf_event);
384 sbiret = SBI_ERR_INVALID_PARAM;
389 retdata->err_val = sbiret;
394 int kvm_riscv_vcpu_pmu_ctr_stop(struct kvm_vcpu *vcpu, unsigned long ctr_base,
395 unsigned long ctr_mask, unsigned long flags,
396 struct kvm_vcpu_sbi_return *retdata)
398 struct kvm_pmu *kvpmu = vcpu_to_pmu(vcpu);
399 int i, pmc_index, sbiret = 0;
400 u64 enabled, running;
404 if (kvm_pmu_validate_counter_mask(kvpmu, ctr_base, ctr_mask) < 0) {
405 sbiret = SBI_ERR_INVALID_PARAM;
409 /* Stop the counters that have been configured and requested by the guest */
410 for_each_set_bit(i, &ctr_mask, RISCV_MAX_COUNTERS) {
411 pmc_index = i + ctr_base;
412 if (!test_bit(pmc_index, kvpmu->pmc_in_use))
414 pmc = &kvpmu->pmc[pmc_index];
415 if (pmc->cinfo.type == SBI_PMU_CTR_TYPE_FW) {
416 fevent_code = get_event_code(pmc->event_idx);
417 if (fevent_code >= SBI_PMU_FW_MAX) {
418 sbiret = SBI_ERR_INVALID_PARAM;
422 if (!kvpmu->fw_event[fevent_code].started)
423 sbiret = SBI_ERR_ALREADY_STOPPED;
425 kvpmu->fw_event[fevent_code].started = false;
426 } else if (pmc->perf_event) {
428 /* Stop counting the counter */
429 perf_event_disable(pmc->perf_event);
430 pmc->started = false;
432 sbiret = SBI_ERR_ALREADY_STOPPED;
435 if (flags & SBI_PMU_STOP_FLAG_RESET) {
436 /* Relase the counter if this is a reset request */
437 pmc->counter_val += perf_event_read_value(pmc->perf_event,
439 kvm_pmu_release_perf_event(pmc);
442 sbiret = SBI_ERR_INVALID_PARAM;
444 if (flags & SBI_PMU_STOP_FLAG_RESET) {
445 pmc->event_idx = SBI_PMU_EVENT_IDX_INVALID;
446 clear_bit(pmc_index, kvpmu->pmc_in_use);
451 retdata->err_val = sbiret;
456 int kvm_riscv_vcpu_pmu_ctr_cfg_match(struct kvm_vcpu *vcpu, unsigned long ctr_base,
457 unsigned long ctr_mask, unsigned long flags,
458 unsigned long eidx, u64 evtdata,
459 struct kvm_vcpu_sbi_return *retdata)
461 int ctr_idx, ret, sbiret = 0;
463 unsigned long event_code;
464 u32 etype = kvm_pmu_get_perf_event_type(eidx);
465 struct kvm_pmu *kvpmu = vcpu_to_pmu(vcpu);
466 struct kvm_pmc *pmc = NULL;
467 struct perf_event_attr attr = {
469 .size = sizeof(struct perf_event_attr),
472 * It should never reach here if the platform doesn't support the sscofpmf
473 * extension as mode filtering won't work without it.
475 .exclude_host = true,
477 .exclude_user = !!(flags & SBI_PMU_CFG_FLAG_SET_UINH),
478 .exclude_kernel = !!(flags & SBI_PMU_CFG_FLAG_SET_SINH),
479 .config1 = RISCV_PMU_CONFIG1_GUEST_EVENTS,
482 if (kvm_pmu_validate_counter_mask(kvpmu, ctr_base, ctr_mask) < 0) {
483 sbiret = SBI_ERR_INVALID_PARAM;
487 event_code = get_event_code(eidx);
488 is_fevent = kvm_pmu_is_fw_event(eidx);
489 if (is_fevent && event_code >= SBI_PMU_FW_MAX) {
490 sbiret = SBI_ERR_NOT_SUPPORTED;
495 * SKIP_MATCH flag indicates the caller is aware of the assigned counter
496 * for this event. Just do a sanity check if it already marked used.
498 if (flags & SBI_PMU_CFG_FLAG_SKIP_MATCH) {
499 if (!test_bit(ctr_base + __ffs(ctr_mask), kvpmu->pmc_in_use)) {
500 sbiret = SBI_ERR_FAILURE;
503 ctr_idx = ctr_base + __ffs(ctr_mask);
505 ctr_idx = pmu_get_pmc_index(kvpmu, eidx, ctr_base, ctr_mask);
507 sbiret = SBI_ERR_NOT_SUPPORTED;
512 pmc = &kvpmu->pmc[ctr_idx];
516 if (flags & SBI_PMU_CFG_FLAG_AUTO_START)
517 kvpmu->fw_event[event_code].started = true;
519 ret = kvm_pmu_create_perf_event(pmc, &attr, flags, eidx, evtdata);
524 set_bit(ctr_idx, kvpmu->pmc_in_use);
525 pmc->event_idx = eidx;
526 retdata->out_val = ctr_idx;
528 retdata->err_val = sbiret;
533 int kvm_riscv_vcpu_pmu_ctr_read(struct kvm_vcpu *vcpu, unsigned long cidx,
534 struct kvm_vcpu_sbi_return *retdata)
538 ret = pmu_ctr_read(vcpu, cidx, &retdata->out_val);
540 retdata->err_val = SBI_ERR_INVALID_PARAM;
545 void kvm_riscv_vcpu_pmu_init(struct kvm_vcpu *vcpu)
547 int i = 0, ret, num_hw_ctrs = 0, hpm_width = 0;
548 struct kvm_pmu *kvpmu = vcpu_to_pmu(vcpu);
552 * PMU functionality should be only available to guests if privilege mode
553 * filtering is available in the host. Otherwise, guest will always count
554 * events while the execution is in hypervisor mode.
556 if (!riscv_isa_extension_available(NULL, SSCOFPMF))
559 ret = riscv_pmu_get_hpm_info(&hpm_width, &num_hw_ctrs);
560 if (ret < 0 || !hpm_width || !num_hw_ctrs)
564 * Increase the number of hardware counters to offset the time counter.
566 kvpmu->num_hw_ctrs = num_hw_ctrs + 1;
567 kvpmu->num_fw_ctrs = SBI_PMU_FW_MAX;
568 memset(&kvpmu->fw_event, 0, SBI_PMU_FW_MAX * sizeof(struct kvm_fw_event));
570 if (kvpmu->num_hw_ctrs > RISCV_KVM_MAX_HW_CTRS) {
571 pr_warn_once("Limiting the hardware counters to 32 as specified by the ISA");
572 kvpmu->num_hw_ctrs = RISCV_KVM_MAX_HW_CTRS;
576 * There is no correlation between the logical hardware counter and virtual counters.
577 * However, we need to encode a hpmcounter CSR in the counter info field so that
578 * KVM can trap n emulate the read. This works well in the migration use case as
579 * KVM doesn't care if the actual hpmcounter is available in the hardware or not.
581 for (i = 0; i < kvm_pmu_num_counters(kvpmu); i++) {
582 /* TIME CSR shouldn't be read from perf interface */
585 pmc = &kvpmu->pmc[i];
587 pmc->event_idx = SBI_PMU_EVENT_IDX_INVALID;
588 if (i < kvpmu->num_hw_ctrs) {
589 pmc->cinfo.type = SBI_PMU_CTR_TYPE_HW;
591 /* CY, IR counters */
592 pmc->cinfo.width = 63;
594 pmc->cinfo.width = hpm_width;
596 * The CSR number doesn't have any relation with the logical
597 * hardware counters. The CSR numbers are encoded sequentially
598 * to avoid maintaining a map between the virtual counter
601 pmc->cinfo.csr = CSR_CYCLE + i;
603 pmc->cinfo.type = SBI_PMU_CTR_TYPE_FW;
604 pmc->cinfo.width = BITS_PER_LONG - 1;
608 kvpmu->init_done = true;
611 void kvm_riscv_vcpu_pmu_deinit(struct kvm_vcpu *vcpu)
613 struct kvm_pmu *kvpmu = vcpu_to_pmu(vcpu);
620 for_each_set_bit(i, kvpmu->pmc_in_use, RISCV_MAX_COUNTERS) {
621 pmc = &kvpmu->pmc[i];
622 pmc->counter_val = 0;
623 kvm_pmu_release_perf_event(pmc);
624 pmc->event_idx = SBI_PMU_EVENT_IDX_INVALID;
626 bitmap_zero(kvpmu->pmc_in_use, RISCV_MAX_COUNTERS);
627 memset(&kvpmu->fw_event, 0, SBI_PMU_FW_MAX * sizeof(struct kvm_fw_event));
/* vCPU reset simply tears the PMU down; it is re-initialized on next init. */
void kvm_riscv_vcpu_pmu_reset(struct kvm_vcpu *vcpu)
{
	kvm_riscv_vcpu_pmu_deinit(vcpu);
}