GNU Linux-libre 6.7.9-gnu
[releases.git] / arch / powerpc / perf / imc-pmu.c
1 // SPDX-License-Identifier: GPL-2.0-or-later
2 /*
3  * In-Memory Collection (IMC) Performance Monitor counter support.
4  *
5  * Copyright (C) 2017 Madhavan Srinivasan, IBM Corporation.
6  *           (C) 2017 Anju T Sudhakar, IBM Corporation.
7  *           (C) 2017 Hemant K Shaw, IBM Corporation.
8  */
9 #include <linux/of.h>
10 #include <linux/perf_event.h>
11 #include <linux/slab.h>
12 #include <asm/opal.h>
13 #include <asm/imc-pmu.h>
14 #include <asm/cputhreads.h>
15 #include <asm/smp.h>
16 #include <linux/string.h>
17 #include <linux/spinlock.h>
18
19 /* Nest IMC data structures and variables */
20
21 /*
22  * Used to avoid races in counting the nest-pmu units during hotplug
23  * register and unregister
24  */
25 static DEFINE_MUTEX(nest_init_lock);
26 static DEFINE_PER_CPU(struct imc_pmu_ref *, local_nest_imc_refc);
27 static struct imc_pmu **per_nest_pmu_arr;
28 static cpumask_t nest_imc_cpumask;
29 static struct imc_pmu_ref *nest_imc_refc;
30 static int nest_pmus;
31
32 /* Core IMC data structures and variables */
33
34 static cpumask_t core_imc_cpumask;
35 static struct imc_pmu_ref *core_imc_refc;
36 static struct imc_pmu *core_imc_pmu;
37
38 /* Thread IMC data structures and variables */
39
40 static DEFINE_PER_CPU(u64 *, thread_imc_mem);
41 static struct imc_pmu *thread_imc_pmu;
42 static int thread_imc_mem_size;
43
44 /* Trace IMC data structures */
45 static DEFINE_PER_CPU(u64 *, trace_imc_mem);
46 static struct imc_pmu_ref *trace_imc_refc;
47 static int trace_imc_mem_size;
48
49 /*
50  * Global data structure used to avoid races between thread,
51  * core and trace-imc
52  */
53 static struct imc_pmu_ref imc_global_refc = {
54         .lock = __SPIN_LOCK_UNLOCKED(imc_global_refc.lock),
55         .id = 0,
56         .refc = 0,
57 };
58
59 static struct imc_pmu *imc_event_to_pmu(struct perf_event *event)
60 {
61         return container_of(event->pmu, struct imc_pmu, pmu);
62 }
63
64 PMU_FORMAT_ATTR(event, "config:0-61");
65 PMU_FORMAT_ATTR(offset, "config:0-31");
66 PMU_FORMAT_ATTR(rvalue, "config:32");
67 PMU_FORMAT_ATTR(mode, "config:33-40");
68 static struct attribute *imc_format_attrs[] = {
69         &format_attr_event.attr,
70         &format_attr_offset.attr,
71         &format_attr_rvalue.attr,
72         &format_attr_mode.attr,
73         NULL,
74 };
75
76 static const struct attribute_group imc_format_group = {
77         .name = "format",
78         .attrs = imc_format_attrs,
79 };
80
81 /* Format attribute for imc trace-mode */
82 PMU_FORMAT_ATTR(cpmc_reserved, "config:0-19");
83 PMU_FORMAT_ATTR(cpmc_event, "config:20-27");
84 PMU_FORMAT_ATTR(cpmc_samplesel, "config:28-29");
85 PMU_FORMAT_ATTR(cpmc_load, "config:30-61");
86 static struct attribute *trace_imc_format_attrs[] = {
87         &format_attr_event.attr,
88         &format_attr_cpmc_reserved.attr,
89         &format_attr_cpmc_event.attr,
90         &format_attr_cpmc_samplesel.attr,
91         &format_attr_cpmc_load.attr,
92         NULL,
93 };
94
95 static const struct attribute_group trace_imc_format_group = {
96 .name = "format",
97 .attrs = trace_imc_format_attrs,
98 };
99
100 /* Get the cpumask printed to a buffer "buf" */
101 static ssize_t imc_pmu_cpumask_get_attr(struct device *dev,
102                                         struct device_attribute *attr,
103                                         char *buf)
104 {
105         struct pmu *pmu = dev_get_drvdata(dev);
106         struct imc_pmu *imc_pmu = container_of(pmu, struct imc_pmu, pmu);
107         cpumask_t *active_mask;
108
109         switch(imc_pmu->domain){
110         case IMC_DOMAIN_NEST:
111                 active_mask = &nest_imc_cpumask;
112                 break;
113         case IMC_DOMAIN_CORE:
114                 active_mask = &core_imc_cpumask;
115                 break;
116         default:
117                 return 0;
118         }
119
120         return cpumap_print_to_pagebuf(true, buf, active_mask);
121 }
122
123 static DEVICE_ATTR(cpumask, S_IRUGO, imc_pmu_cpumask_get_attr, NULL);
124
125 static struct attribute *imc_pmu_cpumask_attrs[] = {
126         &dev_attr_cpumask.attr,
127         NULL,
128 };
129
130 static const struct attribute_group imc_pmu_cpumask_attr_group = {
131         .attrs = imc_pmu_cpumask_attrs,
132 };
133
134 /* device_str_attr_create : Populate event "name" and string "str" in attribute */
135 static struct attribute *device_str_attr_create(const char *name, const char *str)
136 {
137         struct perf_pmu_events_attr *attr;
138
139         attr = kzalloc(sizeof(*attr), GFP_KERNEL);
140         if (!attr)
141                 return NULL;
142         sysfs_attr_init(&attr->attr.attr);
143
144         attr->event_str = str;
145         attr->attr.attr.name = name;
146         attr->attr.attr.mode = 0444;
147         attr->attr.show = perf_event_sysfs_show;
148
149         return &attr->attr.attr;
150 }
151
152 static int imc_parse_event(struct device_node *np, const char *scale,
153                                   const char *unit, const char *prefix,
154                                   u32 base, struct imc_events *event)
155 {
156         const char *s;
157         u32 reg;
158
159         if (of_property_read_u32(np, "reg", &reg))
160                 goto error;
161         /* Add the base_reg value to the "reg" */
162         event->value = base + reg;
163
164         if (of_property_read_string(np, "event-name", &s))
165                 goto error;
166
167         event->name = kasprintf(GFP_KERNEL, "%s%s", prefix, s);
168         if (!event->name)
169                 goto error;
170
171         if (of_property_read_string(np, "scale", &s))
172                 s = scale;
173
174         if (s) {
175                 event->scale = kstrdup(s, GFP_KERNEL);
176                 if (!event->scale)
177                         goto error;
178         }
179
180         if (of_property_read_string(np, "unit", &s))
181                 s = unit;
182
183         if (s) {
184                 event->unit = kstrdup(s, GFP_KERNEL);
185                 if (!event->unit)
186                         goto error;
187         }
188
189         return 0;
190 error:
191         kfree(event->unit);
192         kfree(event->scale);
193         kfree(event->name);
194         return -EINVAL;
195 }
196
197 /*
198  * imc_free_events: Function to cleanup the events list, having
199  *                  "nr_entries".
200  */
201 static void imc_free_events(struct imc_events *events, int nr_entries)
202 {
203         int i;
204
205         /* Nothing to clean, return */
206         if (!events)
207                 return;
208         for (i = 0; i < nr_entries; i++) {
209                 kfree(events[i].unit);
210                 kfree(events[i].scale);
211                 kfree(events[i].name);
212         }
213
214         kfree(events);
215 }
216
217 /*
218  * update_events_in_group: Update the "events" information in an attr_group
219  *                         and assign the attr_group to the pmu "pmu".
220  */
221 static int update_events_in_group(struct device_node *node, struct imc_pmu *pmu)
222 {
223         struct attribute_group *attr_group;
224         struct attribute **attrs, *dev_str;
225         struct device_node *np, *pmu_events;
226         u32 handle, base_reg;
227         int i = 0, j = 0, ct, ret;
228         const char *prefix, *g_scale, *g_unit;
229         const char *ev_val_str, *ev_scale_str, *ev_unit_str;
230
231         if (!of_property_read_u32(node, "events", &handle))
232                 pmu_events = of_find_node_by_phandle(handle);
233         else
234                 return 0;
235
236         /* Did not find any node with a given phandle */
237         if (!pmu_events)
238                 return 0;
239
240         /* Get a count of number of child nodes */
241         ct = of_get_child_count(pmu_events);
242
243         /* Get the event prefix */
244         if (of_property_read_string(node, "events-prefix", &prefix)) {
245                 of_node_put(pmu_events);
246                 return 0;
247         }
248
249         /* Get a global unit and scale data if available */
250         if (of_property_read_string(node, "scale", &g_scale))
251                 g_scale = NULL;
252
253         if (of_property_read_string(node, "unit", &g_unit))
254                 g_unit = NULL;
255
256         /* "reg" property gives out the base offset of the counters data */
257         of_property_read_u32(node, "reg", &base_reg);
258
259         /* Allocate memory for the events */
260         pmu->events = kcalloc(ct, sizeof(struct imc_events), GFP_KERNEL);
261         if (!pmu->events) {
262                 of_node_put(pmu_events);
263                 return -ENOMEM;
264         }
265
266         ct = 0;
267         /* Parse the events and update the struct */
268         for_each_child_of_node(pmu_events, np) {
269                 ret = imc_parse_event(np, g_scale, g_unit, prefix, base_reg, &pmu->events[ct]);
270                 if (!ret)
271                         ct++;
272         }
273
274         of_node_put(pmu_events);
275
276         /* Allocate memory for attribute group */
277         attr_group = kzalloc(sizeof(*attr_group), GFP_KERNEL);
278         if (!attr_group) {
279                 imc_free_events(pmu->events, ct);
280                 return -ENOMEM;
281         }
282
283         /*
284          * Allocate memory for attributes.
285          * Since we have count of events for this pmu, we also allocate
286          * memory for the scale and unit attribute for now.
287          * "ct" has the total event structs added from the events-parent node.
288          * So allocate three times the "ct" (this includes event, event_scale and
289          * event_unit).
290          */
291         attrs = kcalloc(((ct * 3) + 1), sizeof(struct attribute *), GFP_KERNEL);
292         if (!attrs) {
293                 kfree(attr_group);
294                 imc_free_events(pmu->events, ct);
295                 return -ENOMEM;
296         }
297
298         attr_group->name = "events";
299         attr_group->attrs = attrs;
300         do {
301                 ev_val_str = kasprintf(GFP_KERNEL, "event=0x%x", pmu->events[i].value);
302                 if (!ev_val_str)
303                         continue;
304                 dev_str = device_str_attr_create(pmu->events[i].name, ev_val_str);
305                 if (!dev_str)
306                         continue;
307
308                 attrs[j++] = dev_str;
309                 if (pmu->events[i].scale) {
310                         ev_scale_str = kasprintf(GFP_KERNEL, "%s.scale", pmu->events[i].name);
311                         if (!ev_scale_str)
312                                 continue;
313                         dev_str = device_str_attr_create(ev_scale_str, pmu->events[i].scale);
314                         if (!dev_str)
315                                 continue;
316
317                         attrs[j++] = dev_str;
318                 }
319
320                 if (pmu->events[i].unit) {
321                         ev_unit_str = kasprintf(GFP_KERNEL, "%s.unit", pmu->events[i].name);
322                         if (!ev_unit_str)
323                                 continue;
324                         dev_str = device_str_attr_create(ev_unit_str, pmu->events[i].unit);
325                         if (!dev_str)
326                                 continue;
327
328                         attrs[j++] = dev_str;
329                 }
330         } while (++i < ct);
331
332         /* Save the event attribute */
333         pmu->attr_groups[IMC_EVENT_ATTR] = attr_group;
334
335         return 0;
336 }
337
338 /* get_nest_pmu_ref: Return the imc_pmu_ref struct for the given node */
339 static struct imc_pmu_ref *get_nest_pmu_ref(int cpu)
340 {
341         return per_cpu(local_nest_imc_refc, cpu);
342 }
343
344 static void nest_change_cpu_context(int old_cpu, int new_cpu)
345 {
346         struct imc_pmu **pn = per_nest_pmu_arr;
347
348         if (old_cpu < 0 || new_cpu < 0)
349                 return;
350
351         while (*pn) {
352                 perf_pmu_migrate_context(&(*pn)->pmu, old_cpu, new_cpu);
353                 pn++;
354         }
355 }
356
357 static int ppc_nest_imc_cpu_offline(unsigned int cpu)
358 {
359         int nid, target = -1;
360         const struct cpumask *l_cpumask;
361         struct imc_pmu_ref *ref;
362
363         /*
364          * Check in the designated list for this cpu. Dont bother
365          * if not one of them.
366          */
367         if (!cpumask_test_and_clear_cpu(cpu, &nest_imc_cpumask))
368                 return 0;
369
370         /*
371          * Check whether nest_imc is registered. We could end up here if the
372          * cpuhotplug callback registration fails. i.e, callback invokes the
373          * offline path for all successfully registered nodes. At this stage,
374          * nest_imc pmu will not be registered and we should return here.
375          *
376          * We return with a zero since this is not an offline failure. And
377          * cpuhp_setup_state() returns the actual failure reason to the caller,
378          * which in turn will call the cleanup routine.
379          */
380         if (!nest_pmus)
381                 return 0;
382
383         /*
384          * Now that this cpu is one of the designated,
385          * find a next cpu a) which is online and b) in same chip.
386          */
387         nid = cpu_to_node(cpu);
388         l_cpumask = cpumask_of_node(nid);
389         target = cpumask_last(l_cpumask);
390
391         /*
392          * If this(target) is the last cpu in the cpumask for this chip,
393          * check for any possible online cpu in the chip.
394          */
395         if (unlikely(target == cpu))
396                 target = cpumask_any_but(l_cpumask, cpu);
397
398         /*
399          * Update the cpumask with the target cpu and
400          * migrate the context if needed
401          */
402         if (target >= 0 && target < nr_cpu_ids) {
403                 cpumask_set_cpu(target, &nest_imc_cpumask);
404                 nest_change_cpu_context(cpu, target);
405         } else {
406                 opal_imc_counters_stop(OPAL_IMC_COUNTERS_NEST,
407                                        get_hard_smp_processor_id(cpu));
408                 /*
409                  * If this is the last cpu in this chip then, skip the reference
410                  * count lock and make the reference count on this chip zero.
411                  */
412                 ref = get_nest_pmu_ref(cpu);
413                 if (!ref)
414                         return -EINVAL;
415
416                 ref->refc = 0;
417         }
418         return 0;
419 }
420
421 static int ppc_nest_imc_cpu_online(unsigned int cpu)
422 {
423         const struct cpumask *l_cpumask;
424         static struct cpumask tmp_mask;
425         int res;
426
427         /* Get the cpumask of this node */
428         l_cpumask = cpumask_of_node(cpu_to_node(cpu));
429
430         /*
431          * If this is not the first online CPU on this node, then
432          * just return.
433          */
434         if (cpumask_and(&tmp_mask, l_cpumask, &nest_imc_cpumask))
435                 return 0;
436
437         /*
438          * If this is the first online cpu on this node
439          * disable the nest counters by making an OPAL call.
440          */
441         res = opal_imc_counters_stop(OPAL_IMC_COUNTERS_NEST,
442                                      get_hard_smp_processor_id(cpu));
443         if (res)
444                 return res;
445
446         /* Make this CPU the designated target for counter collection */
447         cpumask_set_cpu(cpu, &nest_imc_cpumask);
448         return 0;
449 }
450
451 static int nest_pmu_cpumask_init(void)
452 {
453         return cpuhp_setup_state(CPUHP_AP_PERF_POWERPC_NEST_IMC_ONLINE,
454                                  "perf/powerpc/imc:online",
455                                  ppc_nest_imc_cpu_online,
456                                  ppc_nest_imc_cpu_offline);
457 }
458
459 static void nest_imc_counters_release(struct perf_event *event)
460 {
461         int rc, node_id;
462         struct imc_pmu_ref *ref;
463
464         if (event->cpu < 0)
465                 return;
466
467         node_id = cpu_to_node(event->cpu);
468
469         /*
470          * See if we need to disable the nest PMU.
471          * If no events are currently in use, then we have to take a
472          * lock to ensure that we don't race with another task doing
473          * enable or disable the nest counters.
474          */
475         ref = get_nest_pmu_ref(event->cpu);
476         if (!ref)
477                 return;
478
479         /* Take the lock for this node and then decrement the reference count */
480         spin_lock(&ref->lock);
481         if (ref->refc == 0) {
482                 /*
483                  * The scenario where this is true is, when perf session is
484                  * started, followed by offlining of all cpus in a given node.
485                  *
486                  * In the cpuhotplug offline path, ppc_nest_imc_cpu_offline()
487                  * function set the ref->count to zero, if the cpu which is
488                  * about to offline is the last cpu in a given node and make
489                  * an OPAL call to disable the engine in that node.
490                  *
491                  */
492                 spin_unlock(&ref->lock);
493                 return;
494         }
495         ref->refc--;
496         if (ref->refc == 0) {
497                 rc = opal_imc_counters_stop(OPAL_IMC_COUNTERS_NEST,
498                                             get_hard_smp_processor_id(event->cpu));
499                 if (rc) {
500                         spin_unlock(&ref->lock);
501                         pr_err("nest-imc: Unable to stop the counters for core %d\n", node_id);
502                         return;
503                 }
504         } else if (ref->refc < 0) {
505                 WARN(1, "nest-imc: Invalid event reference count\n");
506                 ref->refc = 0;
507         }
508         spin_unlock(&ref->lock);
509 }
510
511 static int nest_imc_event_init(struct perf_event *event)
512 {
513         int chip_id, rc, node_id;
514         u32 l_config, config = event->attr.config;
515         struct imc_mem_info *pcni;
516         struct imc_pmu *pmu;
517         struct imc_pmu_ref *ref;
518         bool flag = false;
519
520         if (event->attr.type != event->pmu->type)
521                 return -ENOENT;
522
523         /* Sampling not supported */
524         if (event->hw.sample_period)
525                 return -EINVAL;
526
527         if (event->cpu < 0)
528                 return -EINVAL;
529
530         pmu = imc_event_to_pmu(event);
531
532         /* Sanity check for config (event offset) */
533         if ((config & IMC_EVENT_OFFSET_MASK) > pmu->counter_mem_size)
534                 return -EINVAL;
535
536         /*
537          * Nest HW counter memory resides in a per-chip reserve-memory (HOMER).
538          * Get the base memory address for this cpu.
539          */
540         chip_id = cpu_to_chip_id(event->cpu);
541
542         /* Return, if chip_id is not valid */
543         if (chip_id < 0)
544                 return -ENODEV;
545
546         pcni = pmu->mem_info;
547         do {
548                 if (pcni->id == chip_id) {
549                         flag = true;
550                         break;
551                 }
552                 pcni++;
553         } while (pcni->vbase);
554
555         if (!flag)
556                 return -ENODEV;
557
558         /*
559          * Add the event offset to the base address.
560          */
561         l_config = config & IMC_EVENT_OFFSET_MASK;
562         event->hw.event_base = (u64)pcni->vbase + l_config;
563         node_id = cpu_to_node(event->cpu);
564
565         /*
566          * Get the imc_pmu_ref struct for this node.
567          * Take the lock and then increment the count of nest pmu events inited.
568          */
569         ref = get_nest_pmu_ref(event->cpu);
570         if (!ref)
571                 return -EINVAL;
572
573         spin_lock(&ref->lock);
574         if (ref->refc == 0) {
575                 rc = opal_imc_counters_start(OPAL_IMC_COUNTERS_NEST,
576                                              get_hard_smp_processor_id(event->cpu));
577                 if (rc) {
578                         spin_unlock(&ref->lock);
579                         pr_err("nest-imc: Unable to start the counters for node %d\n",
580                                                                         node_id);
581                         return rc;
582                 }
583         }
584         ++ref->refc;
585         spin_unlock(&ref->lock);
586
587         event->destroy = nest_imc_counters_release;
588         return 0;
589 }
590
591 /*
592  * core_imc_mem_init : Initializes memory for the current core.
593  *
594  * Uses alloc_pages_node() and uses the returned address as an argument to
595  * an opal call to configure the pdbar. The address sent as an argument is
596  * converted to physical address before the opal call is made. This is the
597  * base address at which the core imc counters are populated.
598  */
599 static int core_imc_mem_init(int cpu, int size)
600 {
601         int nid, rc = 0, core_id = (cpu / threads_per_core);
602         struct imc_mem_info *mem_info;
603         struct page *page;
604
605         /*
606          * alloc_pages_node() will allocate memory for core in the
607          * local node only.
608          */
609         nid = cpu_to_node(cpu);
610         mem_info = &core_imc_pmu->mem_info[core_id];
611         mem_info->id = core_id;
612
613         /* We need only vbase for core counters */
614         page = alloc_pages_node(nid,
615                                 GFP_KERNEL | __GFP_ZERO | __GFP_THISNODE |
616                                 __GFP_NOWARN, get_order(size));
617         if (!page)
618                 return -ENOMEM;
619         mem_info->vbase = page_address(page);
620
621         core_imc_refc[core_id].id = core_id;
622         spin_lock_init(&core_imc_refc[core_id].lock);
623
624         rc = opal_imc_counters_init(OPAL_IMC_COUNTERS_CORE,
625                                 __pa((void *)mem_info->vbase),
626                                 get_hard_smp_processor_id(cpu));
627         if (rc) {
628                 free_pages((u64)mem_info->vbase, get_order(size));
629                 mem_info->vbase = NULL;
630         }
631
632         return rc;
633 }
634
635 static bool is_core_imc_mem_inited(int cpu)
636 {
637         struct imc_mem_info *mem_info;
638         int core_id = (cpu / threads_per_core);
639
640         mem_info = &core_imc_pmu->mem_info[core_id];
641         if (!mem_info->vbase)
642                 return false;
643
644         return true;
645 }
646
647 static int ppc_core_imc_cpu_online(unsigned int cpu)
648 {
649         const struct cpumask *l_cpumask;
650         static struct cpumask tmp_mask;
651         int ret = 0;
652
653         /* Get the cpumask for this core */
654         l_cpumask = cpu_sibling_mask(cpu);
655
656         /* If a cpu for this core is already set, then, don't do anything */
657         if (cpumask_and(&tmp_mask, l_cpumask, &core_imc_cpumask))
658                 return 0;
659
660         if (!is_core_imc_mem_inited(cpu)) {
661                 ret = core_imc_mem_init(cpu, core_imc_pmu->counter_mem_size);
662                 if (ret) {
663                         pr_info("core_imc memory allocation for cpu %d failed\n", cpu);
664                         return ret;
665                 }
666         }
667
668         /* set the cpu in the mask */
669         cpumask_set_cpu(cpu, &core_imc_cpumask);
670         return 0;
671 }
672
673 static int ppc_core_imc_cpu_offline(unsigned int cpu)
674 {
675         unsigned int core_id;
676         int ncpu;
677         struct imc_pmu_ref *ref;
678
679         /*
680          * clear this cpu out of the mask, if not present in the mask,
681          * don't bother doing anything.
682          */
683         if (!cpumask_test_and_clear_cpu(cpu, &core_imc_cpumask))
684                 return 0;
685
686         /*
687          * Check whether core_imc is registered. We could end up here
688          * if the cpuhotplug callback registration fails. i.e, callback
689          * invokes the offline path for all successfully registered cpus.
690          * At this stage, core_imc pmu will not be registered and we
691          * should return here.
692          *
693          * We return with a zero since this is not an offline failure.
694          * And cpuhp_setup_state() returns the actual failure reason
695          * to the caller, which inturn will call the cleanup routine.
696          */
697         if (!core_imc_pmu->pmu.event_init)
698                 return 0;
699
700         /* Find any online cpu in that core except the current "cpu" */
701         ncpu = cpumask_last(cpu_sibling_mask(cpu));
702
703         if (unlikely(ncpu == cpu))
704                 ncpu = cpumask_any_but(cpu_sibling_mask(cpu), cpu);
705
706         if (ncpu >= 0 && ncpu < nr_cpu_ids) {
707                 cpumask_set_cpu(ncpu, &core_imc_cpumask);
708                 perf_pmu_migrate_context(&core_imc_pmu->pmu, cpu, ncpu);
709         } else {
710                 /*
711                  * If this is the last cpu in this core then skip taking reference
712                  * count lock for this core and directly zero "refc" for this core.
713                  */
714                 opal_imc_counters_stop(OPAL_IMC_COUNTERS_CORE,
715                                        get_hard_smp_processor_id(cpu));
716                 core_id = cpu / threads_per_core;
717                 ref = &core_imc_refc[core_id];
718                 if (!ref)
719                         return -EINVAL;
720
721                 ref->refc = 0;
722                 /*
723                  * Reduce the global reference count, if this is the
724                  * last cpu in this core and core-imc event running
725                  * in this cpu.
726                  */
727                 spin_lock(&imc_global_refc.lock);
728                 if (imc_global_refc.id == IMC_DOMAIN_CORE)
729                         imc_global_refc.refc--;
730
731                 spin_unlock(&imc_global_refc.lock);
732         }
733         return 0;
734 }
735
736 static int core_imc_pmu_cpumask_init(void)
737 {
738         return cpuhp_setup_state(CPUHP_AP_PERF_POWERPC_CORE_IMC_ONLINE,
739                                  "perf/powerpc/imc_core:online",
740                                  ppc_core_imc_cpu_online,
741                                  ppc_core_imc_cpu_offline);
742 }
743
744 static void reset_global_refc(struct perf_event *event)
745 {
746                 spin_lock(&imc_global_refc.lock);
747                 imc_global_refc.refc--;
748
749                 /*
750                  * If no other thread is running any
751                  * event for this domain(thread/core/trace),
752                  * set the global id to zero.
753                  */
754                 if (imc_global_refc.refc <= 0) {
755                         imc_global_refc.refc = 0;
756                         imc_global_refc.id = 0;
757                 }
758                 spin_unlock(&imc_global_refc.lock);
759 }
760
761 static void core_imc_counters_release(struct perf_event *event)
762 {
763         int rc, core_id;
764         struct imc_pmu_ref *ref;
765
766         if (event->cpu < 0)
767                 return;
768         /*
769          * See if we need to disable the IMC PMU.
770          * If no events are currently in use, then we have to take a
771          * lock to ensure that we don't race with another task doing
772          * enable or disable the core counters.
773          */
774         core_id = event->cpu / threads_per_core;
775
776         /* Take the lock and decrement the refernce count for this core */
777         ref = &core_imc_refc[core_id];
778         if (!ref)
779                 return;
780
781         spin_lock(&ref->lock);
782         if (ref->refc == 0) {
783                 /*
784                  * The scenario where this is true is, when perf session is
785                  * started, followed by offlining of all cpus in a given core.
786                  *
787                  * In the cpuhotplug offline path, ppc_core_imc_cpu_offline()
788                  * function set the ref->count to zero, if the cpu which is
789                  * about to offline is the last cpu in a given core and make
790                  * an OPAL call to disable the engine in that core.
791                  *
792                  */
793                 spin_unlock(&ref->lock);
794                 return;
795         }
796         ref->refc--;
797         if (ref->refc == 0) {
798                 rc = opal_imc_counters_stop(OPAL_IMC_COUNTERS_CORE,
799                                             get_hard_smp_processor_id(event->cpu));
800                 if (rc) {
801                         spin_unlock(&ref->lock);
802                         pr_err("IMC: Unable to stop the counters for core %d\n", core_id);
803                         return;
804                 }
805         } else if (ref->refc < 0) {
806                 WARN(1, "core-imc: Invalid event reference count\n");
807                 ref->refc = 0;
808         }
809         spin_unlock(&ref->lock);
810
811         reset_global_refc(event);
812 }
813
814 static int core_imc_event_init(struct perf_event *event)
815 {
816         int core_id, rc;
817         u64 config = event->attr.config;
818         struct imc_mem_info *pcmi;
819         struct imc_pmu *pmu;
820         struct imc_pmu_ref *ref;
821
822         if (event->attr.type != event->pmu->type)
823                 return -ENOENT;
824
825         /* Sampling not supported */
826         if (event->hw.sample_period)
827                 return -EINVAL;
828
829         if (event->cpu < 0)
830                 return -EINVAL;
831
832         event->hw.idx = -1;
833         pmu = imc_event_to_pmu(event);
834
835         /* Sanity check for config (event offset) */
836         if (((config & IMC_EVENT_OFFSET_MASK) > pmu->counter_mem_size))
837                 return -EINVAL;
838
839         if (!is_core_imc_mem_inited(event->cpu))
840                 return -ENODEV;
841
842         core_id = event->cpu / threads_per_core;
843         pcmi = &core_imc_pmu->mem_info[core_id];
844         if ((!pcmi->vbase))
845                 return -ENODEV;
846
847         ref = &core_imc_refc[core_id];
848         if (!ref)
849                 return -EINVAL;
850
851         /*
852          * Core pmu units are enabled only when it is used.
853          * See if this is triggered for the first time.
854          * If yes, take the lock and enable the core counters.
855          * If not, just increment the count in core_imc_refc struct.
856          */
857         spin_lock(&ref->lock);
858         if (ref->refc == 0) {
859                 rc = opal_imc_counters_start(OPAL_IMC_COUNTERS_CORE,
860                                              get_hard_smp_processor_id(event->cpu));
861                 if (rc) {
862                         spin_unlock(&ref->lock);
863                         pr_err("core-imc: Unable to start the counters for core %d\n",
864                                                                         core_id);
865                         return rc;
866                 }
867         }
868         ++ref->refc;
869         spin_unlock(&ref->lock);
870
871         /*
872          * Since the system can run either in accumulation or trace-mode
873          * of IMC at a time, core-imc events are allowed only if no other
874          * trace/thread imc events are enabled/monitored.
875          *
876          * Take the global lock, and check the refc.id
877          * to know whether any other trace/thread imc
878          * events are running.
879          */
880         spin_lock(&imc_global_refc.lock);
881         if (imc_global_refc.id == 0 || imc_global_refc.id == IMC_DOMAIN_CORE) {
882                 /*
883                  * No other trace/thread imc events are running in
884                  * the system, so set the refc.id to core-imc.
885                  */
886                 imc_global_refc.id = IMC_DOMAIN_CORE;
887                 imc_global_refc.refc++;
888         } else {
889                 spin_unlock(&imc_global_refc.lock);
890                 return -EBUSY;
891         }
892         spin_unlock(&imc_global_refc.lock);
893
894         event->hw.event_base = (u64)pcmi->vbase + (config & IMC_EVENT_OFFSET_MASK);
895         event->destroy = core_imc_counters_release;
896         return 0;
897 }
898
899 /*
900  * Allocates a page of memory for each of the online cpus, and load
901  * LDBAR with 0.
902  * The physical base address of the page allocated for a cpu will be
903  * written to the LDBAR for that cpu, when the thread-imc event
904  * is added.
905  *
906  * LDBAR Register Layout:
907  *
908  *  0          4         8         12        16        20        24        28
909  * | - - - - | - - - - | - - - - | - - - - | - - - - | - - - - | - - - - | - - - - |
910  *   | |       [   ]    [                   Counter Address [8:50]
911  *   | * Mode    |
912  *   |           * PB Scope
913  *   * Enable/Disable
914  *
915  *  32        36        40        44        48        52        56        60
916  * | - - - - | - - - - | - - - - | - - - - | - - - - | - - - - | - - - - | - - - - |
917  *           Counter Address [8:50]              ]
918  *
919  */
920 static int thread_imc_mem_alloc(int cpu_id, int size)
921 {
922         u64 *local_mem = per_cpu(thread_imc_mem, cpu_id);
923         int nid = cpu_to_node(cpu_id);
924
925         if (!local_mem) {
926                 struct page *page;
927                 /*
928                  * This case could happen only once at start, since we dont
929                  * free the memory in cpu offline path.
930                  */
931                 page = alloc_pages_node(nid,
932                                   GFP_KERNEL | __GFP_ZERO | __GFP_THISNODE |
933                                   __GFP_NOWARN, get_order(size));
934                 if (!page)
935                         return -ENOMEM;
936                 local_mem = page_address(page);
937
938                 per_cpu(thread_imc_mem, cpu_id) = local_mem;
939         }
940
941         mtspr(SPRN_LDBAR, 0);
942         return 0;
943 }
944
945 static int ppc_thread_imc_cpu_online(unsigned int cpu)
946 {
947         return thread_imc_mem_alloc(cpu, thread_imc_mem_size);
948 }
949
950 static int ppc_thread_imc_cpu_offline(unsigned int cpu)
951 {
952         /*
953          * Set the bit 0 of LDBAR to zero.
954          *
955          * If bit 0 of LDBAR is unset, it will stop posting
956          * the counter data to memory.
957          * For thread-imc, bit 0 of LDBAR will be set to 1 in the
958          * event_add function. So reset this bit here, to stop the updates
959          * to memory in the cpu_offline path.
960          */
961         mtspr(SPRN_LDBAR, (mfspr(SPRN_LDBAR) & (~(1UL << 63))));
962
963         /* Reduce the refc if thread-imc event running on this cpu */
964         spin_lock(&imc_global_refc.lock);
965         if (imc_global_refc.id == IMC_DOMAIN_THREAD)
966                 imc_global_refc.refc--;
967         spin_unlock(&imc_global_refc.lock);
968
969         return 0;
970 }
971
972 static int thread_imc_cpu_init(void)
973 {
974         return cpuhp_setup_state(CPUHP_AP_PERF_POWERPC_THREAD_IMC_ONLINE,
975                           "perf/powerpc/imc_thread:online",
976                           ppc_thread_imc_cpu_online,
977                           ppc_thread_imc_cpu_offline);
978 }
979
980 static int thread_imc_event_init(struct perf_event *event)
981 {
982         u32 config = event->attr.config;
983         struct task_struct *target;
984         struct imc_pmu *pmu;
985
986         if (event->attr.type != event->pmu->type)
987                 return -ENOENT;
988
989         if (!perfmon_capable())
990                 return -EACCES;
991
992         /* Sampling not supported */
993         if (event->hw.sample_period)
994                 return -EINVAL;
995
996         event->hw.idx = -1;
997         pmu = imc_event_to_pmu(event);
998
999         /* Sanity check for config offset */
1000         if (((config & IMC_EVENT_OFFSET_MASK) > pmu->counter_mem_size))
1001                 return -EINVAL;
1002
1003         target = event->hw.target;
1004         if (!target)
1005                 return -EINVAL;
1006
1007         spin_lock(&imc_global_refc.lock);
1008         /*
1009          * Check if any other trace/core imc events are running in the
1010          * system, if not set the global id to thread-imc.
1011          */
1012         if (imc_global_refc.id == 0 || imc_global_refc.id == IMC_DOMAIN_THREAD) {
1013                 imc_global_refc.id = IMC_DOMAIN_THREAD;
1014                 imc_global_refc.refc++;
1015         } else {
1016                 spin_unlock(&imc_global_refc.lock);
1017                 return -EBUSY;
1018         }
1019         spin_unlock(&imc_global_refc.lock);
1020
1021         event->pmu->task_ctx_nr = perf_sw_context;
1022         event->destroy = reset_global_refc;
1023         return 0;
1024 }
1025
1026 static bool is_thread_imc_pmu(struct perf_event *event)
1027 {
1028         if (!strncmp(event->pmu->name, "thread_imc", strlen("thread_imc")))
1029                 return true;
1030
1031         return false;
1032 }
1033
1034 static __be64 *get_event_base_addr(struct perf_event *event)
1035 {
1036         u64 addr;
1037
1038         if (is_thread_imc_pmu(event)) {
1039                 addr = (u64)per_cpu(thread_imc_mem, smp_processor_id());
1040                 return (__be64 *)(addr + (event->attr.config & IMC_EVENT_OFFSET_MASK));
1041         }
1042
1043         return (__be64 *)event->hw.event_base;
1044 }
1045
1046 static void thread_imc_pmu_start_txn(struct pmu *pmu,
1047                                      unsigned int txn_flags)
1048 {
1049         if (txn_flags & ~PERF_PMU_TXN_ADD)
1050                 return;
1051         perf_pmu_disable(pmu);
1052 }
1053
1054 static void thread_imc_pmu_cancel_txn(struct pmu *pmu)
1055 {
1056         perf_pmu_enable(pmu);
1057 }
1058
1059 static int thread_imc_pmu_commit_txn(struct pmu *pmu)
1060 {
1061         perf_pmu_enable(pmu);
1062         return 0;
1063 }
1064
1065 static u64 imc_read_counter(struct perf_event *event)
1066 {
1067         __be64 *addr;
1068         u64 data;
1069
1070         /*
1071          * In-Memory Collection (IMC) counters are free flowing counters.
1072          * So we take a snapshot of the counter value on enable and save it
1073          * to calculate the delta at later stage to present the event counter
1074          * value.
1075          */
1076         addr = get_event_base_addr(event);
1077         data = be64_to_cpu(READ_ONCE(*addr));
1078         local64_set(&event->hw.prev_count, data);
1079
1080         return data;
1081 }
1082
1083 static void imc_event_update(struct perf_event *event)
1084 {
1085         u64 counter_prev, counter_new, final_count;
1086
1087         counter_prev = local64_read(&event->hw.prev_count);
1088         counter_new = imc_read_counter(event);
1089         final_count = counter_new - counter_prev;
1090
1091         /* Update the delta to the event count */
1092         local64_add(final_count, &event->count);
1093 }
1094
/*
 * Start counting: the counters are free flowing (HW or microcode keeps
 * adding to the counter in memory), so starting only means recording the
 * current value as the baseline for later delta computation.
 */
static void imc_event_start(struct perf_event *event, int flags)
{
	(void)imc_read_counter(event);
}
1105
/* Stop counting: snapshot the counter and add the delta to the event. */
static void imc_event_stop(struct perf_event *event, int flags)
{
	imc_event_update(event);
}
1114
1115 static int imc_event_add(struct perf_event *event, int flags)
1116 {
1117         if (flags & PERF_EF_START)
1118                 imc_event_start(event, flags);
1119
1120         return 0;
1121 }
1122
1123 static int thread_imc_event_add(struct perf_event *event, int flags)
1124 {
1125         int core_id;
1126         struct imc_pmu_ref *ref;
1127         u64 ldbar_value, *local_mem = per_cpu(thread_imc_mem, smp_processor_id());
1128
1129         if (flags & PERF_EF_START)
1130                 imc_event_start(event, flags);
1131
1132         if (!is_core_imc_mem_inited(smp_processor_id()))
1133                 return -EINVAL;
1134
1135         core_id = smp_processor_id() / threads_per_core;
1136         ldbar_value = ((u64)local_mem & THREAD_IMC_LDBAR_MASK) | THREAD_IMC_ENABLE;
1137         mtspr(SPRN_LDBAR, ldbar_value);
1138
1139         /*
1140          * imc pmus are enabled only when it is used.
1141          * See if this is triggered for the first time.
1142          * If yes, take the lock and enable the counters.
1143          * If not, just increment the count in ref count struct.
1144          */
1145         ref = &core_imc_refc[core_id];
1146         if (!ref)
1147                 return -EINVAL;
1148
1149         spin_lock(&ref->lock);
1150         if (ref->refc == 0) {
1151                 if (opal_imc_counters_start(OPAL_IMC_COUNTERS_CORE,
1152                     get_hard_smp_processor_id(smp_processor_id()))) {
1153                         spin_unlock(&ref->lock);
1154                         pr_err("thread-imc: Unable to start the counter\
1155                                 for core %d\n", core_id);
1156                         return -EINVAL;
1157                 }
1158         }
1159         ++ref->refc;
1160         spin_unlock(&ref->lock);
1161         return 0;
1162 }
1163
1164 static void thread_imc_event_del(struct perf_event *event, int flags)
1165 {
1166
1167         int core_id;
1168         struct imc_pmu_ref *ref;
1169
1170         core_id = smp_processor_id() / threads_per_core;
1171         ref = &core_imc_refc[core_id];
1172         if (!ref) {
1173                 pr_debug("imc: Failed to get event reference count\n");
1174                 return;
1175         }
1176
1177         spin_lock(&ref->lock);
1178         ref->refc--;
1179         if (ref->refc == 0) {
1180                 if (opal_imc_counters_stop(OPAL_IMC_COUNTERS_CORE,
1181                     get_hard_smp_processor_id(smp_processor_id()))) {
1182                         spin_unlock(&ref->lock);
1183                         pr_err("thread-imc: Unable to stop the counters\
1184                                 for core %d\n", core_id);
1185                         return;
1186                 }
1187         } else if (ref->refc < 0) {
1188                 ref->refc = 0;
1189         }
1190         spin_unlock(&ref->lock);
1191
1192         /* Set bit 0 of LDBAR to zero, to stop posting updates to memory */
1193         mtspr(SPRN_LDBAR, (mfspr(SPRN_LDBAR) & (~(1UL << 63))));
1194
1195         /*
1196          * Take a snapshot and calculate the delta and update
1197          * the event counter values.
1198          */
1199         imc_event_update(event);
1200 }
1201
1202 /*
1203  * Allocate a page of memory for each cpu, and load LDBAR with 0.
1204  */
1205 static int trace_imc_mem_alloc(int cpu_id, int size)
1206 {
1207         u64 *local_mem = per_cpu(trace_imc_mem, cpu_id);
1208         int phys_id = cpu_to_node(cpu_id), rc = 0;
1209         int core_id = (cpu_id / threads_per_core);
1210
1211         if (!local_mem) {
1212                 struct page *page;
1213
1214                 page = alloc_pages_node(phys_id,
1215                                 GFP_KERNEL | __GFP_ZERO | __GFP_THISNODE |
1216                                 __GFP_NOWARN, get_order(size));
1217                 if (!page)
1218                         return -ENOMEM;
1219                 local_mem = page_address(page);
1220                 per_cpu(trace_imc_mem, cpu_id) = local_mem;
1221
1222                 /* Initialise the counters for trace mode */
1223                 rc = opal_imc_counters_init(OPAL_IMC_COUNTERS_TRACE, __pa((void *)local_mem),
1224                                             get_hard_smp_processor_id(cpu_id));
1225                 if (rc) {
1226                         pr_info("IMC:opal init failed for trace imc\n");
1227                         return rc;
1228                 }
1229         }
1230
1231         trace_imc_refc[core_id].id = core_id;
1232         spin_lock_init(&trace_imc_refc[core_id].lock);
1233
1234         mtspr(SPRN_LDBAR, 0);
1235         return 0;
1236 }
1237
1238 static int ppc_trace_imc_cpu_online(unsigned int cpu)
1239 {
1240         return trace_imc_mem_alloc(cpu, trace_imc_mem_size);
1241 }
1242
1243 static int ppc_trace_imc_cpu_offline(unsigned int cpu)
1244 {
1245         /*
1246          * No need to set bit 0 of LDBAR to zero, as
1247          * it is set to zero for imc trace-mode
1248          *
1249          * Reduce the refc if any trace-imc event running
1250          * on this cpu.
1251          */
1252         spin_lock(&imc_global_refc.lock);
1253         if (imc_global_refc.id == IMC_DOMAIN_TRACE)
1254                 imc_global_refc.refc--;
1255         spin_unlock(&imc_global_refc.lock);
1256
1257         return 0;
1258 }
1259
1260 static int trace_imc_cpu_init(void)
1261 {
1262         return cpuhp_setup_state(CPUHP_AP_PERF_POWERPC_TRACE_IMC_ONLINE,
1263                           "perf/powerpc/imc_trace:online",
1264                           ppc_trace_imc_cpu_online,
1265                           ppc_trace_imc_cpu_offline);
1266 }
1267
1268 static u64 get_trace_imc_event_base_addr(void)
1269 {
1270         return (u64)per_cpu(trace_imc_mem, smp_processor_id());
1271 }
1272
1273 /*
1274  * Function to parse trace-imc data obtained
1275  * and to prepare the perf sample.
1276  */
1277 static int trace_imc_prepare_sample(struct trace_imc_data *mem,
1278                                     struct perf_sample_data *data,
1279                                     u64 *prev_tb,
1280                                     struct perf_event_header *header,
1281                                     struct perf_event *event)
1282 {
1283         /* Sanity checks for a valid record */
1284         if (be64_to_cpu(READ_ONCE(mem->tb1)) > *prev_tb)
1285                 *prev_tb = be64_to_cpu(READ_ONCE(mem->tb1));
1286         else
1287                 return -EINVAL;
1288
1289         if ((be64_to_cpu(READ_ONCE(mem->tb1)) & IMC_TRACE_RECORD_TB1_MASK) !=
1290                          be64_to_cpu(READ_ONCE(mem->tb2)))
1291                 return -EINVAL;
1292
1293         /* Prepare perf sample */
1294         data->ip =  be64_to_cpu(READ_ONCE(mem->ip));
1295         data->period = event->hw.last_period;
1296
1297         header->type = PERF_RECORD_SAMPLE;
1298         header->size = sizeof(*header) + event->header_size;
1299         header->misc = 0;
1300
1301         if (cpu_has_feature(CPU_FTR_ARCH_31)) {
1302                 switch (IMC_TRACE_RECORD_VAL_HVPR(be64_to_cpu(READ_ONCE(mem->val)))) {
1303                 case 0:/* when MSR HV and PR not set in the trace-record */
1304                         header->misc |= PERF_RECORD_MISC_GUEST_KERNEL;
1305                         break;
1306                 case 1: /* MSR HV is 0 and PR is 1 */
1307                         header->misc |= PERF_RECORD_MISC_GUEST_USER;
1308                         break;
1309                 case 2: /* MSR HV is 1 and PR is 0 */
1310                         header->misc |= PERF_RECORD_MISC_KERNEL;
1311                         break;
1312                 case 3: /* MSR HV is 1 and PR is 1 */
1313                         header->misc |= PERF_RECORD_MISC_USER;
1314                         break;
1315                 default:
1316                         pr_info("IMC: Unable to set the flag based on MSR bits\n");
1317                         break;
1318                 }
1319         } else {
1320                 if (is_kernel_addr(data->ip))
1321                         header->misc |= PERF_RECORD_MISC_KERNEL;
1322                 else
1323                         header->misc |= PERF_RECORD_MISC_USER;
1324         }
1325         perf_event_header__init_id(header, data, event);
1326
1327         return 0;
1328 }
1329
1330 static void dump_trace_imc_data(struct perf_event *event)
1331 {
1332         struct trace_imc_data *mem;
1333         int i, ret;
1334         u64 prev_tb = 0;
1335
1336         mem = (struct trace_imc_data *)get_trace_imc_event_base_addr();
1337         for (i = 0; i < (trace_imc_mem_size / sizeof(struct trace_imc_data));
1338                 i++, mem++) {
1339                 struct perf_sample_data data;
1340                 struct perf_event_header header;
1341
1342                 ret = trace_imc_prepare_sample(mem, &data, &prev_tb, &header, event);
1343                 if (ret) /* Exit, if not a valid record */
1344                         break;
1345                 else {
1346                         /* If this is a valid record, create the sample */
1347                         struct perf_output_handle handle;
1348
1349                         if (perf_output_begin(&handle, &data, event, header.size))
1350                                 return;
1351
1352                         perf_output_sample(&handle, &header, &data, event);
1353                         perf_output_end(&handle);
1354                 }
1355         }
1356 }
1357
1358 static int trace_imc_event_add(struct perf_event *event, int flags)
1359 {
1360         int core_id = smp_processor_id() / threads_per_core;
1361         struct imc_pmu_ref *ref = NULL;
1362         u64 local_mem, ldbar_value;
1363
1364         /* Set trace-imc bit in ldbar and load ldbar with per-thread memory address */
1365         local_mem = get_trace_imc_event_base_addr();
1366         ldbar_value = ((u64)local_mem & THREAD_IMC_LDBAR_MASK) | TRACE_IMC_ENABLE;
1367
1368         /* trace-imc reference count */
1369         if (trace_imc_refc)
1370                 ref = &trace_imc_refc[core_id];
1371         if (!ref) {
1372                 pr_debug("imc: Failed to get the event reference count\n");
1373                 return -EINVAL;
1374         }
1375
1376         mtspr(SPRN_LDBAR, ldbar_value);
1377         spin_lock(&ref->lock);
1378         if (ref->refc == 0) {
1379                 if (opal_imc_counters_start(OPAL_IMC_COUNTERS_TRACE,
1380                                 get_hard_smp_processor_id(smp_processor_id()))) {
1381                         spin_unlock(&ref->lock);
1382                         pr_err("trace-imc: Unable to start the counters for core %d\n", core_id);
1383                         return -EINVAL;
1384                 }
1385         }
1386         ++ref->refc;
1387         spin_unlock(&ref->lock);
1388         return 0;
1389 }
1390
/* ->read() hook for trace-imc: intentionally does nothing. */
static void trace_imc_event_read(struct perf_event *event)
{
}
1395
1396 static void trace_imc_event_stop(struct perf_event *event, int flags)
1397 {
1398         u64 local_mem = get_trace_imc_event_base_addr();
1399         dump_trace_imc_data(event);
1400         memset((void *)local_mem, 0, sizeof(u64));
1401 }
1402
/* ->start() hook for trace-imc: intentionally does nothing. */
static void trace_imc_event_start(struct perf_event *event, int flags)
{
}
1407
1408 static void trace_imc_event_del(struct perf_event *event, int flags)
1409 {
1410         int core_id = smp_processor_id() / threads_per_core;
1411         struct imc_pmu_ref *ref = NULL;
1412
1413         if (trace_imc_refc)
1414                 ref = &trace_imc_refc[core_id];
1415         if (!ref) {
1416                 pr_debug("imc: Failed to get event reference count\n");
1417                 return;
1418         }
1419
1420         spin_lock(&ref->lock);
1421         ref->refc--;
1422         if (ref->refc == 0) {
1423                 if (opal_imc_counters_stop(OPAL_IMC_COUNTERS_TRACE,
1424                                 get_hard_smp_processor_id(smp_processor_id()))) {
1425                         spin_unlock(&ref->lock);
1426                         pr_err("trace-imc: Unable to stop the counters for core %d\n", core_id);
1427                         return;
1428                 }
1429         } else if (ref->refc < 0) {
1430                 ref->refc = 0;
1431         }
1432         spin_unlock(&ref->lock);
1433
1434         trace_imc_event_stop(event, flags);
1435 }
1436
1437 static int trace_imc_event_init(struct perf_event *event)
1438 {
1439         if (event->attr.type != event->pmu->type)
1440                 return -ENOENT;
1441
1442         if (!perfmon_capable())
1443                 return -EACCES;
1444
1445         /* Return if this is a couting event */
1446         if (event->attr.sample_period == 0)
1447                 return -ENOENT;
1448
1449         /*
1450          * Take the global lock, and make sure
1451          * no other thread is running any core/thread imc
1452          * events
1453          */
1454         spin_lock(&imc_global_refc.lock);
1455         if (imc_global_refc.id == 0 || imc_global_refc.id == IMC_DOMAIN_TRACE) {
1456                 /*
1457                  * No core/thread imc events are running in the
1458                  * system, so set the refc.id to trace-imc.
1459                  */
1460                 imc_global_refc.id = IMC_DOMAIN_TRACE;
1461                 imc_global_refc.refc++;
1462         } else {
1463                 spin_unlock(&imc_global_refc.lock);
1464                 return -EBUSY;
1465         }
1466         spin_unlock(&imc_global_refc.lock);
1467
1468         event->hw.idx = -1;
1469
1470         /*
1471          * There can only be a single PMU for perf_hw_context events which is assigned to
1472          * core PMU. Hence use "perf_sw_context" for trace_imc.
1473          */
1474         event->pmu->task_ctx_nr = perf_sw_context;
1475         event->destroy = reset_global_refc;
1476         return 0;
1477 }
1478
1479 /* update_pmu_ops : Populate the appropriate operations for "pmu" */
1480 static int update_pmu_ops(struct imc_pmu *pmu)
1481 {
1482         pmu->pmu.task_ctx_nr = perf_invalid_context;
1483         pmu->pmu.add = imc_event_add;
1484         pmu->pmu.del = imc_event_stop;
1485         pmu->pmu.start = imc_event_start;
1486         pmu->pmu.stop = imc_event_stop;
1487         pmu->pmu.read = imc_event_update;
1488         pmu->pmu.attr_groups = pmu->attr_groups;
1489         pmu->pmu.capabilities = PERF_PMU_CAP_NO_EXCLUDE;
1490         pmu->attr_groups[IMC_FORMAT_ATTR] = &imc_format_group;
1491
1492         switch (pmu->domain) {
1493         case IMC_DOMAIN_NEST:
1494                 pmu->pmu.event_init = nest_imc_event_init;
1495                 pmu->attr_groups[IMC_CPUMASK_ATTR] = &imc_pmu_cpumask_attr_group;
1496                 break;
1497         case IMC_DOMAIN_CORE:
1498                 pmu->pmu.event_init = core_imc_event_init;
1499                 pmu->attr_groups[IMC_CPUMASK_ATTR] = &imc_pmu_cpumask_attr_group;
1500                 break;
1501         case IMC_DOMAIN_THREAD:
1502                 pmu->pmu.event_init = thread_imc_event_init;
1503                 pmu->pmu.add = thread_imc_event_add;
1504                 pmu->pmu.del = thread_imc_event_del;
1505                 pmu->pmu.start_txn = thread_imc_pmu_start_txn;
1506                 pmu->pmu.cancel_txn = thread_imc_pmu_cancel_txn;
1507                 pmu->pmu.commit_txn = thread_imc_pmu_commit_txn;
1508                 break;
1509         case IMC_DOMAIN_TRACE:
1510                 pmu->pmu.event_init = trace_imc_event_init;
1511                 pmu->pmu.add = trace_imc_event_add;
1512                 pmu->pmu.del = trace_imc_event_del;
1513                 pmu->pmu.start = trace_imc_event_start;
1514                 pmu->pmu.stop = trace_imc_event_stop;
1515                 pmu->pmu.read = trace_imc_event_read;
1516                 pmu->attr_groups[IMC_FORMAT_ATTR] = &trace_imc_format_group;
1517                 break;
1518         default:
1519                 break;
1520         }
1521
1522         return 0;
1523 }
1524
1525 /* init_nest_pmu_ref: Initialize the imc_pmu_ref struct for all the nodes */
1526 static int init_nest_pmu_ref(void)
1527 {
1528         int nid, i, cpu;
1529
1530         nest_imc_refc = kcalloc(num_possible_nodes(), sizeof(*nest_imc_refc),
1531                                                                 GFP_KERNEL);
1532
1533         if (!nest_imc_refc)
1534                 return -ENOMEM;
1535
1536         i = 0;
1537         for_each_node(nid) {
1538                 /*
1539                  * Take the lock to avoid races while tracking the number of
1540                  * sessions using the chip's nest pmu units.
1541                  */
1542                 spin_lock_init(&nest_imc_refc[i].lock);
1543
1544                 /*
1545                  * Loop to init the "id" with the node_id. Variable "i" initialized to
1546                  * 0 and will be used as index to the array. "i" will not go off the
1547                  * end of the array since the "for_each_node" loops for "N_POSSIBLE"
1548                  * nodes only.
1549                  */
1550                 nest_imc_refc[i++].id = nid;
1551         }
1552
1553         /*
1554          * Loop to init the per_cpu "local_nest_imc_refc" with the proper
1555          * "nest_imc_refc" index. This makes get_nest_pmu_ref() alot simple.
1556          */
1557         for_each_possible_cpu(cpu) {
1558                 nid = cpu_to_node(cpu);
1559                 for (i = 0; i < num_possible_nodes(); i++) {
1560                         if (nest_imc_refc[i].id == nid) {
1561                                 per_cpu(local_nest_imc_refc, cpu) = &nest_imc_refc[i];
1562                                 break;
1563                         }
1564                 }
1565         }
1566         return 0;
1567 }
1568
1569 static void cleanup_all_core_imc_memory(void)
1570 {
1571         int i, nr_cores = DIV_ROUND_UP(num_possible_cpus(), threads_per_core);
1572         struct imc_mem_info *ptr = core_imc_pmu->mem_info;
1573         int size = core_imc_pmu->counter_mem_size;
1574
1575         /* mem_info will never be NULL */
1576         for (i = 0; i < nr_cores; i++) {
1577                 if (ptr[i].vbase)
1578                         free_pages((u64)ptr[i].vbase, get_order(size));
1579         }
1580
1581         kfree(ptr);
1582         kfree(core_imc_refc);
1583 }
1584
1585 static void thread_imc_ldbar_disable(void *dummy)
1586 {
1587         /*
1588          * By setting 0th bit of LDBAR to zero, we disable thread-imc
1589          * updates to memory.
1590          */
1591         mtspr(SPRN_LDBAR, (mfspr(SPRN_LDBAR) & (~(1UL << 63))));
1592 }
1593
1594 void thread_imc_disable(void)
1595 {
1596         on_each_cpu(thread_imc_ldbar_disable, NULL, 1);
1597 }
1598
1599 static void cleanup_all_thread_imc_memory(void)
1600 {
1601         int i, order = get_order(thread_imc_mem_size);
1602
1603         for_each_online_cpu(i) {
1604                 if (per_cpu(thread_imc_mem, i))
1605                         free_pages((u64)per_cpu(thread_imc_mem, i), order);
1606
1607         }
1608 }
1609
1610 static void cleanup_all_trace_imc_memory(void)
1611 {
1612         int i, order = get_order(trace_imc_mem_size);
1613
1614         for_each_online_cpu(i) {
1615                 if (per_cpu(trace_imc_mem, i))
1616                         free_pages((u64)per_cpu(trace_imc_mem, i), order);
1617
1618         }
1619         kfree(trace_imc_refc);
1620 }
1621
1622 /* Function to free the attr_groups which are dynamically allocated */
1623 static void imc_common_mem_free(struct imc_pmu *pmu_ptr)
1624 {
1625         if (pmu_ptr->attr_groups[IMC_EVENT_ATTR])
1626                 kfree(pmu_ptr->attr_groups[IMC_EVENT_ATTR]->attrs);
1627         kfree(pmu_ptr->attr_groups[IMC_EVENT_ATTR]);
1628 }
1629
1630 /*
1631  * Common function to unregister cpu hotplug callback and
1632  * free the memory.
1633  * TODO: Need to handle pmu unregistering, which will be
1634  * done in followup series.
1635  */
1636 static void imc_common_cpuhp_mem_free(struct imc_pmu *pmu_ptr)
1637 {
1638         if (pmu_ptr->domain == IMC_DOMAIN_NEST) {
1639                 mutex_lock(&nest_init_lock);
1640                 if (nest_pmus == 1) {
1641                         cpuhp_remove_state(CPUHP_AP_PERF_POWERPC_NEST_IMC_ONLINE);
1642                         kfree(nest_imc_refc);
1643                         kfree(per_nest_pmu_arr);
1644                         per_nest_pmu_arr = NULL;
1645                 }
1646
1647                 if (nest_pmus > 0)
1648                         nest_pmus--;
1649                 mutex_unlock(&nest_init_lock);
1650         }
1651
1652         /* Free core_imc memory */
1653         if (pmu_ptr->domain == IMC_DOMAIN_CORE) {
1654                 cpuhp_remove_state(CPUHP_AP_PERF_POWERPC_CORE_IMC_ONLINE);
1655                 cleanup_all_core_imc_memory();
1656         }
1657
1658         /* Free thread_imc memory */
1659         if (pmu_ptr->domain == IMC_DOMAIN_THREAD) {
1660                 cpuhp_remove_state(CPUHP_AP_PERF_POWERPC_THREAD_IMC_ONLINE);
1661                 cleanup_all_thread_imc_memory();
1662         }
1663
1664         if (pmu_ptr->domain == IMC_DOMAIN_TRACE) {
1665                 cpuhp_remove_state(CPUHP_AP_PERF_POWERPC_TRACE_IMC_ONLINE);
1666                 cleanup_all_trace_imc_memory();
1667         }
1668 }
1669
1670 /*
1671  * Function to unregister thread-imc if core-imc
1672  * is not registered.
1673  */
1674 void unregister_thread_imc(void)
1675 {
1676         imc_common_cpuhp_mem_free(thread_imc_pmu);
1677         imc_common_mem_free(thread_imc_pmu);
1678         perf_pmu_unregister(&thread_imc_pmu->pmu);
1679 }
1680
1681 /*
1682  * imc_mem_init : Function to support memory allocation for core imc.
1683  */
1684 static int imc_mem_init(struct imc_pmu *pmu_ptr, struct device_node *parent,
1685                                                                 int pmu_index)
1686 {
1687         const char *s;
1688         int nr_cores, cpu, res = -ENOMEM;
1689
1690         if (of_property_read_string(parent, "name", &s))
1691                 return -ENODEV;
1692
1693         switch (pmu_ptr->domain) {
1694         case IMC_DOMAIN_NEST:
1695                 /* Update the pmu name */
1696                 pmu_ptr->pmu.name = kasprintf(GFP_KERNEL, "%s%s_imc", "nest_", s);
1697                 if (!pmu_ptr->pmu.name)
1698                         goto err;
1699
1700                 /* Needed for hotplug/migration */
1701                 if (!per_nest_pmu_arr) {
1702                         per_nest_pmu_arr = kcalloc(get_max_nest_dev() + 1,
1703                                                 sizeof(struct imc_pmu *),
1704                                                 GFP_KERNEL);
1705                         if (!per_nest_pmu_arr)
1706                                 goto err;
1707                 }
1708                 per_nest_pmu_arr[pmu_index] = pmu_ptr;
1709                 break;
1710         case IMC_DOMAIN_CORE:
1711                 /* Update the pmu name */
1712                 pmu_ptr->pmu.name = kasprintf(GFP_KERNEL, "%s%s", s, "_imc");
1713                 if (!pmu_ptr->pmu.name)
1714                         goto err;
1715
1716                 nr_cores = DIV_ROUND_UP(num_possible_cpus(), threads_per_core);
1717                 pmu_ptr->mem_info = kcalloc(nr_cores, sizeof(struct imc_mem_info),
1718                                                                 GFP_KERNEL);
1719
1720                 if (!pmu_ptr->mem_info)
1721                         goto err;
1722
1723                 core_imc_refc = kcalloc(nr_cores, sizeof(struct imc_pmu_ref),
1724                                                                 GFP_KERNEL);
1725
1726                 if (!core_imc_refc) {
1727                         kfree(pmu_ptr->mem_info);
1728                         goto err;
1729                 }
1730
1731                 core_imc_pmu = pmu_ptr;
1732                 break;
1733         case IMC_DOMAIN_THREAD:
1734                 /* Update the pmu name */
1735                 pmu_ptr->pmu.name = kasprintf(GFP_KERNEL, "%s%s", s, "_imc");
1736                 if (!pmu_ptr->pmu.name)
1737                         goto err;
1738
1739                 thread_imc_mem_size = pmu_ptr->counter_mem_size;
1740                 for_each_online_cpu(cpu) {
1741                         res = thread_imc_mem_alloc(cpu, pmu_ptr->counter_mem_size);
1742                         if (res) {
1743                                 cleanup_all_thread_imc_memory();
1744                                 goto err;
1745                         }
1746                 }
1747
1748                 thread_imc_pmu = pmu_ptr;
1749                 break;
1750         case IMC_DOMAIN_TRACE:
1751                 /* Update the pmu name */
1752                 pmu_ptr->pmu.name = kasprintf(GFP_KERNEL, "%s%s", s, "_imc");
1753                 if (!pmu_ptr->pmu.name)
1754                         return -ENOMEM;
1755
1756                 nr_cores = DIV_ROUND_UP(num_possible_cpus(), threads_per_core);
1757                 trace_imc_refc = kcalloc(nr_cores, sizeof(struct imc_pmu_ref),
1758                                                                 GFP_KERNEL);
1759                 if (!trace_imc_refc)
1760                         return -ENOMEM;
1761
1762                 trace_imc_mem_size = pmu_ptr->counter_mem_size;
1763                 for_each_online_cpu(cpu) {
1764                         res = trace_imc_mem_alloc(cpu, trace_imc_mem_size);
1765                         if (res) {
1766                                 cleanup_all_trace_imc_memory();
1767                                 goto err;
1768                         }
1769                 }
1770                 break;
1771         default:
1772                 return -EINVAL;
1773         }
1774
1775         return 0;
1776 err:
1777         return res;
1778 }
1779
1780 /*
1781  * init_imc_pmu : Setup and register the IMC pmu device.
1782  *
1783  * @parent:     Device tree unit node
1784  * @pmu_ptr:    memory allocated for this pmu
1785  * @pmu_idx:    Count of nest pmc registered
1786  *
1787  * init_imc_pmu() setup pmu cpumask and registers for a cpu hotplug callback.
1788  * Handles failure cases and accordingly frees memory.
1789  */
1790 int init_imc_pmu(struct device_node *parent, struct imc_pmu *pmu_ptr, int pmu_idx)
1791 {
1792         int ret;
1793
1794         ret = imc_mem_init(pmu_ptr, parent, pmu_idx);
1795         if (ret)
1796                 goto err_free_mem;
1797
1798         switch (pmu_ptr->domain) {
1799         case IMC_DOMAIN_NEST:
1800                 /*
1801                 * Nest imc pmu need only one cpu per chip, we initialize the
1802                 * cpumask for the first nest imc pmu and use the same for the
1803                 * rest. To handle the cpuhotplug callback unregister, we track
1804                 * the number of nest pmus in "nest_pmus".
1805                 */
1806                 mutex_lock(&nest_init_lock);
1807                 if (nest_pmus == 0) {
1808                         ret = init_nest_pmu_ref();
1809                         if (ret) {
1810                                 mutex_unlock(&nest_init_lock);
1811                                 kfree(per_nest_pmu_arr);
1812                                 per_nest_pmu_arr = NULL;
1813                                 goto err_free_mem;
1814                         }
1815                         /* Register for cpu hotplug notification. */
1816                         ret = nest_pmu_cpumask_init();
1817                         if (ret) {
1818                                 mutex_unlock(&nest_init_lock);
1819                                 kfree(nest_imc_refc);
1820                                 kfree(per_nest_pmu_arr);
1821                                 per_nest_pmu_arr = NULL;
1822                                 goto err_free_mem;
1823                         }
1824                 }
1825                 nest_pmus++;
1826                 mutex_unlock(&nest_init_lock);
1827                 break;
1828         case IMC_DOMAIN_CORE:
1829                 ret = core_imc_pmu_cpumask_init();
1830                 if (ret) {
1831                         cleanup_all_core_imc_memory();
1832                         goto err_free_mem;
1833                 }
1834
1835                 break;
1836         case IMC_DOMAIN_THREAD:
1837                 ret = thread_imc_cpu_init();
1838                 if (ret) {
1839                         cleanup_all_thread_imc_memory();
1840                         goto err_free_mem;
1841                 }
1842
1843                 break;
1844         case IMC_DOMAIN_TRACE:
1845                 ret = trace_imc_cpu_init();
1846                 if (ret) {
1847                         cleanup_all_trace_imc_memory();
1848                         goto err_free_mem;
1849                 }
1850
1851                 break;
1852         default:
1853                 return  -EINVAL;        /* Unknown domain */
1854         }
1855
1856         ret = update_events_in_group(parent, pmu_ptr);
1857         if (ret)
1858                 goto err_free_cpuhp_mem;
1859
1860         ret = update_pmu_ops(pmu_ptr);
1861         if (ret)
1862                 goto err_free_cpuhp_mem;
1863
1864         ret = perf_pmu_register(&pmu_ptr->pmu, pmu_ptr->pmu.name, -1);
1865         if (ret)
1866                 goto err_free_cpuhp_mem;
1867
1868         pr_debug("%s performance monitor hardware support registered\n",
1869                                                         pmu_ptr->pmu.name);
1870
1871         return 0;
1872
1873 err_free_cpuhp_mem:
1874         imc_common_cpuhp_mem_free(pmu_ptr);
1875 err_free_mem:
1876         imc_common_mem_free(pmu_ptr);
1877         return ret;
1878 }