1 // SPDX-License-Identifier: GPL-2.0-or-later
2 /*
3  * In-Memory Collection (IMC) Performance Monitor counter support.
4  *
5  * Copyright (C) 2017 Madhavan Srinivasan, IBM Corporation.
6  *           (C) 2017 Anju T Sudhakar, IBM Corporation.
7  *           (C) 2017 Hemant K Shaw, IBM Corporation.
8  */
9 #include <linux/of.h>
10 #include <linux/perf_event.h>
11 #include <linux/slab.h>
12 #include <asm/opal.h>
13 #include <asm/imc-pmu.h>
14 #include <asm/cputhreads.h>
15 #include <asm/smp.h>
16 #include <linux/string.h>
17
18 /* Nest IMC data structures and variables */
19
20 /*
21  * Used to avoid races in counting the nest-pmu units during hotplug
22  * register and unregister
23  */
24 static DEFINE_MUTEX(nest_init_lock);
25 static DEFINE_PER_CPU(struct imc_pmu_ref *, local_nest_imc_refc);
26 static struct imc_pmu **per_nest_pmu_arr;
27 static cpumask_t nest_imc_cpumask;
28 static struct imc_pmu_ref *nest_imc_refc;
29 static int nest_pmus;
30
31 /* Core IMC data structures and variables */
32
33 static cpumask_t core_imc_cpumask;
34 static struct imc_pmu_ref *core_imc_refc;
35 static struct imc_pmu *core_imc_pmu;
36
37 /* Thread IMC data structures and variables */
38
39 static DEFINE_PER_CPU(u64 *, thread_imc_mem);
40 static struct imc_pmu *thread_imc_pmu;
41 static int thread_imc_mem_size;
42
43 /* Trace IMC data structures */
44 static DEFINE_PER_CPU(u64 *, trace_imc_mem);
45 static struct imc_pmu_ref *trace_imc_refc;
46 static int trace_imc_mem_size;
47
48 /*
49  * Global data structure used to avoid races between thread,
50  * core and trace-imc
51  */
52 static struct imc_pmu_ref imc_global_refc = {
53         .lock = __MUTEX_INITIALIZER(imc_global_refc.lock),
54         .id = 0,
55         .refc = 0,
56 };
57
58 static struct imc_pmu *imc_event_to_pmu(struct perf_event *event)
59 {
60         return container_of(event->pmu, struct imc_pmu, pmu);
61 }
62
63 PMU_FORMAT_ATTR(event, "config:0-61");
64 PMU_FORMAT_ATTR(offset, "config:0-31");
65 PMU_FORMAT_ATTR(rvalue, "config:32");
66 PMU_FORMAT_ATTR(mode, "config:33-40");
67 static struct attribute *imc_format_attrs[] = {
68         &format_attr_event.attr,
69         &format_attr_offset.attr,
70         &format_attr_rvalue.attr,
71         &format_attr_mode.attr,
72         NULL,
73 };
74
75 static const struct attribute_group imc_format_group = {
76         .name = "format",
77         .attrs = imc_format_attrs,
78 };
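/*
 * Illustrative use of the format attributes above (the PMU instance
 * name and the offset value are hypothetical, not taken from this
 * file): the perf tool reads these sysfs entries and packs the fields
 * into perf_event_attr.config, so a command line such as
 *
 *   # perf stat -e nest_mcs01_imc/event=0x118/ -a sleep 1
 *
 * ends up with the counter offset 0x118 in config bits 0-61.
 */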
79
80 /* Format attribute for imc trace-mode */
81 PMU_FORMAT_ATTR(cpmc_reserved, "config:0-19");
82 PMU_FORMAT_ATTR(cpmc_event, "config:20-27");
83 PMU_FORMAT_ATTR(cpmc_samplesel, "config:28-29");
84 PMU_FORMAT_ATTR(cpmc_load, "config:30-61");
85 static struct attribute *trace_imc_format_attrs[] = {
86         &format_attr_event.attr,
87         &format_attr_cpmc_reserved.attr,
88         &format_attr_cpmc_event.attr,
89         &format_attr_cpmc_samplesel.attr,
90         &format_attr_cpmc_load.attr,
91         NULL,
92 };
93
94 static const struct attribute_group trace_imc_format_group = {
95         .name = "format",
96         .attrs = trace_imc_format_attrs,
97 };
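/*
 * Sketch of how the trace-mode format above is exercised (the event
 * name is assumed to exist under the trace_imc PMU; treat it as an
 * example only): trace-imc is used as a sampling PMU, e.g.
 *
 *   # perf record -e trace_imc/trace_cycles/ -C 0 sleep 1
 *
 * with the cpmc_* fields carving perf_event_attr.config into the CPMC
 * selection, sample-select and load value.
 */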
98
99 /* Get the cpumask printed to a buffer "buf" */
100 static ssize_t imc_pmu_cpumask_get_attr(struct device *dev,
101                                         struct device_attribute *attr,
102                                         char *buf)
103 {
104         struct pmu *pmu = dev_get_drvdata(dev);
105         struct imc_pmu *imc_pmu = container_of(pmu, struct imc_pmu, pmu);
106         cpumask_t *active_mask;
107
108         switch(imc_pmu->domain){
109         case IMC_DOMAIN_NEST:
110                 active_mask = &nest_imc_cpumask;
111                 break;
112         case IMC_DOMAIN_CORE:
113                 active_mask = &core_imc_cpumask;
114                 break;
115         default:
116                 return 0;
117         }
118
119         return cpumap_print_to_pagebuf(true, buf, active_mask);
120 }
121
122 static DEVICE_ATTR(cpumask, S_IRUGO, imc_pmu_cpumask_get_attr, NULL);
123
124 static struct attribute *imc_pmu_cpumask_attrs[] = {
125         &dev_attr_cpumask.attr,
126         NULL,
127 };
128
129 static const struct attribute_group imc_pmu_cpumask_attr_group = {
130         .attrs = imc_pmu_cpumask_attrs,
131 };
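/*
 * The "cpumask" attribute exported above advertises the cpu designated
 * for counter collection on each chip (nest) or core (core-imc); perf
 * uses it to decide where to open system-wide events. A sketch of what
 * a read could look like (PMU name and value are illustrative):
 *
 *   $ cat /sys/bus/event_source/devices/nest_mcs01_imc/cpumask
 *   0,88
 */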
132
133 /* device_str_attr_create : Populate event "name" and string "str" in attribute */
134 static struct attribute *device_str_attr_create(const char *name, const char *str)
135 {
136         struct perf_pmu_events_attr *attr;
137
138         attr = kzalloc(sizeof(*attr), GFP_KERNEL);
139         if (!attr)
140                 return NULL;
141         sysfs_attr_init(&attr->attr.attr);
142
143         attr->event_str = str;
144         attr->attr.attr.name = name;
145         attr->attr.attr.mode = 0444;
146         attr->attr.show = perf_event_sysfs_show;
147
148         return &attr->attr.attr;
149 }
150
151 static int imc_parse_event(struct device_node *np, const char *scale,
152                                   const char *unit, const char *prefix,
153                                   u32 base, struct imc_events *event)
154 {
155         const char *s;
156         u32 reg;
157
158         if (of_property_read_u32(np, "reg", &reg))
159                 goto error;
160         /* Add the base_reg value to the "reg" */
161         event->value = base + reg;
162
163         if (of_property_read_string(np, "event-name", &s))
164                 goto error;
165
166         event->name = kasprintf(GFP_KERNEL, "%s%s", prefix, s);
167         if (!event->name)
168                 goto error;
169
170         if (of_property_read_string(np, "scale", &s))
171                 s = scale;
172
173         if (s) {
174                 event->scale = kstrdup(s, GFP_KERNEL);
175                 if (!event->scale)
176                         goto error;
177         }
178
179         if (of_property_read_string(np, "unit", &s))
180                 s = unit;
181
182         if (s) {
183                 event->unit = kstrdup(s, GFP_KERNEL);
184                 if (!event->unit)
185                         goto error;
186         }
187
188         return 0;
189 error:
190         kfree(event->unit);
191         kfree(event->scale);
192         kfree(event->name);
193         return -EINVAL;
194 }
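/*
 * A sketch of the device tree event node imc_parse_event() expects
 * (the property names match the ones read above; the node name and
 * values are made up for illustration):
 *
 *   event@118 {
 *           event-name = "PM_XYZ_RD_DISP";
 *           reg = <0x118>;
 *           scale = "1.2207e-4";
 *           unit = "MiB";
 *   };
 *
 * With base 0x0 this yields event->value == 0x118, and the caller's
 * "events-prefix" string is prepended to the event name.
 */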
195
196 /*
197  * imc_free_events: Function to clean up the events list, which has
198  *                  "nr_entries" entries.
199  */
200 static void imc_free_events(struct imc_events *events, int nr_entries)
201 {
202         int i;
203
204         /* Nothing to clean, return */
205         if (!events)
206                 return;
207         for (i = 0; i < nr_entries; i++) {
208                 kfree(events[i].unit);
209                 kfree(events[i].scale);
210                 kfree(events[i].name);
211         }
212
213         kfree(events);
214 }
215
216 /*
217  * update_events_in_group: Update the "events" information in an attr_group
218  *                         and assign the attr_group to the pmu "pmu".
219  */
220 static int update_events_in_group(struct device_node *node, struct imc_pmu *pmu)
221 {
222         struct attribute_group *attr_group;
223         struct attribute **attrs, *dev_str;
224         struct device_node *np, *pmu_events;
225         u32 handle, base_reg;
226         int i = 0, j = 0, ct, ret;
227         const char *prefix, *g_scale, *g_unit;
228         const char *ev_val_str, *ev_scale_str, *ev_unit_str;
229
230         if (!of_property_read_u32(node, "events", &handle))
231                 pmu_events = of_find_node_by_phandle(handle);
232         else
233                 return 0;
234
235         /* Did not find any node with a given phandle */
236         if (!pmu_events)
237                 return 0;
238
239         /* Get a count of number of child nodes */
240         ct = of_get_child_count(pmu_events);
241
242         /* Get the event prefix */
243         if (of_property_read_string(node, "events-prefix", &prefix))
244                 return 0;
245
246         /* Get a global unit and scale data if available */
247         if (of_property_read_string(node, "scale", &g_scale))
248                 g_scale = NULL;
249
250         if (of_property_read_string(node, "unit", &g_unit))
251                 g_unit = NULL;
252
253         /* "reg" property gives out the base offset of the counters data */
254         of_property_read_u32(node, "reg", &base_reg);
255
256         /* Allocate memory for the events */
257         pmu->events = kcalloc(ct, sizeof(struct imc_events), GFP_KERNEL);
258         if (!pmu->events)
259                 return -ENOMEM;
260
261         ct = 0;
262         /* Parse the events and update the struct */
263         for_each_child_of_node(pmu_events, np) {
264                 ret = imc_parse_event(np, g_scale, g_unit, prefix, base_reg, &pmu->events[ct]);
265                 if (!ret)
266                         ct++;
267         }
268
269         /* Allocate memory for attribute group */
270         attr_group = kzalloc(sizeof(*attr_group), GFP_KERNEL);
271         if (!attr_group) {
272                 imc_free_events(pmu->events, ct);
273                 return -ENOMEM;
274         }
275
276         /*
277          * Allocate memory for attributes.
278          * Since we have the count of events for this pmu, we also allocate
279          * memory for the scale and unit attribute for now.
280          * "ct" has the total event structs added from the events-parent node.
281          * So allocate three times the "ct" (this includes event, event_scale and
282          * event_unit).
283          */
284         attrs = kcalloc(((ct * 3) + 1), sizeof(struct attribute *), GFP_KERNEL);
285         if (!attrs) {
286                 kfree(attr_group);
287                 imc_free_events(pmu->events, ct);
288                 return -ENOMEM;
289         }
290
291         attr_group->name = "events";
292         attr_group->attrs = attrs;
293         do {
294                 ev_val_str = kasprintf(GFP_KERNEL, "event=0x%x", pmu->events[i].value);
295                 dev_str = device_str_attr_create(pmu->events[i].name, ev_val_str);
296                 if (!dev_str)
297                         continue;
298
299                 attrs[j++] = dev_str;
300                 if (pmu->events[i].scale) {
301                         ev_scale_str = kasprintf(GFP_KERNEL, "%s.scale", pmu->events[i].name);
302                         dev_str = device_str_attr_create(ev_scale_str, pmu->events[i].scale);
303                         if (!dev_str)
304                                 continue;
305
306                         attrs[j++] = dev_str;
307                 }
308
309                 if (pmu->events[i].unit) {
310                         ev_unit_str = kasprintf(GFP_KERNEL, "%s.unit", pmu->events[i].name);
311                         dev_str = device_str_attr_create(ev_unit_str, pmu->events[i].unit);
312                         if (!dev_str)
313                                 continue;
314
315                         attrs[j++] = dev_str;
316                 }
317         } while (++i < ct);
318
319         /* Save the event attribute */
320         pmu->attr_groups[IMC_EVENT_ATTR] = attr_group;
321
322         return 0;
323 }
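/*
 * The "events" attribute group built above surfaces each parsed event
 * in sysfs roughly as follows (names and values are illustrative):
 *
 *   /sys/bus/event_source/devices/<pmu>/events/PM_XYZ_RD_DISP        => "event=0x118"
 *   /sys/bus/event_source/devices/<pmu>/events/PM_XYZ_RD_DISP.scale  => "1.2207e-4"
 *   /sys/bus/event_source/devices/<pmu>/events/PM_XYZ_RD_DISP.unit   => "MiB"
 */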
324
325 /* get_nest_pmu_ref: Return the imc_pmu_ref struct for the given node */
326 static struct imc_pmu_ref *get_nest_pmu_ref(int cpu)
327 {
328         return per_cpu(local_nest_imc_refc, cpu);
329 }
330
331 static void nest_change_cpu_context(int old_cpu, int new_cpu)
332 {
333         struct imc_pmu **pn = per_nest_pmu_arr;
334
335         if (old_cpu < 0 || new_cpu < 0)
336                 return;
337
338         while (*pn) {
339                 perf_pmu_migrate_context(&(*pn)->pmu, old_cpu, new_cpu);
340                 pn++;
341         }
342 }
343
344 static int ppc_nest_imc_cpu_offline(unsigned int cpu)
345 {
346         int nid, target = -1;
347         const struct cpumask *l_cpumask;
348         struct imc_pmu_ref *ref;
349
350         /*
351          * Check in the designated list for this cpu. Don't bother
352          * if not one of them.
353          */
354         if (!cpumask_test_and_clear_cpu(cpu, &nest_imc_cpumask))
355                 return 0;
356
357         /*
358          * Check whether nest_imc is registered. We could end up here if the
359          * cpuhotplug callback registration fails, i.e. the callback invokes the
360          * offline path for all successfully registered nodes. At this stage,
361          * nest_imc pmu will not be registered and we should return here.
362          *
363          * We return with a zero since this is not an offline failure. And
364          * cpuhp_setup_state() returns the actual failure reason to the caller,
365          * which in turn will call the cleanup routine.
366          */
367         if (!nest_pmus)
368                 return 0;
369
370         /*
371          * Now that this cpu is one of the designated ones, find the
372          * next cpu which is a) online and b) in the same chip.
373          */
374         nid = cpu_to_node(cpu);
375         l_cpumask = cpumask_of_node(nid);
376         target = cpumask_last(l_cpumask);
377
378         /*
379          * If the target picked above is the cpu going offline itself,
380          * check for any other possible online cpu in the chip.
381          */
382         if (unlikely(target == cpu))
383                 target = cpumask_any_but(l_cpumask, cpu);
384
385         /*
386          * Update the cpumask with the target cpu and
387          * migrate the context if needed
388          */
389         if (target >= 0 && target < nr_cpu_ids) {
390                 cpumask_set_cpu(target, &nest_imc_cpumask);
391                 nest_change_cpu_context(cpu, target);
392         } else {
393                 opal_imc_counters_stop(OPAL_IMC_COUNTERS_NEST,
394                                        get_hard_smp_processor_id(cpu));
395                 /*
396                  * If this is the last cpu in this chip, skip taking the reference
397                  * count mutex lock and just zero the reference count for this chip.
398                  */
399                 ref = get_nest_pmu_ref(cpu);
400                 if (!ref)
401                         return -EINVAL;
402
403                 ref->refc = 0;
404         }
405         return 0;
406 }
407
408 static int ppc_nest_imc_cpu_online(unsigned int cpu)
409 {
410         const struct cpumask *l_cpumask;
411         static struct cpumask tmp_mask;
412         int res;
413
414         /* Get the cpumask of this node */
415         l_cpumask = cpumask_of_node(cpu_to_node(cpu));
416
417         /*
418          * If this is not the first online CPU on this node, then
419          * just return.
420          */
421         if (cpumask_and(&tmp_mask, l_cpumask, &nest_imc_cpumask))
422                 return 0;
423
424         /*
425                  * If this is the first online cpu on this node,
426                  * disable the nest counters by making an OPAL call.
427          */
428         res = opal_imc_counters_stop(OPAL_IMC_COUNTERS_NEST,
429                                      get_hard_smp_processor_id(cpu));
430         if (res)
431                 return res;
432
433         /* Make this CPU the designated target for counter collection */
434         cpumask_set_cpu(cpu, &nest_imc_cpumask);
435         return 0;
436 }
437
438 static int nest_pmu_cpumask_init(void)
439 {
440         return cpuhp_setup_state(CPUHP_AP_PERF_POWERPC_NEST_IMC_ONLINE,
441                                  "perf/powerpc/imc:online",
442                                  ppc_nest_imc_cpu_online,
443                                  ppc_nest_imc_cpu_offline);
444 }
445
446 static void nest_imc_counters_release(struct perf_event *event)
447 {
448         int rc, node_id;
449         struct imc_pmu_ref *ref;
450
451         if (event->cpu < 0)
452                 return;
453
454         node_id = cpu_to_node(event->cpu);
455
456         /*
457          * See if we need to disable the nest PMU.
458          * If no events are currently in use, then we have to take a
459          * mutex to ensure that we don't race with another task that is
460          * enabling or disabling the nest counters.
461          */
462         ref = get_nest_pmu_ref(event->cpu);
463         if (!ref)
464                 return;
465
466         /* Take the mutex lock for this node and then decrement the reference count */
467         mutex_lock(&ref->lock);
468         if (ref->refc == 0) {
469                 /*
470                  * This can happen when a perf session is started and then all
471                  * cpus in a given node are offlined.
472                  *
473                  * In the cpuhotplug offline path, ppc_nest_imc_cpu_offline()
474                  * sets ref->refc to zero and makes an OPAL call to disable the
475                  * engine in that node when the cpu going offline is the last
476                  * cpu in the node.
477                  *
478                  */
479                 mutex_unlock(&ref->lock);
480                 return;
481         }
482         ref->refc--;
483         if (ref->refc == 0) {
484                 rc = opal_imc_counters_stop(OPAL_IMC_COUNTERS_NEST,
485                                             get_hard_smp_processor_id(event->cpu));
486                 if (rc) {
487                         mutex_unlock(&ref->lock);
488                         pr_err("nest-imc: Unable to stop the counters for core %d\n", node_id);
489                         return;
490                 }
491         } else if (ref->refc < 0) {
492                 WARN(1, "nest-imc: Invalid event reference count\n");
493                 ref->refc = 0;
494         }
495         mutex_unlock(&ref->lock);
496 }
497
498 static int nest_imc_event_init(struct perf_event *event)
499 {
500         int chip_id, rc, node_id;
501         u32 l_config, config = event->attr.config;
502         struct imc_mem_info *pcni;
503         struct imc_pmu *pmu;
504         struct imc_pmu_ref *ref;
505         bool flag = false;
506
507         if (event->attr.type != event->pmu->type)
508                 return -ENOENT;
509
510         /* Sampling not supported */
511         if (event->hw.sample_period)
512                 return -EINVAL;
513
514         if (event->cpu < 0)
515                 return -EINVAL;
516
517         pmu = imc_event_to_pmu(event);
518
519         /* Sanity check for config (event offset) */
520         if ((config & IMC_EVENT_OFFSET_MASK) > pmu->counter_mem_size)
521                 return -EINVAL;
522
523         /*
524          * Nest HW counter memory resides in a per-chip reserved memory region (HOMER).
525          * Get the base memory address for this cpu.
526          */
527         chip_id = cpu_to_chip_id(event->cpu);
528
529         /* Return, if chip_id is not valid */
530         if (chip_id < 0)
531                 return -ENODEV;
532
533         pcni = pmu->mem_info;
534         do {
535                 if (pcni->id == chip_id) {
536                         flag = true;
537                         break;
538                 }
539                 pcni++;
540         } while (pcni->vbase != 0);
541
542         if (!flag)
543                 return -ENODEV;
544
545         /*
546          * Add the event offset to the base address.
547          */
548         l_config = config & IMC_EVENT_OFFSET_MASK;
549         event->hw.event_base = (u64)pcni->vbase + l_config;
550         node_id = cpu_to_node(event->cpu);
551
552         /*
553          * Get the imc_pmu_ref struct for this node.
554          * Take the mutex lock and then increment the count of nest pmu events
555          * initialized.
556          */
557         ref = get_nest_pmu_ref(event->cpu);
558         if (!ref)
559                 return -EINVAL;
560
561         mutex_lock(&ref->lock);
562         if (ref->refc == 0) {
563                 rc = opal_imc_counters_start(OPAL_IMC_COUNTERS_NEST,
564                                              get_hard_smp_processor_id(event->cpu));
565                 if (rc) {
566                         mutex_unlock(&ref->lock);
567                         pr_err("nest-imc: Unable to start the counters for node %d\n",
568                                                                         node_id);
569                         return rc;
570                 }
571         }
572         ++ref->refc;
573         mutex_unlock(&ref->lock);
574
575         event->destroy = nest_imc_counters_release;
576         return 0;
577 }
578
579 /*
580  * core_imc_mem_init : Initializes memory for the current core.
581  *
582  * Uses alloc_pages_node() and passes the returned address as an argument to
583  * an opal call to configure the pdbar. The address is converted to a
584  * physical address before the opal call is made. This is the
585  * base address at which the core imc counters are populated.
586  */
587 static int core_imc_mem_init(int cpu, int size)
588 {
589         int nid, rc = 0, core_id = (cpu / threads_per_core);
590         struct imc_mem_info *mem_info;
591         struct page *page;
592
593         /*
594          * alloc_pages_node() will allocate memory for core in the
595          * local node only.
596          */
597         nid = cpu_to_node(cpu);
598         mem_info = &core_imc_pmu->mem_info[core_id];
599         mem_info->id = core_id;
600
601         /* We need only vbase for core counters */
602         page = alloc_pages_node(nid,
603                                 GFP_KERNEL | __GFP_ZERO | __GFP_THISNODE |
604                                 __GFP_NOWARN, get_order(size));
605         if (!page)
606                 return -ENOMEM;
607         mem_info->vbase = page_address(page);
608
609         /* Init the mutex */
610         core_imc_refc[core_id].id = core_id;
611         mutex_init(&core_imc_refc[core_id].lock);
612
613         rc = opal_imc_counters_init(OPAL_IMC_COUNTERS_CORE,
614                                 __pa((void *)mem_info->vbase),
615                                 get_hard_smp_processor_id(cpu));
616         if (rc) {
617                 free_pages((u64)mem_info->vbase, get_order(size));
618                 mem_info->vbase = NULL;
619         }
620
621         return rc;
622 }
623
624 static bool is_core_imc_mem_inited(int cpu)
625 {
626         struct imc_mem_info *mem_info;
627         int core_id = (cpu / threads_per_core);
628
629         mem_info = &core_imc_pmu->mem_info[core_id];
630         if (!mem_info->vbase)
631                 return false;
632
633         return true;
634 }
635
636 static int ppc_core_imc_cpu_online(unsigned int cpu)
637 {
638         const struct cpumask *l_cpumask;
639         static struct cpumask tmp_mask;
640         int ret = 0;
641
642         /* Get the cpumask for this core */
643         l_cpumask = cpu_sibling_mask(cpu);
644
645         /* If a cpu for this core is already set, then don't do anything */
646         if (cpumask_and(&tmp_mask, l_cpumask, &core_imc_cpumask))
647                 return 0;
648
649         if (!is_core_imc_mem_inited(cpu)) {
650                 ret = core_imc_mem_init(cpu, core_imc_pmu->counter_mem_size);
651                 if (ret) {
652                         pr_info("core_imc memory allocation for cpu %d failed\n", cpu);
653                         return ret;
654                 }
655         }
656
657         /* set the cpu in the mask */
658         cpumask_set_cpu(cpu, &core_imc_cpumask);
659         return 0;
660 }
661
662 static int ppc_core_imc_cpu_offline(unsigned int cpu)
663 {
664         unsigned int core_id;
665         int ncpu;
666         struct imc_pmu_ref *ref;
667
668         /*
669          * clear this cpu out of the mask, if not present in the mask,
670          * don't bother doing anything.
671          */
672         if (!cpumask_test_and_clear_cpu(cpu, &core_imc_cpumask))
673                 return 0;
674
675         /*
676          * Check whether core_imc is registered. We could end up here
677          * if the cpuhotplug callback registration fails, i.e. the callback
678          * invokes the offline path for all successfully registered cpus.
679          * At this stage, core_imc pmu will not be registered and we
680          * should return here.
681          *
682          * We return with a zero since this is not an offline failure.
683          * And cpuhp_setup_state() returns the actual failure reason
684          * to the caller, which in turn will call the cleanup routine.
685          */
686         if (!core_imc_pmu->pmu.event_init)
687                 return 0;
688
689         /* Find any online cpu in that core except the current "cpu" */
690         ncpu = cpumask_last(cpu_sibling_mask(cpu));
691
692         if (unlikely(ncpu == cpu))
693                 ncpu = cpumask_any_but(cpu_sibling_mask(cpu), cpu);
694
695         if (ncpu >= 0 && ncpu < nr_cpu_ids) {
696                 cpumask_set_cpu(ncpu, &core_imc_cpumask);
697                 perf_pmu_migrate_context(&core_imc_pmu->pmu, cpu, ncpu);
698         } else {
699                 /*
700                  * If this is the last cpu in this core, skip taking the reference
701                  * count mutex lock for this core and directly zero "refc" for
702                  * this core.
703                  */
704                 opal_imc_counters_stop(OPAL_IMC_COUNTERS_CORE,
705                                        get_hard_smp_processor_id(cpu));
706                 core_id = cpu / threads_per_core;
707                 ref = &core_imc_refc[core_id];
708                 if (!ref)
709                         return -EINVAL;
710
711                 ref->refc = 0;
712                 /*
713                  * Reduce the global reference count, if this is the
714                  * last cpu in this core and a core-imc event is running
715                  * on this cpu.
716                  */
717                 mutex_lock(&imc_global_refc.lock);
718                 if (imc_global_refc.id == IMC_DOMAIN_CORE)
719                         imc_global_refc.refc--;
720
721                 mutex_unlock(&imc_global_refc.lock);
722         }
723         return 0;
724 }
725
726 static int core_imc_pmu_cpumask_init(void)
727 {
728         return cpuhp_setup_state(CPUHP_AP_PERF_POWERPC_CORE_IMC_ONLINE,
729                                  "perf/powerpc/imc_core:online",
730                                  ppc_core_imc_cpu_online,
731                                  ppc_core_imc_cpu_offline);
732 }
733
734 static void reset_global_refc(struct perf_event *event)
735 {
736                 mutex_lock(&imc_global_refc.lock);
737                 imc_global_refc.refc--;
738
739                 /*
740                  * If no other thread is running any
741                  * event for this domain(thread/core/trace),
742                  * set the global id to zero.
743                  */
744                 if (imc_global_refc.refc <= 0) {
745                         imc_global_refc.refc = 0;
746                         imc_global_refc.id = 0;
747                 }
748                 mutex_unlock(&imc_global_refc.lock);
749 }
750
751 static void core_imc_counters_release(struct perf_event *event)
752 {
753         int rc, core_id;
754         struct imc_pmu_ref *ref;
755
756         if (event->cpu < 0)
757                 return;
758         /*
759          * See if we need to disable the IMC PMU.
760          * If no events are currently in use, then we have to take a
761          * mutex to ensure that we don't race with another task doing
762          * enable or disable the core counters.
763          */
764         core_id = event->cpu / threads_per_core;
765
766         /* Take the mutex lock and decrement the reference count for this core */
767         ref = &core_imc_refc[core_id];
768         if (!ref)
769                 return;
770
771         mutex_lock(&ref->lock);
772         if (ref->refc == 0) {
773                 /*
774                  * This can happen when a perf session is started and then all
775                  * cpus in a given core are offlined.
776                  *
777                  * In the cpuhotplug offline path, ppc_core_imc_cpu_offline()
778                  * sets ref->refc to zero and makes an OPAL call to disable the
779                  * engine in that core when the cpu going offline is the last
780                  * cpu in the core.
781                  *
782                  */
783                 mutex_unlock(&ref->lock);
784                 return;
785         }
786         ref->refc--;
787         if (ref->refc == 0) {
788                 rc = opal_imc_counters_stop(OPAL_IMC_COUNTERS_CORE,
789                                             get_hard_smp_processor_id(event->cpu));
790                 if (rc) {
791                         mutex_unlock(&ref->lock);
792                         pr_err("IMC: Unable to stop the counters for core %d\n", core_id);
793                         return;
794                 }
795         } else if (ref->refc < 0) {
796                 WARN(1, "core-imc: Invalid event reference count\n");
797                 ref->refc = 0;
798         }
799         mutex_unlock(&ref->lock);
800
801         reset_global_refc(event);
802 }
803
804 static int core_imc_event_init(struct perf_event *event)
805 {
806         int core_id, rc;
807         u64 config = event->attr.config;
808         struct imc_mem_info *pcmi;
809         struct imc_pmu *pmu;
810         struct imc_pmu_ref *ref;
811
812         if (event->attr.type != event->pmu->type)
813                 return -ENOENT;
814
815         /* Sampling not supported */
816         if (event->hw.sample_period)
817                 return -EINVAL;
818
819         if (event->cpu < 0)
820                 return -EINVAL;
821
822         event->hw.idx = -1;
823         pmu = imc_event_to_pmu(event);
824
825         /* Sanity check for config (event offset) */
826         if (((config & IMC_EVENT_OFFSET_MASK) > pmu->counter_mem_size))
827                 return -EINVAL;
828
829         if (!is_core_imc_mem_inited(event->cpu))
830                 return -ENODEV;
831
832         core_id = event->cpu / threads_per_core;
833         pcmi = &core_imc_pmu->mem_info[core_id];
834         if ((!pcmi->vbase))
835                 return -ENODEV;
836
837         /* Get the core_imc mutex for this core */
838         ref = &core_imc_refc[core_id];
839         if (!ref)
840                 return -EINVAL;
841
842         /*
843          * Core pmu units are enabled only when they are used.
844          * See if this is triggered for the first time.
845          * If yes, take the mutex lock and enable the core counters.
846          * If not, just increment the count in core_imc_refc struct.
847          */
848         mutex_lock(&ref->lock);
849         if (ref->refc == 0) {
850                 rc = opal_imc_counters_start(OPAL_IMC_COUNTERS_CORE,
851                                              get_hard_smp_processor_id(event->cpu));
852                 if (rc) {
853                         mutex_unlock(&ref->lock);
854                         pr_err("core-imc: Unable to start the counters for core %d\n",
855                                                                         core_id);
856                         return rc;
857                 }
858         }
859         ++ref->refc;
860         mutex_unlock(&ref->lock);
861
862         /*
863          * Since the system can run either in accumulation or trace-mode
864          * of IMC at a time, core-imc events are allowed only if no other
865          * trace/thread imc events are enabled/monitored.
866          *
867          * Take the global lock, and check the refc.id
868          * to know whether any other trace/thread imc
869          * events are running.
870          */
871         mutex_lock(&imc_global_refc.lock);
872         if (imc_global_refc.id == 0 || imc_global_refc.id == IMC_DOMAIN_CORE) {
873                 /*
874                  * No other trace/thread imc events are running in
875                  * the system, so set the refc.id to core-imc.
876                  */
877                 imc_global_refc.id = IMC_DOMAIN_CORE;
878                 imc_global_refc.refc++;
879         } else {
880                 mutex_unlock(&imc_global_refc.lock);
881                 return -EBUSY;
882         }
883         mutex_unlock(&imc_global_refc.lock);
884
885         event->hw.event_base = (u64)pcmi->vbase + (config & IMC_EVENT_OFFSET_MASK);
886         event->destroy = core_imc_counters_release;
887         return 0;
888 }
889
890 /*
891  * Allocates a page of memory for each of the online cpus, and loads
892  * LDBAR with 0.
893  * The physical base address of the page allocated for a cpu will be
894  * written to the LDBAR for that cpu, when the thread-imc event
895  * is added.
896  *
897  * LDBAR Register Layout:
898  *
899  *  0          4         8         12        16        20        24        28
900  * | - - - - | - - - - | - - - - | - - - - | - - - - | - - - - | - - - - | - - - - |
901  *   | |       [   ]    [                   Counter Address [8:50]
902  *   | * Mode    |
903  *   |           * PB Scope
904  *   * Enable/Disable
905  *
906  *  32        36        40        44        48        52        56        60
907  * | - - - - | - - - - | - - - - | - - - - | - - - - | - - - - | - - - - | - - - - |
908  *           Counter Address [8:50]              ]
909  *
910  */
911 static int thread_imc_mem_alloc(int cpu_id, int size)
912 {
913         u64 *local_mem = per_cpu(thread_imc_mem, cpu_id);
914         int nid = cpu_to_node(cpu_id);
915
916         if (!local_mem) {
917                 struct page *page;
918                 /*
919                  * This case could happen only once, at start, since we don't
920                  * free the memory in the cpu offline path.
921                  */
922                 page = alloc_pages_node(nid,
923                                   GFP_KERNEL | __GFP_ZERO | __GFP_THISNODE |
924                                   __GFP_NOWARN, get_order(size));
925                 if (!page)
926                         return -ENOMEM;
927                 local_mem = page_address(page);
928
929                 per_cpu(thread_imc_mem, cpu_id) = local_mem;
930         }
931
932         mtspr(SPRN_LDBAR, 0);
933         return 0;
934 }
935
936 static int ppc_thread_imc_cpu_online(unsigned int cpu)
937 {
938         return thread_imc_mem_alloc(cpu, thread_imc_mem_size);
939 }
940
941 static int ppc_thread_imc_cpu_offline(unsigned int cpu)
942 {
943         /*
944          * Set the bit 0 of LDBAR to zero.
945          *
946          * If bit 0 of LDBAR is unset, it will stop posting
947          * the counter data to memory.
948          * For thread-imc, bit 0 of LDBAR will be set to 1 in the
949          * event_add function. So reset this bit here, to stop the updates
950          * to memory in the cpu_offline path.
951          */
952         mtspr(SPRN_LDBAR, (mfspr(SPRN_LDBAR) & (~(1UL << 63))));
953
954         /* Reduce the refc if a thread-imc event is running on this cpu */
955         mutex_lock(&imc_global_refc.lock);
956         if (imc_global_refc.id == IMC_DOMAIN_THREAD)
957                 imc_global_refc.refc--;
958         mutex_unlock(&imc_global_refc.lock);
959
960         return 0;
961 }
962
963 static int thread_imc_cpu_init(void)
964 {
965         return cpuhp_setup_state(CPUHP_AP_PERF_POWERPC_THREAD_IMC_ONLINE,
966                           "perf/powerpc/imc_thread:online",
967                           ppc_thread_imc_cpu_online,
968                           ppc_thread_imc_cpu_offline);
969 }
970
971 static int thread_imc_event_init(struct perf_event *event)
972 {
973         u32 config = event->attr.config;
974         struct task_struct *target;
975         struct imc_pmu *pmu;
976
977         if (event->attr.type != event->pmu->type)
978                 return -ENOENT;
979
980         if (!perfmon_capable())
981                 return -EACCES;
982
983         /* Sampling not supported */
984         if (event->hw.sample_period)
985                 return -EINVAL;
986
987         event->hw.idx = -1;
988         pmu = imc_event_to_pmu(event);
989
990         /* Sanity check for config offset */
991         if (((config & IMC_EVENT_OFFSET_MASK) > pmu->counter_mem_size))
992                 return -EINVAL;
993
994         target = event->hw.target;
995         if (!target)
996                 return -EINVAL;
997
998         mutex_lock(&imc_global_refc.lock);
999         /*
1000          * Check if any other trace/core imc events are running in the
1001          * system, if not set the global id to thread-imc.
1002          */
1003         if (imc_global_refc.id == 0 || imc_global_refc.id == IMC_DOMAIN_THREAD) {
1004                 imc_global_refc.id = IMC_DOMAIN_THREAD;
1005                 imc_global_refc.refc++;
1006         } else {
1007                 mutex_unlock(&imc_global_refc.lock);
1008                 return -EBUSY;
1009         }
1010         mutex_unlock(&imc_global_refc.lock);
1011
1012         event->pmu->task_ctx_nr = perf_sw_context;
1013         event->destroy = reset_global_refc;
1014         return 0;
1015 }
1016
1017 static bool is_thread_imc_pmu(struct perf_event *event)
1018 {
1019         if (!strncmp(event->pmu->name, "thread_imc", strlen("thread_imc")))
1020                 return true;
1021
1022         return false;
1023 }
1024
1025 static u64 * get_event_base_addr(struct perf_event *event)
1026 {
1027         u64 addr;
1028
1029         if (is_thread_imc_pmu(event)) {
1030                 addr = (u64)per_cpu(thread_imc_mem, smp_processor_id());
1031                 return (u64 *)(addr + (event->attr.config & IMC_EVENT_OFFSET_MASK));
1032         }
1033
1034         return (u64 *)event->hw.event_base;
1035 }
1036
1037 static void thread_imc_pmu_start_txn(struct pmu *pmu,
1038                                      unsigned int txn_flags)
1039 {
1040         if (txn_flags & ~PERF_PMU_TXN_ADD)
1041                 return;
1042         perf_pmu_disable(pmu);
1043 }
1044
1045 static void thread_imc_pmu_cancel_txn(struct pmu *pmu)
1046 {
1047         perf_pmu_enable(pmu);
1048 }
1049
1050 static int thread_imc_pmu_commit_txn(struct pmu *pmu)
1051 {
1052         perf_pmu_enable(pmu);
1053         return 0;
1054 }
1055
1056 static u64 imc_read_counter(struct perf_event *event)
1057 {
1058         u64 *addr, data;
1059
1060         /*
1061          * In-Memory Collection (IMC) counters are free flowing counters.
1062          * So we take a snapshot of the counter value on enable and save it
1063          * to calculate the delta at a later stage to present the event counter
1064          * value.
1065          */
1066         addr = get_event_base_addr(event);
1067         data = be64_to_cpu(READ_ONCE(*addr));
1068         local64_set(&event->hw.prev_count, data);
1069
1070         return data;
1071 }
1072
1073 static void imc_event_update(struct perf_event *event)
1074 {
1075         u64 counter_prev, counter_new, final_count;
1076
1077         counter_prev = local64_read(&event->hw.prev_count);
1078         counter_new = imc_read_counter(event);
1079         final_count = counter_new - counter_prev;
1080
1081         /* Update the delta to the event count */
1082         local64_add(final_count, &event->count);
1083 }
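/*
 * Worked example of the delta above (the numbers are made up): if the
 * snapshot taken when the event was started read 0x1000 and the
 * counter in memory now reads 0x1640, imc_event_update() adds 0x640
 * to event->count. The counters are free running, so only the
 * difference between two reads is meaningful.
 */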
1084
1085 static void imc_event_start(struct perf_event *event, int flags)
1086 {
1087         /*
1088          * In Memory Counters are free flowing counters. HW or the microcode
1089          * keeps adding to the counter offset in memory. To get the event
1090          * counter value, we snapshot the value here and calculate the
1091          * delta at a later point.
1092          */
1093         imc_read_counter(event);
1094 }
1095
1096 static void imc_event_stop(struct perf_event *event, int flags)
1097 {
1098         /*
1099          * Take a snapshot and calculate the delta and update
1100          * the event counter values.
1101          */
1102         imc_event_update(event);
1103 }
1104
1105 static int imc_event_add(struct perf_event *event, int flags)
1106 {
1107         if (flags & PERF_EF_START)
1108                 imc_event_start(event, flags);
1109
1110         return 0;
1111 }
1112
1113 static int thread_imc_event_add(struct perf_event *event, int flags)
1114 {
1115         int core_id;
1116         struct imc_pmu_ref *ref;
1117         u64 ldbar_value, *local_mem = per_cpu(thread_imc_mem, smp_processor_id());
1118
1119         if (flags & PERF_EF_START)
1120                 imc_event_start(event, flags);
1121
1122         if (!is_core_imc_mem_inited(smp_processor_id()))
1123                 return -EINVAL;
1124
1125         core_id = smp_processor_id() / threads_per_core;
1126         ldbar_value = ((u64)local_mem & THREAD_IMC_LDBAR_MASK) | THREAD_IMC_ENABLE;
1127         mtspr(SPRN_LDBAR, ldbar_value);
1128
1129         /*
1130          * imc pmus are enabled only when they are used.
1131          * See if this is triggered for the first time.
1132          * If yes, take the mutex lock and enable the counters.
1133          * If not, just increment the count in ref count struct.
1134          */
1135         ref = &core_imc_refc[core_id];
1136         if (!ref)
1137                 return -EINVAL;
1138
1139         mutex_lock(&ref->lock);
1140         if (ref->refc == 0) {
1141                 if (opal_imc_counters_start(OPAL_IMC_COUNTERS_CORE,
1142                     get_hard_smp_processor_id(smp_processor_id()))) {
1143                         mutex_unlock(&ref->lock);
1144                         pr_err("thread-imc: Unable to start the counter\
1145                                 for core %d\n", core_id);
1146                         return -EINVAL;
1147                 }
1148         }
1149         ++ref->refc;
1150         mutex_unlock(&ref->lock);
1151         return 0;
1152 }
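/*
 * Sketch of the LDBAR value built above (the buffer address is made
 * up): with the per-thread buffer at 0xc000000012340000,
 *
 *   ldbar_value = (0xc000000012340000 & THREAD_IMC_LDBAR_MASK)
 *                 | THREAD_IMC_ENABLE;
 *
 * puts the buffer address into the counter-address field of LDBAR and
 * sets the enable bit (bit 0 in the layout comment further up), after
 * which the microcode posts thread-imc counter data to that buffer.
 */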
1153
1154 static void thread_imc_event_del(struct perf_event *event, int flags)
1155 {
1156
1157         int core_id;
1158         struct imc_pmu_ref *ref;
1159
1160         core_id = smp_processor_id() / threads_per_core;
1161         ref = &core_imc_refc[core_id];
1162         if (!ref) {
1163                 pr_debug("imc: Failed to get event reference count\n");
1164                 return;
1165         }
1166
1167         mutex_lock(&ref->lock);
1168         ref->refc--;
1169         if (ref->refc == 0) {
1170                 if (opal_imc_counters_stop(OPAL_IMC_COUNTERS_CORE,
1171                     get_hard_smp_processor_id(smp_processor_id()))) {
1172                         mutex_unlock(&ref->lock);
1173                         pr_err("thread-imc: Unable to stop the counters for core %d\n",
1174                                core_id);
1175                         return;
1176                 }
1177         } else if (ref->refc < 0) {
1178                 ref->refc = 0;
1179         }
1180         mutex_unlock(&ref->lock);
1181
1182         /* Set bit 0 of LDBAR to zero, to stop posting updates to memory */
1183         mtspr(SPRN_LDBAR, (mfspr(SPRN_LDBAR) & (~(1UL << 63))));
1184
1185         /*
1186          * Take a snapshot and calculate the delta and update
1187          * the event counter values.
1188          */
1189         imc_event_update(event);
1190 }
1191
1192 /*
1193  * Allocate a page of memory for each cpu, and load LDBAR with 0.
1194  */
1195 static int trace_imc_mem_alloc(int cpu_id, int size)
1196 {
1197         u64 *local_mem = per_cpu(trace_imc_mem, cpu_id);
1198         int phys_id = cpu_to_node(cpu_id), rc = 0;
1199         int core_id = (cpu_id / threads_per_core);
1200
1201         if (!local_mem) {
1202                 struct page *page;
1203
1204                 page = alloc_pages_node(phys_id,
1205                                 GFP_KERNEL | __GFP_ZERO | __GFP_THISNODE |
1206                                 __GFP_NOWARN, get_order(size));
1207                 if (!page)
1208                         return -ENOMEM;
1209                 local_mem = page_address(page);
1210                 per_cpu(trace_imc_mem, cpu_id) = local_mem;
1211
1212                 /* Initialise the counters for trace mode */
1213                 rc = opal_imc_counters_init(OPAL_IMC_COUNTERS_TRACE, __pa((void *)local_mem),
1214                                             get_hard_smp_processor_id(cpu_id));
1215                 if (rc) {
1216                         pr_info("IMC: opal init failed for trace imc\n");
1217                         return rc;
1218                 }
1219         }
1220
1221         /* Init the mutex, if not already */
1222         trace_imc_refc[core_id].id = core_id;
1223         mutex_init(&trace_imc_refc[core_id].lock);
1224
1225         mtspr(SPRN_LDBAR, 0);
1226         return 0;
1227 }
1228
1229 static int ppc_trace_imc_cpu_online(unsigned int cpu)
1230 {
1231         return trace_imc_mem_alloc(cpu, trace_imc_mem_size);
1232 }
1233
1234 static int ppc_trace_imc_cpu_offline(unsigned int cpu)
1235 {
1236         /*
1237          * No need to set bit 0 of LDBAR to zero, as
1238          * it is set to zero for imc trace-mode.
1239          *
1240          * Reduce the refc if any trace-imc event is running
1241          * on this cpu.
1242          */
1243         mutex_lock(&imc_global_refc.lock);
1244         if (imc_global_refc.id == IMC_DOMAIN_TRACE)
1245                 imc_global_refc.refc--;
1246         mutex_unlock(&imc_global_refc.lock);
1247
1248         return 0;
1249 }
1250
1251 static int trace_imc_cpu_init(void)
1252 {
1253         return cpuhp_setup_state(CPUHP_AP_PERF_POWERPC_TRACE_IMC_ONLINE,
1254                           "perf/powerpc/imc_trace:online",
1255                           ppc_trace_imc_cpu_online,
1256                           ppc_trace_imc_cpu_offline);
1257 }
1258
1259 static u64 get_trace_imc_event_base_addr(void)
1260 {
1261         return (u64)per_cpu(trace_imc_mem, smp_processor_id());
1262 }
1263
1264 /*
1265  * Function to parse trace-imc data obtained
1266  * and to prepare the perf sample.
1267  */
1268 static int trace_imc_prepare_sample(struct trace_imc_data *mem,
1269                                     struct perf_sample_data *data,
1270                                     u64 *prev_tb,
1271                                     struct perf_event_header *header,
1272                                     struct perf_event *event)
1273 {
1274         /* Sanity checks for a valid record */
1275         if (be64_to_cpu(READ_ONCE(mem->tb1)) > *prev_tb)
1276                 *prev_tb = be64_to_cpu(READ_ONCE(mem->tb1));
1277         else
1278                 return -EINVAL;
1279
1280         if ((be64_to_cpu(READ_ONCE(mem->tb1)) & IMC_TRACE_RECORD_TB1_MASK) !=
1281                          be64_to_cpu(READ_ONCE(mem->tb2)))
1282                 return -EINVAL;
1283
1284         /* Prepare perf sample */
1285         data->ip =  be64_to_cpu(READ_ONCE(mem->ip));
1286         data->period = event->hw.last_period;
1287
1288         header->type = PERF_RECORD_SAMPLE;
1289         header->size = sizeof(*header) + event->header_size;
1290         header->misc = 0;
1291
1292         if (cpu_has_feature(CPU_FTR_ARCH_31)) {
1293                 switch (IMC_TRACE_RECORD_VAL_HVPR(be64_to_cpu(READ_ONCE(mem->val)))) {
1294                 case 0: /* when MSR HV and PR are not set in the trace-record */
1295                         header->misc |= PERF_RECORD_MISC_GUEST_KERNEL;
1296                         break;
1297                 case 1: /* MSR HV is 0 and PR is 1 */
1298                         header->misc |= PERF_RECORD_MISC_GUEST_USER;
1299                         break;
1300                 case 2: /* MSR HV is 1 and PR is 0 */
1301                         header->misc |= PERF_RECORD_MISC_KERNEL;
1302                         break;
1303                 case 3: /* MSR HV is 1 and PR is 1 */
1304                         header->misc |= PERF_RECORD_MISC_USER;
1305                         break;
1306                 default:
1307                         pr_info("IMC: Unable to set the flag based on MSR bits\n");
1308                         break;
1309                 }
1310         } else {
1311                 if (is_kernel_addr(data->ip))
1312                         header->misc |= PERF_RECORD_MISC_KERNEL;
1313                 else
1314                         header->misc |= PERF_RECORD_MISC_USER;
1315         }
1316         perf_event_header__init_id(header, data, event);
1317
1318         return 0;
1319 }
1320
1321 static void dump_trace_imc_data(struct perf_event *event)
1322 {
1323         struct trace_imc_data *mem;
1324         int i, ret;
1325         u64 prev_tb = 0;
1326
1327         mem = (struct trace_imc_data *)get_trace_imc_event_base_addr();
1328         for (i = 0; i < (trace_imc_mem_size / sizeof(struct trace_imc_data));
1329                 i++, mem++) {
1330                 struct perf_sample_data data;
1331                 struct perf_event_header header;
1332
1333                 ret = trace_imc_prepare_sample(mem, &data, &prev_tb, &header, event);
1334                 if (ret) /* Exit, if not a valid record */
1335                         break;
1336                 else {
1337                         /* If this is a valid record, create the sample */
1338                         struct perf_output_handle handle;
1339
1340                         if (perf_output_begin(&handle, &data, event, header.size))
1341                                 return;
1342
1343                         perf_output_sample(&handle, &header, &data, event);
1344                         perf_output_end(&handle);
1345                 }
1346         }
1347 }
1348
1349 static int trace_imc_event_add(struct perf_event *event, int flags)
1350 {
1351         int core_id = smp_processor_id() / threads_per_core;
1352         struct imc_pmu_ref *ref = NULL;
1353         u64 local_mem, ldbar_value;
1354
1355         /* Set trace-imc bit in ldbar and load ldbar with per-thread memory address */
1356         local_mem = get_trace_imc_event_base_addr();
1357         ldbar_value = ((u64)local_mem & THREAD_IMC_LDBAR_MASK) | TRACE_IMC_ENABLE;
1358
1359         /* trace-imc reference count */
1360         if (trace_imc_refc)
1361                 ref = &trace_imc_refc[core_id];
1362         if (!ref) {
1363                 pr_debug("imc: Failed to get the event reference count\n");
1364                 return -EINVAL;
1365         }
1366
1367         mtspr(SPRN_LDBAR, ldbar_value);
1368         mutex_lock(&ref->lock);
1369         if (ref->refc == 0) {
1370                 if (opal_imc_counters_start(OPAL_IMC_COUNTERS_TRACE,
1371                                 get_hard_smp_processor_id(smp_processor_id()))) {
1372                         mutex_unlock(&ref->lock);
1373                         pr_err("trace-imc: Unable to start the counters for core %d\n", core_id);
1374                         return -EINVAL;
1375                 }
1376         }
1377         ++ref->refc;
1378         mutex_unlock(&ref->lock);
1379         return 0;
1380 }
1381
1382 static void trace_imc_event_read(struct perf_event *event)
1383 {
1384         return;
1385 }
1386
1387 static void trace_imc_event_stop(struct perf_event *event, int flags)
1388 {
1389         u64 local_mem = get_trace_imc_event_base_addr();
1390         dump_trace_imc_data(event);
1391         memset((void *)local_mem, 0, sizeof(u64));
1392 }
1393
1394 static void trace_imc_event_start(struct perf_event *event, int flags)
1395 {
1396         return;
1397 }
1398
1399 static void trace_imc_event_del(struct perf_event *event, int flags)
1400 {
1401         int core_id = smp_processor_id() / threads_per_core;
1402         struct imc_pmu_ref *ref = NULL;
1403
1404         if (trace_imc_refc)
1405                 ref = &trace_imc_refc[core_id];
1406         if (!ref) {
1407                 pr_debug("imc: Failed to get event reference count\n");
1408                 return;
1409         }
1410
1411         mutex_lock(&ref->lock);
1412         ref->refc--;
1413         if (ref->refc == 0) {
1414                 if (opal_imc_counters_stop(OPAL_IMC_COUNTERS_TRACE,
1415                                 get_hard_smp_processor_id(smp_processor_id()))) {
1416                         mutex_unlock(&ref->lock);
1417                         pr_err("trace-imc: Unable to stop the counters for core %d\n", core_id);
1418                         return;
1419                 }
1420         } else if (ref->refc < 0) {
1421                 ref->refc = 0;
1422         }
1423         mutex_unlock(&ref->lock);
1424
1425         trace_imc_event_stop(event, flags);
1426 }
1427
1428 static int trace_imc_event_init(struct perf_event *event)
1429 {
1430         if (event->attr.type != event->pmu->type)
1431                 return -ENOENT;
1432
1433         if (!perfmon_capable())
1434                 return -EACCES;
1435
1436         /* Return if this is a counting event */
1437         if (event->attr.sample_period == 0)
1438                 return -ENOENT;
1439
1440         /*
1441          * Take the global lock, and make sure
1442          * no other thread is running any core/thread imc
1443          * events
1444          */
1445         mutex_lock(&imc_global_refc.lock);
1446         if (imc_global_refc.id == 0 || imc_global_refc.id == IMC_DOMAIN_TRACE) {
1447                 /*
1448                  * No core/thread imc events are running in the
1449                  * system, so set the refc.id to trace-imc.
1450                  */
1451                 imc_global_refc.id = IMC_DOMAIN_TRACE;
1452                 imc_global_refc.refc++;
1453         } else {
1454                 mutex_unlock(&imc_global_refc.lock);
1455                 return -EBUSY;
1456         }
1457         mutex_unlock(&imc_global_refc.lock);
1458
1459         event->hw.idx = -1;
1460
1461         /*
1462          * Only one PMU can use perf_hw_context, and that slot is taken by
1463          * the core PMU. Hence use "perf_sw_context" for trace_imc.
1464          */
1465         event->pmu->task_ctx_nr = perf_sw_context;
1466         event->destroy = reset_global_refc;
1467         return 0;
1468 }
1469
1470 /* update_pmu_ops : Populate the appropriate operations for "pmu" */
1471 static int update_pmu_ops(struct imc_pmu *pmu)
1472 {
1473         pmu->pmu.task_ctx_nr = perf_invalid_context;
1474         pmu->pmu.add = imc_event_add;
1475         pmu->pmu.del = imc_event_stop;
1476         pmu->pmu.start = imc_event_start;
1477         pmu->pmu.stop = imc_event_stop;
1478         pmu->pmu.read = imc_event_update;
1479         pmu->pmu.attr_groups = pmu->attr_groups;
1480         pmu->pmu.capabilities = PERF_PMU_CAP_NO_EXCLUDE;
1481         pmu->attr_groups[IMC_FORMAT_ATTR] = &imc_format_group;
1482
1483         switch (pmu->domain) {
1484         case IMC_DOMAIN_NEST:
1485                 pmu->pmu.event_init = nest_imc_event_init;
1486                 pmu->attr_groups[IMC_CPUMASK_ATTR] = &imc_pmu_cpumask_attr_group;
1487                 break;
1488         case IMC_DOMAIN_CORE:
1489                 pmu->pmu.event_init = core_imc_event_init;
1490                 pmu->attr_groups[IMC_CPUMASK_ATTR] = &imc_pmu_cpumask_attr_group;
1491                 break;
1492         case IMC_DOMAIN_THREAD:
1493                 pmu->pmu.event_init = thread_imc_event_init;
1494                 pmu->pmu.add = thread_imc_event_add;
1495                 pmu->pmu.del = thread_imc_event_del;
1496                 pmu->pmu.start_txn = thread_imc_pmu_start_txn;
1497                 pmu->pmu.cancel_txn = thread_imc_pmu_cancel_txn;
1498                 pmu->pmu.commit_txn = thread_imc_pmu_commit_txn;
1499                 break;
1500         case IMC_DOMAIN_TRACE:
1501                 pmu->pmu.event_init = trace_imc_event_init;
1502                 pmu->pmu.add = trace_imc_event_add;
1503                 pmu->pmu.del = trace_imc_event_del;
1504                 pmu->pmu.start = trace_imc_event_start;
1505                 pmu->pmu.stop = trace_imc_event_stop;
1506                 pmu->pmu.read = trace_imc_event_read;
1507                 pmu->attr_groups[IMC_FORMAT_ATTR] = &trace_imc_format_group;
1508                 break;
1509         default:
1510                 break;
1511         }
1512
1513         return 0;
1514 }
1515
1516 /* init_nest_pmu_ref: Initialize the imc_pmu_ref struct for all the nodes */
1517 static int init_nest_pmu_ref(void)
1518 {
1519         int nid, i, cpu;
1520
1521         nest_imc_refc = kcalloc(num_possible_nodes(), sizeof(*nest_imc_refc),
1522                                                                 GFP_KERNEL);
1523
1524         if (!nest_imc_refc)
1525                 return -ENOMEM;
1526
1527         i = 0;
1528         for_each_node(nid) {
1529                 /*
1530                  * Mutex lock to avoid races while tracking the number of
1531                  * sessions using the chip's nest pmu units.
1532                  */
1533                 mutex_init(&nest_imc_refc[i].lock);
1534
1535                 /*
1536                  * Initialize the "id" field with the node id. Variable "i", starting
1537                  * at 0, is used as the array index; it cannot run off the end of the
1538                  * array because "for_each_node" iterates over at most "N_POSSIBLE"
1539                  * nodes.
1540                  */
1541                 nest_imc_refc[i++].id = nid;
1542         }
1543
1544         /*
1545          * Initialize the per_cpu "local_nest_imc_refc" with the corresponding
1546          * "nest_imc_refc" entry. This makes get_nest_pmu_ref() a lot simpler.
1547          */
1548         for_each_possible_cpu(cpu) {
1549                 nid = cpu_to_node(cpu);
1550                 for (i = 0; i < num_possible_nodes(); i++) {
1551                         if (nest_imc_refc[i].id == nid) {
1552                                 per_cpu(local_nest_imc_refc, cpu) = &nest_imc_refc[i];
1553                                 break;
1554                         }
1555                 }
1556         }
1557         return 0;
1558 }
1559
1560 static void cleanup_all_core_imc_memory(void)
1561 {
1562         int i, nr_cores = DIV_ROUND_UP(num_possible_cpus(), threads_per_core);
1563         struct imc_mem_info *ptr = core_imc_pmu->mem_info;
1564         int size = core_imc_pmu->counter_mem_size;
1565
1566         /* mem_info will never be NULL */
1567         for (i = 0; i < nr_cores; i++) {
1568                 if (ptr[i].vbase)
1569                         free_pages((u64)ptr[i].vbase, get_order(size));
1570         }
1571
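             /* Release the per-core mem_info array and the reference counters */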
1572         kfree(ptr);
1573         kfree(core_imc_refc);
1574 }
1575
1576 static void thread_imc_ldbar_disable(void *dummy)
1577 {
1578         /*
1579          * Clearing bit 0 of LDBAR (the MSB, hence the 1UL << 63 below)
1580          * disables thread-imc updates to memory.
1581          */
1582         mtspr(SPRN_LDBAR, (mfspr(SPRN_LDBAR) & (~(1UL << 63))));
1583 }
1584
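     /* Disable thread-imc counter updates on all online cpus */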
1585 void thread_imc_disable(void)
1586 {
1587         on_each_cpu(thread_imc_ldbar_disable, NULL, 1);
1588 }
1589
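     /* Free the per-cpu counter memory allocated for thread-imc */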
1590 static void cleanup_all_thread_imc_memory(void)
1591 {
1592         int i, order = get_order(thread_imc_mem_size);
1593
1594         for_each_online_cpu(i) {
1595                 if (per_cpu(thread_imc_mem, i))
1596                         free_pages((u64)per_cpu(thread_imc_mem, i), order);
1597
1598         }
1599 }
1600
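     /* Free the per-cpu trace buffers and the trace-imc reference counters */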
1601 static void cleanup_all_trace_imc_memory(void)
1602 {
1603         int i, order = get_order(trace_imc_mem_size);
1604
1605         for_each_online_cpu(i) {
1606                 if (per_cpu(trace_imc_mem, i))
1607                         free_pages((u64)per_cpu(trace_imc_mem, i), order);
1608
1609         }
1610         kfree(trace_imc_refc);
1611 }
1612
1613 /* Free the dynamically allocated event attribute group of the pmu */
1614 static void imc_common_mem_free(struct imc_pmu *pmu_ptr)
1615 {
1616         if (pmu_ptr->attr_groups[IMC_EVENT_ATTR])
1617                 kfree(pmu_ptr->attr_groups[IMC_EVENT_ATTR]->attrs);
1618         kfree(pmu_ptr->attr_groups[IMC_EVENT_ATTR]);
1619 }
1620
1621 /*
1622  * Common function to unregister cpu hotplug callback and
1623  * free the memory.
1624  * TODO: Need to handle pmu unregistering, which will be
1625  * done in a follow-up series.
1626  */
1627 static void imc_common_cpuhp_mem_free(struct imc_pmu *pmu_ptr)
1628 {
1629         if (pmu_ptr->domain == IMC_DOMAIN_NEST) {
1630                 mutex_lock(&nest_init_lock);
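                     /* The last nest pmu also tears down the shared nest state */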
1631                 if (nest_pmus == 1) {
1632                         cpuhp_remove_state(CPUHP_AP_PERF_POWERPC_NEST_IMC_ONLINE);
1633                         kfree(nest_imc_refc);
1634                         kfree(per_nest_pmu_arr);
1635                         per_nest_pmu_arr = NULL;
1636                 }
1637
1638                 if (nest_pmus > 0)
1639                         nest_pmus--;
1640                 mutex_unlock(&nest_init_lock);
1641         }
1642
1643         /* Free core_imc memory */
1644         if (pmu_ptr->domain == IMC_DOMAIN_CORE) {
1645                 cpuhp_remove_state(CPUHP_AP_PERF_POWERPC_CORE_IMC_ONLINE);
1646                 cleanup_all_core_imc_memory();
1647         }
1648
1649         /* Free thread_imc memory */
1650         if (pmu_ptr->domain == IMC_DOMAIN_THREAD) {
1651                 cpuhp_remove_state(CPUHP_AP_PERF_POWERPC_THREAD_IMC_ONLINE);
1652                 cleanup_all_thread_imc_memory();
1653         }
1654
1655         if (pmu_ptr->domain == IMC_DOMAIN_TRACE) {
1656                 cpuhp_remove_state(CPUHP_AP_PERF_POWERPC_TRACE_IMC_ONLINE);
1657                 cleanup_all_trace_imc_memory();
1658         }
1659 }
1660
1661 /*
1662  * Function to unregister thread-imc if core-imc
1663  * is not registered.
1664  */
1665 void unregister_thread_imc(void)
1666 {
1667         imc_common_cpuhp_mem_free(thread_imc_pmu);
1668         imc_common_mem_free(thread_imc_pmu);
1669         perf_pmu_unregister(&thread_imc_pmu->pmu);
1670 }
1671
1672 /*
1673  * imc_mem_init : Allocate memory and set the pmu name for the given imc domain.
1674  */
1675 static int imc_mem_init(struct imc_pmu *pmu_ptr, struct device_node *parent,
1676                                                                 int pmu_index)
1677 {
1678         const char *s;
1679         int nr_cores, cpu, res = -ENOMEM;
1680
1681         if (of_property_read_string(parent, "name", &s))
1682                 return -ENODEV;
1683
1684         switch (pmu_ptr->domain) {
1685         case IMC_DOMAIN_NEST:
1686                 /* Update the pmu name */
1687                 pmu_ptr->pmu.name = kasprintf(GFP_KERNEL, "%s%s_imc", "nest_", s);
1688                 if (!pmu_ptr->pmu.name)
1689                         goto err;
1690
1691                 /* Needed for hotplug/migration */
1692                 if (!per_nest_pmu_arr) {
1693                         per_nest_pmu_arr = kcalloc(get_max_nest_dev() + 1,
1694                                                 sizeof(struct imc_pmu *),
1695                                                 GFP_KERNEL);
1696                         if (!per_nest_pmu_arr)
1697                                 goto err;
1698                 }
1699                 per_nest_pmu_arr[pmu_index] = pmu_ptr;
1700                 break;
1701         case IMC_DOMAIN_CORE:
1702                 /* Update the pmu name */
1703                 pmu_ptr->pmu.name = kasprintf(GFP_KERNEL, "%s%s", s, "_imc");
1704                 if (!pmu_ptr->pmu.name)
1705                         goto err;
1706
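                     /* One mem_info entry and one reference counter per core */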
1707                 nr_cores = DIV_ROUND_UP(num_possible_cpus(), threads_per_core);
1708                 pmu_ptr->mem_info = kcalloc(nr_cores, sizeof(struct imc_mem_info),
1709                                                                 GFP_KERNEL);
1710
1711                 if (!pmu_ptr->mem_info)
1712                         goto err;
1713
1714                 core_imc_refc = kcalloc(nr_cores, sizeof(struct imc_pmu_ref),
1715                                                                 GFP_KERNEL);
1716
1717                 if (!core_imc_refc) {
1718                         kfree(pmu_ptr->mem_info);
1719                         goto err;
1720                 }
1721
1722                 core_imc_pmu = pmu_ptr;
1723                 break;
1724         case IMC_DOMAIN_THREAD:
1725                 /* Update the pmu name */
1726                 pmu_ptr->pmu.name = kasprintf(GFP_KERNEL, "%s%s", s, "_imc");
1727                 if (!pmu_ptr->pmu.name)
1728                         goto err;
1729
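                     /* Pre-allocate counter memory for every online cpu */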
1730                 thread_imc_mem_size = pmu_ptr->counter_mem_size;
1731                 for_each_online_cpu(cpu) {
1732                         res = thread_imc_mem_alloc(cpu, pmu_ptr->counter_mem_size);
1733                         if (res) {
1734                                 cleanup_all_thread_imc_memory();
1735                                 goto err;
1736                         }
1737                 }
1738
1739                 thread_imc_pmu = pmu_ptr;
1740                 break;
1741         case IMC_DOMAIN_TRACE:
1742                 /* Update the pmu name */
1743                 pmu_ptr->pmu.name = kasprintf(GFP_KERNEL, "%s%s", s, "_imc");
1744                 if (!pmu_ptr->pmu.name)
1745                         return -ENOMEM;
1746
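                     /* Trace-imc reference counting is done per core */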
1747                 nr_cores = DIV_ROUND_UP(num_possible_cpus(), threads_per_core);
1748                 trace_imc_refc = kcalloc(nr_cores, sizeof(struct imc_pmu_ref),
1749                                                                 GFP_KERNEL);
1750                 if (!trace_imc_refc)
1751                         return -ENOMEM;
1752
1753                 trace_imc_mem_size = pmu_ptr->counter_mem_size;
1754                 for_each_online_cpu(cpu) {
1755                         res = trace_imc_mem_alloc(cpu, trace_imc_mem_size);
1756                         if (res) {
1757                                 cleanup_all_trace_imc_memory();
1758                                 goto err;
1759                         }
1760                 }
1761                 break;
1762         default:
1763                 return -EINVAL;
1764         }
1765
1766         return 0;
1767 err:
1768         return res;
1769 }
1770
1771 /*
1772  * init_imc_pmu : Set up and register the IMC pmu device.
1773  *
1774  * @parent:     Device tree unit node
1775  * @pmu_ptr:    memory allocated for this pmu
1776  * @pmu_idx:    Count of nest pmus registered
1777  *
1778  * init_imc_pmu() sets up the pmu cpumask and registers a cpu hotplug callback.
1779  * On failure it frees any memory allocated along the way.
1780  */
1781 int init_imc_pmu(struct device_node *parent, struct imc_pmu *pmu_ptr, int pmu_idx)
1782 {
1783         int ret;
1784
1785         ret = imc_mem_init(pmu_ptr, parent, pmu_idx);
1786         if (ret)
1787                 goto err_free_mem;
1788
1789         switch (pmu_ptr->domain) {
1790         case IMC_DOMAIN_NEST:
1791                 /*
1792                  * Nest imc pmus need only one cpu per chip. We initialize the
1793                  * cpumask for the first nest imc pmu and reuse it for the rest.
1794                  * To handle cpuhotplug callback unregistration, we track the
1795                  * number of nest pmus in "nest_pmus".
1796                  */
1797                 mutex_lock(&nest_init_lock);
1798                 if (nest_pmus == 0) {
1799                         ret = init_nest_pmu_ref();
1800                         if (ret) {
1801                                 mutex_unlock(&nest_init_lock);
1802                                 kfree(per_nest_pmu_arr);
1803                                 per_nest_pmu_arr = NULL;
1804                                 goto err_free_mem;
1805                         }
1806                         /* Register for cpu hotplug notification. */
1807                         ret = nest_pmu_cpumask_init();
1808                         if (ret) {
1809                                 mutex_unlock(&nest_init_lock);
1810                                 kfree(nest_imc_refc);
1811                                 kfree(per_nest_pmu_arr);
1812                                 per_nest_pmu_arr = NULL;
1813                                 goto err_free_mem;
1814                         }
1815                 }
1816                 nest_pmus++;
1817                 mutex_unlock(&nest_init_lock);
1818                 break;
1819         case IMC_DOMAIN_CORE:
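                     /* Set up the core-imc cpumask and register its hotplug callback */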
1820                 ret = core_imc_pmu_cpumask_init();
1821                 if (ret) {
1822                         cleanup_all_core_imc_memory();
1823                         goto err_free_mem;
1824                 }
1825
1826                 break;
1827         case IMC_DOMAIN_THREAD:
1828                 ret = thread_imc_cpu_init();
1829                 if (ret) {
1830                         cleanup_all_thread_imc_memory();
1831                         goto err_free_mem;
1832                 }
1833
1834                 break;
1835         case IMC_DOMAIN_TRACE:
1836                 ret = trace_imc_cpu_init();
1837                 if (ret) {
1838                         cleanup_all_trace_imc_memory();
1839                         goto err_free_mem;
1840                 }
1841
1842                 break;
1843         default:
1844                 return -EINVAL;        /* Unknown domain */
1845         }
1846
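             /* Create the event attribute group from the device tree event nodes */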
1847         ret = update_events_in_group(parent, pmu_ptr);
1848         if (ret)
1849                 goto err_free_cpuhp_mem;
1850
1851         ret = update_pmu_ops(pmu_ptr);
1852         if (ret)
1853                 goto err_free_cpuhp_mem;
1854
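             /* All domain-specific setup is done; register with the perf core */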
1855         ret = perf_pmu_register(&pmu_ptr->pmu, pmu_ptr->pmu.name, -1);
1856         if (ret)
1857                 goto err_free_cpuhp_mem;
1858
1859         pr_debug("%s performance monitor hardware support registered\n",
1860                                                         pmu_ptr->pmu.name);
1861
1862         return 0;
1863
1864 err_free_cpuhp_mem:
1865         imc_common_cpuhp_mem_free(pmu_ptr);
1866 err_free_mem:
1867         imc_common_mem_free(pmu_ptr);
1868         return ret;
1869 }