GNU Linux-libre 4.14.324-gnu1
kernel/events/core.c
1 /*
2  * Performance events core code:
3  *
4  *  Copyright (C) 2008 Thomas Gleixner <tglx@linutronix.de>
5  *  Copyright (C) 2008-2011 Red Hat, Inc., Ingo Molnar
6  *  Copyright (C) 2008-2011 Red Hat, Inc., Peter Zijlstra
7  *  Copyright  ©  2009 Paul Mackerras, IBM Corp. <paulus@au1.ibm.com>
8  *
9  * For licensing details see kernel-base/COPYING
10  */
11
12 #include <linux/fs.h>
13 #include <linux/mm.h>
14 #include <linux/cpu.h>
15 #include <linux/smp.h>
16 #include <linux/idr.h>
17 #include <linux/file.h>
18 #include <linux/poll.h>
19 #include <linux/slab.h>
20 #include <linux/hash.h>
21 #include <linux/tick.h>
22 #include <linux/sysfs.h>
23 #include <linux/dcache.h>
24 #include <linux/percpu.h>
25 #include <linux/ptrace.h>
26 #include <linux/reboot.h>
27 #include <linux/vmstat.h>
28 #include <linux/device.h>
29 #include <linux/export.h>
30 #include <linux/vmalloc.h>
31 #include <linux/hardirq.h>
32 #include <linux/rculist.h>
33 #include <linux/uaccess.h>
34 #include <linux/syscalls.h>
35 #include <linux/anon_inodes.h>
36 #include <linux/kernel_stat.h>
37 #include <linux/cgroup.h>
38 #include <linux/perf_event.h>
39 #include <linux/trace_events.h>
40 #include <linux/hw_breakpoint.h>
41 #include <linux/mm_types.h>
42 #include <linux/module.h>
43 #include <linux/mman.h>
44 #include <linux/compat.h>
45 #include <linux/bpf.h>
46 #include <linux/filter.h>
47 #include <linux/namei.h>
48 #include <linux/parser.h>
49 #include <linux/sched/clock.h>
50 #include <linux/sched/mm.h>
51 #include <linux/proc_ns.h>
52 #include <linux/mount.h>
53
54 #include "internal.h"
55
56 #include <asm/irq_regs.h>
57
58 typedef int (*remote_function_f)(void *);
59
60 struct remote_function_call {
61         struct task_struct      *p;
62         remote_function_f       func;
63         void                    *info;
64         int                     ret;
65 };
66
67 static void remote_function(void *data)
68 {
69         struct remote_function_call *tfc = data;
70         struct task_struct *p = tfc->p;
71
72         if (p) {
73                 /* -EAGAIN */
74                 if (task_cpu(p) != smp_processor_id())
75                         return;
76
77                 /*
78                  * Now that we're on right CPU with IRQs disabled, we can test
79                  * if we hit the right task without races.
80                  */
81
82                 tfc->ret = -ESRCH; /* No such (running) process */
83                 if (p != current)
84                         return;
85         }
86
87         tfc->ret = tfc->func(tfc->info);
88 }
89
90 /**
91  * task_function_call - call a function on the cpu on which a task runs
92  * @p:          the task to evaluate
93  * @func:       the function to be called
94  * @info:       the function call argument
95  *
96  * Calls the function @func when the task is currently running. This might
97  * be on the current CPU, which just calls the function directly.  This will
98  * retry due to any failures in smp_call_function_single(), such as if the
99  * task_cpu() goes offline concurrently.
100  *
101  * returns: @func return value, or -ESRCH/-ENXIO when the process isn't running
102  */
103 static int
104 task_function_call(struct task_struct *p, remote_function_f func, void *info)
105 {
106         struct remote_function_call data = {
107                 .p      = p,
108                 .func   = func,
109                 .info   = info,
110                 .ret    = -EAGAIN,
111         };
112         int ret;
113
114         for (;;) {
115                 ret = smp_call_function_single(task_cpu(p), remote_function,
116                                                &data, 1);
117                 if (!ret)
118                         ret = data.ret;
119
120                 if (ret != -EAGAIN)
121                         break;
122
123                 cond_resched();
124         }
125
126         return ret;
127 }
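/*
 * Illustrative usage sketch (the helper below is hypothetical, not from
 * this file): callers wrap their work in a remote_function_f and let
 * task_function_call() handle the IPI and the -EAGAIN retry:
 *
 *        static int __do_poke(void *info)
 *        {
 *                struct perf_event *event = info;
 *
 *                ... work done with IRQs disabled on the task's CPU ...
 *                return 0;
 *        }
 *
 *        err = task_function_call(task, __do_poke, event);
 *
 * On return, err is either __do_poke()'s return value, -ESRCH if the task
 * is no longer running, or -ENXIO if its CPU went offline.
 */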
128
129 /**
130  * cpu_function_call - call a function on the cpu
131  * @func:       the function to be called
132  * @info:       the function call argument
133  *
134  * Calls the function @func on the remote cpu.
135  *
136  * returns: @func return value or -ENXIO when the cpu is offline
137  */
138 static int cpu_function_call(int cpu, remote_function_f func, void *info)
139 {
140         struct remote_function_call data = {
141                 .p      = NULL,
142                 .func   = func,
143                 .info   = info,
144                 .ret    = -ENXIO, /* No such CPU */
145         };
146
147         smp_call_function_single(cpu, remote_function, &data, 1);
148
149         return data.ret;
150 }
151
152 static inline struct perf_cpu_context *
153 __get_cpu_context(struct perf_event_context *ctx)
154 {
155         return this_cpu_ptr(ctx->pmu->pmu_cpu_context);
156 }
157
158 static void perf_ctx_lock(struct perf_cpu_context *cpuctx,
159                           struct perf_event_context *ctx)
160 {
161         raw_spin_lock(&cpuctx->ctx.lock);
162         if (ctx)
163                 raw_spin_lock(&ctx->lock);
164 }
165
166 static void perf_ctx_unlock(struct perf_cpu_context *cpuctx,
167                             struct perf_event_context *ctx)
168 {
169         if (ctx)
170                 raw_spin_unlock(&ctx->lock);
171         raw_spin_unlock(&cpuctx->ctx.lock);
172 }
173
174 #define TASK_TOMBSTONE ((void *)-1L)
175
176 static bool is_kernel_event(struct perf_event *event)
177 {
178         return READ_ONCE(event->owner) == TASK_TOMBSTONE;
179 }
180
181 /*
182  * On task ctx scheduling...
183  *
184  * When !ctx->nr_events a task context will not be scheduled. This means
185  * we can disable the scheduler hooks (for performance) without leaving
186  * pending task ctx state.
187  *
188  * This however results in two special cases:
189  *
190  *  - removing the last event from a task ctx; this is relatively
191  *    straightforward and is done in __perf_remove_from_context().
192  *
193  *  - adding the first event to a task ctx; this is tricky because we cannot
194  *    rely on ctx->is_active and therefore cannot use event_function_call().
195  *    See perf_install_in_context().
196  *
197  * If ctx->nr_events, then ctx->is_active and cpuctx->task_ctx are set.
198  */
199
200 typedef void (*event_f)(struct perf_event *, struct perf_cpu_context *,
201                         struct perf_event_context *, void *);
202
203 struct event_function_struct {
204         struct perf_event *event;
205         event_f func;
206         void *data;
207 };
208
209 static int event_function(void *info)
210 {
211         struct event_function_struct *efs = info;
212         struct perf_event *event = efs->event;
213         struct perf_event_context *ctx = event->ctx;
214         struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
215         struct perf_event_context *task_ctx = cpuctx->task_ctx;
216         int ret = 0;
217
218         WARN_ON_ONCE(!irqs_disabled());
219
220         perf_ctx_lock(cpuctx, task_ctx);
221         /*
222          * Since we do the IPI call without holding ctx->lock, things can have
223          * changed; double-check that we hit the task we set out to hit.
224          */
225         if (ctx->task) {
226                 if (ctx->task != current) {
227                         ret = -ESRCH;
228                         goto unlock;
229                 }
230
231                 /*
232                  * We only use event_function_call() on established contexts,
233                  * and event_function() is only ever called when active (or
234                  * rather, we'll have bailed in task_function_call() or the
235                  * above ctx->task != current test), therefore we must have
236                  * ctx->is_active here.
237                  */
238                 WARN_ON_ONCE(!ctx->is_active);
239                 /*
240                  * And since we have ctx->is_active, cpuctx->task_ctx must
241                  * match.
242                  */
243                 WARN_ON_ONCE(task_ctx != ctx);
244         } else {
245                 WARN_ON_ONCE(&cpuctx->ctx != ctx);
246         }
247
248         efs->func(event, cpuctx, ctx, efs->data);
249 unlock:
250         perf_ctx_unlock(cpuctx, task_ctx);
251
252         return ret;
253 }
254
255 static void event_function_call(struct perf_event *event, event_f func, void *data)
256 {
257         struct perf_event_context *ctx = event->ctx;
258         struct task_struct *task = READ_ONCE(ctx->task); /* verified in event_function */
259         struct event_function_struct efs = {
260                 .event = event,
261                 .func = func,
262                 .data = data,
263         };
264
265         if (!event->parent) {
266                 /*
267                  * If this is a !child event, we must hold ctx::mutex to
268                  * stabilize the event->ctx relation. See
269                  * perf_event_ctx_lock().
270                  */
271                 lockdep_assert_held(&ctx->mutex);
272         }
273
274         if (!task) {
275                 cpu_function_call(event->cpu, event_function, &efs);
276                 return;
277         }
278
279         if (task == TASK_TOMBSTONE)
280                 return;
281
282 again:
283         if (!task_function_call(task, event_function, &efs))
284                 return;
285
286         raw_spin_lock_irq(&ctx->lock);
287         /*
288          * Reload the task pointer; it might have been changed by
289          * a concurrent perf_event_context_sched_out().
290          */
291         task = ctx->task;
292         if (task == TASK_TOMBSTONE) {
293                 raw_spin_unlock_irq(&ctx->lock);
294                 return;
295         }
296         if (ctx->is_active) {
297                 raw_spin_unlock_irq(&ctx->lock);
298                 goto again;
299         }
300         func(event, NULL, ctx, data);
301         raw_spin_unlock_irq(&ctx->lock);
302 }
303
304 /*
305  * Similar to event_function_call() + event_function(), but hard assumes IRQs
306  * are already disabled and we're on the right CPU.
307  */
308 static void event_function_local(struct perf_event *event, event_f func, void *data)
309 {
310         struct perf_event_context *ctx = event->ctx;
311         struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
312         struct task_struct *task = READ_ONCE(ctx->task);
313         struct perf_event_context *task_ctx = NULL;
314
315         WARN_ON_ONCE(!irqs_disabled());
316
317         if (task) {
318                 if (task == TASK_TOMBSTONE)
319                         return;
320
321                 task_ctx = ctx;
322         }
323
324         perf_ctx_lock(cpuctx, task_ctx);
325
326         task = ctx->task;
327         if (task == TASK_TOMBSTONE)
328                 goto unlock;
329
330         if (task) {
331                 /*
332                  * We must be either inactive or active and the right task,
333                  * otherwise we're screwed, since we cannot IPI to somewhere
334                  * else.
335                  */
336                 if (ctx->is_active) {
337                         if (WARN_ON_ONCE(task != current))
338                                 goto unlock;
339
340                         if (WARN_ON_ONCE(cpuctx->task_ctx != ctx))
341                                 goto unlock;
342                 }
343         } else {
344                 WARN_ON_ONCE(&cpuctx->ctx != ctx);
345         }
346
347         func(event, cpuctx, ctx, data);
348 unlock:
349         perf_ctx_unlock(cpuctx, task_ctx);
350 }
351
352 #define PERF_FLAG_ALL (PERF_FLAG_FD_NO_GROUP |\
353                        PERF_FLAG_FD_OUTPUT  |\
354                        PERF_FLAG_PID_CGROUP |\
355                        PERF_FLAG_FD_CLOEXEC)
356
357 /*
358  * branch priv levels that need permission checks
359  */
360 #define PERF_SAMPLE_BRANCH_PERM_PLM \
361         (PERF_SAMPLE_BRANCH_KERNEL |\
362          PERF_SAMPLE_BRANCH_HV)
363
364 enum event_type_t {
365         EVENT_FLEXIBLE = 0x1,
366         EVENT_PINNED = 0x2,
367         EVENT_TIME = 0x4,
368         /* see ctx_resched() for details */
369         EVENT_CPU = 0x8,
370         EVENT_ALL = EVENT_FLEXIBLE | EVENT_PINNED,
371 };
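/*
 * These values are bit flags and may be OR'ed together; EVENT_ALL is the
 * pinned|flexible combination and is what e.g. perf_cgroup_switch() below
 * passes to cpu_ctx_sched_out() when it schedules everything out.
 */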
372
373 /*
374  * perf_sched_events : >0 events exist
375  * perf_cgroup_events: >0 per-cpu cgroup events exist on this cpu
376  */
377
378 static void perf_sched_delayed(struct work_struct *work);
379 DEFINE_STATIC_KEY_FALSE(perf_sched_events);
380 static DECLARE_DELAYED_WORK(perf_sched_work, perf_sched_delayed);
381 static DEFINE_MUTEX(perf_sched_mutex);
382 static atomic_t perf_sched_count;
383
384 static DEFINE_PER_CPU(atomic_t, perf_cgroup_events);
385 static DEFINE_PER_CPU(int, perf_sched_cb_usages);
386 static DEFINE_PER_CPU(struct pmu_event_list, pmu_sb_events);
387
388 static atomic_t nr_mmap_events __read_mostly;
389 static atomic_t nr_comm_events __read_mostly;
390 static atomic_t nr_namespaces_events __read_mostly;
391 static atomic_t nr_task_events __read_mostly;
392 static atomic_t nr_freq_events __read_mostly;
393 static atomic_t nr_switch_events __read_mostly;
394
395 static LIST_HEAD(pmus);
396 static DEFINE_MUTEX(pmus_lock);
397 static struct srcu_struct pmus_srcu;
398 static cpumask_var_t perf_online_mask;
399
400 /*
401  * perf event paranoia level:
402  *  -1 - not paranoid at all
403  *   0 - disallow raw tracepoint access for unpriv
404  *   1 - disallow cpu events for unpriv
405  *   2 - disallow kernel profiling for unpriv
406  */
407 int sysctl_perf_event_paranoid __read_mostly = 2;
408
409 /* Minimum for 512 kiB + 1 user control page */
410 int sysctl_perf_event_mlock __read_mostly = 512 + (PAGE_SIZE / 1024); /* 'free' kiB per user */
411
412 /*
413  * max perf event sample rate
414  */
415 #define DEFAULT_MAX_SAMPLE_RATE         100000
416 #define DEFAULT_SAMPLE_PERIOD_NS        (NSEC_PER_SEC / DEFAULT_MAX_SAMPLE_RATE)
417 #define DEFAULT_CPU_TIME_MAX_PERCENT    25
418
419 int sysctl_perf_event_sample_rate __read_mostly = DEFAULT_MAX_SAMPLE_RATE;
420
421 static int max_samples_per_tick __read_mostly   = DIV_ROUND_UP(DEFAULT_MAX_SAMPLE_RATE, HZ);
422 static int perf_sample_period_ns __read_mostly  = DEFAULT_SAMPLE_PERIOD_NS;
423
424 static int perf_sample_allowed_ns __read_mostly =
425         DEFAULT_SAMPLE_PERIOD_NS * DEFAULT_CPU_TIME_MAX_PERCENT / 100;
426
427 static void update_perf_cpu_limits(void)
428 {
429         u64 tmp = perf_sample_period_ns;
430
431         tmp *= sysctl_perf_cpu_time_max_percent;
432         tmp = div_u64(tmp, 100);
433         if (!tmp)
434                 tmp = 1;
435
436         WRITE_ONCE(perf_sample_allowed_ns, tmp);
437 }
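/*
 * Worked example with the defaults in this file: perf_sample_period_ns
 * starts at NSEC_PER_SEC / 100000 = 10000ns and
 * sysctl_perf_cpu_time_max_percent at 25, so perf_sample_allowed_ns comes
 * out as 10000 * 25 / 100 = 2500ns, i.e. an average sample may consume at
 * most 2.5us of CPU before perf_sample_event_took() starts lowering the
 * sample rate.
 */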
438
439 static int perf_rotate_context(struct perf_cpu_context *cpuctx);
440
441 int perf_proc_update_handler(struct ctl_table *table, int write,
442                 void __user *buffer, size_t *lenp,
443                 loff_t *ppos)
444 {
445         int ret;
446         int perf_cpu = sysctl_perf_cpu_time_max_percent;
447         /*
448          * If throttling is disabled, don't allow the write:
449          */
450         if (write && (perf_cpu == 100 || perf_cpu == 0))
451                 return -EINVAL;
452
453         ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
454         if (ret || !write)
455                 return ret;
456
457         max_samples_per_tick = DIV_ROUND_UP(sysctl_perf_event_sample_rate, HZ);
458         perf_sample_period_ns = NSEC_PER_SEC / sysctl_perf_event_sample_rate;
459         update_perf_cpu_limits();
460
461         return 0;
462 }
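/*
 * Example of the derived values (assuming HZ == 250): writing
 * kernel.perf_event_max_sample_rate = 50000 gives
 * max_samples_per_tick = DIV_ROUND_UP(50000, 250) = 200 and
 * perf_sample_period_ns = NSEC_PER_SEC / 50000 = 20000ns, after which
 * update_perf_cpu_limits() rescales perf_sample_allowed_ns to match.
 */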
463
464 int sysctl_perf_cpu_time_max_percent __read_mostly = DEFAULT_CPU_TIME_MAX_PERCENT;
465
466 int perf_cpu_time_max_percent_handler(struct ctl_table *table, int write,
467                                 void __user *buffer, size_t *lenp,
468                                 loff_t *ppos)
469 {
470         int ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
471
472         if (ret || !write)
473                 return ret;
474
475         if (sysctl_perf_cpu_time_max_percent == 100 ||
476             sysctl_perf_cpu_time_max_percent == 0) {
477                 printk(KERN_WARNING
478                        "perf: Dynamic interrupt throttling disabled, can hang your system!\n");
479                 WRITE_ONCE(perf_sample_allowed_ns, 0);
480         } else {
481                 update_perf_cpu_limits();
482         }
483
484         return 0;
485 }
486
487 /*
488  * perf samples are done in some very critical code paths (NMIs).
489  * If they take too much CPU time, the system can lock up and not
490  * get any real work done.  This will drop the sample rate when
491  * we detect that events are taking too long.
492  */
493 #define NR_ACCUMULATED_SAMPLES 128
494 static DEFINE_PER_CPU(u64, running_sample_length);
495
496 static u64 __report_avg;
497 static u64 __report_allowed;
498
499 static void perf_duration_warn(struct irq_work *w)
500 {
501         printk_ratelimited(KERN_INFO
502                 "perf: interrupt took too long (%lld > %lld), lowering "
503                 "kernel.perf_event_max_sample_rate to %d\n",
504                 __report_avg, __report_allowed,
505                 sysctl_perf_event_sample_rate);
506 }
507
508 static DEFINE_IRQ_WORK(perf_duration_work, perf_duration_warn);
509
510 void perf_sample_event_took(u64 sample_len_ns)
511 {
512         u64 max_len = READ_ONCE(perf_sample_allowed_ns);
513         u64 running_len;
514         u64 avg_len;
515         u32 max;
516
517         if (max_len == 0)
518                 return;
519
520         /* Decay the counter by 1 average sample. */
521         running_len = __this_cpu_read(running_sample_length);
522         running_len -= running_len/NR_ACCUMULATED_SAMPLES;
523         running_len += sample_len_ns;
524         __this_cpu_write(running_sample_length, running_len);
525
526         /*
527          * Note: this will be biased artificially low until we have
528          * seen NR_ACCUMULATED_SAMPLES. Doing it this way keeps us
529          * from having to maintain a count.
530          */
531         avg_len = running_len/NR_ACCUMULATED_SAMPLES;
532         if (avg_len <= max_len)
533                 return;
534
535         __report_avg = avg_len;
536         __report_allowed = max_len;
537
538         /*
539          * Compute a new throttle threshold 25% above the current average duration.
540          */
541         avg_len += avg_len / 4;
542         max = (TICK_NSEC / 100) * sysctl_perf_cpu_time_max_percent;
543         if (avg_len < max)
544                 max /= (u32)avg_len;
545         else
546                 max = 1;
547
548         WRITE_ONCE(perf_sample_allowed_ns, avg_len);
549         WRITE_ONCE(max_samples_per_tick, max);
550
551         sysctl_perf_event_sample_rate = max * HZ;
552         perf_sample_period_ns = NSEC_PER_SEC / sysctl_perf_event_sample_rate;
553
554         if (!irq_work_queue(&perf_duration_work)) {
555                 early_printk("perf: interrupt took too long (%lld > %lld), lowering "
556                              "kernel.perf_event_max_sample_rate to %d\n",
557                              __report_avg, __report_allowed,
558                              sysctl_perf_event_sample_rate);
559         }
560 }
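/*
 * The running length above behaves like an exponential moving average with
 * a 1/NR_ACCUMULATED_SAMPLES (1/128) decay: each call drops 1/128th of the
 * accumulated value and adds the new sample.  With a steady per-sample cost
 * of, say, 3000ns, running_len converges towards 128 * 3000 = 384000ns and
 * avg_len = running_len / 128 towards 3000ns, without a separate counter.
 */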
561
562 static atomic64_t perf_event_id;
563
564 static void cpu_ctx_sched_out(struct perf_cpu_context *cpuctx,
565                               enum event_type_t event_type);
566
567 static void cpu_ctx_sched_in(struct perf_cpu_context *cpuctx,
568                              enum event_type_t event_type,
569                              struct task_struct *task);
570
571 static void update_context_time(struct perf_event_context *ctx);
572 static u64 perf_event_time(struct perf_event *event);
573
574 void __weak perf_event_print_debug(void)        { }
575
576 extern __weak const char *perf_pmu_name(void)
577 {
578         return "pmu";
579 }
580
581 static inline u64 perf_clock(void)
582 {
583         return local_clock();
584 }
585
586 static inline u64 perf_event_clock(struct perf_event *event)
587 {
588         return event->clock();
589 }
590
591 #ifdef CONFIG_CGROUP_PERF
592
593 static inline bool
594 perf_cgroup_match(struct perf_event *event)
595 {
596         struct perf_event_context *ctx = event->ctx;
597         struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
598
599         /* @event doesn't care about cgroup */
600         if (!event->cgrp)
601                 return true;
602
603         /* wants specific cgroup scope but @cpuctx isn't associated with any */
604         if (!cpuctx->cgrp)
605                 return false;
606
607         /*
608          * Cgroup scoping is recursive.  An event enabled for a cgroup is
609          * also enabled for all its descendant cgroups.  If @cpuctx's
610          * cgroup is a descendant of @event's (the test covers identity
611          * case), it's a match.
612          */
613         return cgroup_is_descendant(cpuctx->cgrp->css.cgroup,
614                                     event->cgrp->css.cgroup);
615 }
616
617 static inline void perf_detach_cgroup(struct perf_event *event)
618 {
619         css_put(&event->cgrp->css);
620         event->cgrp = NULL;
621 }
622
623 static inline int is_cgroup_event(struct perf_event *event)
624 {
625         return event->cgrp != NULL;
626 }
627
628 static inline u64 perf_cgroup_event_time(struct perf_event *event)
629 {
630         struct perf_cgroup_info *t;
631
632         t = per_cpu_ptr(event->cgrp->info, event->cpu);
633         return t->time;
634 }
635
636 static inline void __update_cgrp_time(struct perf_cgroup *cgrp)
637 {
638         struct perf_cgroup_info *info;
639         u64 now;
640
641         now = perf_clock();
642
643         info = this_cpu_ptr(cgrp->info);
644
645         info->time += now - info->timestamp;
646         info->timestamp = now;
647 }
648
649 static inline void update_cgrp_time_from_cpuctx(struct perf_cpu_context *cpuctx)
650 {
651         struct perf_cgroup *cgrp = cpuctx->cgrp;
652         struct cgroup_subsys_state *css;
653
654         if (cgrp) {
655                 for (css = &cgrp->css; css; css = css->parent) {
656                         cgrp = container_of(css, struct perf_cgroup, css);
657                         __update_cgrp_time(cgrp);
658                 }
659         }
660 }
661
662 static inline void update_cgrp_time_from_event(struct perf_event *event)
663 {
664         struct perf_cgroup *cgrp;
665
666         /*
667          * ensure we access cgroup data only when needed and
668          * when we know the cgroup is pinned (css_get)
669          */
670         if (!is_cgroup_event(event))
671                 return;
672
673         cgrp = perf_cgroup_from_task(current, event->ctx);
674         /*
675          * Do not update time when cgroup is not active
676          */
677         if (cgroup_is_descendant(cgrp->css.cgroup, event->cgrp->css.cgroup))
678                 __update_cgrp_time(event->cgrp);
679 }
680
681 static inline void
682 perf_cgroup_set_timestamp(struct task_struct *task,
683                           struct perf_event_context *ctx)
684 {
685         struct perf_cgroup *cgrp;
686         struct perf_cgroup_info *info;
687         struct cgroup_subsys_state *css;
688
689         /*
690          * ctx->lock held by caller
691          * ensure we do not access cgroup data
692          * unless we have the cgroup pinned (css_get)
693          */
694         if (!task || !ctx->nr_cgroups)
695                 return;
696
697         cgrp = perf_cgroup_from_task(task, ctx);
698
699         for (css = &cgrp->css; css; css = css->parent) {
700                 cgrp = container_of(css, struct perf_cgroup, css);
701                 info = this_cpu_ptr(cgrp->info);
702                 info->timestamp = ctx->timestamp;
703         }
704 }
705
706 static DEFINE_PER_CPU(struct list_head, cgrp_cpuctx_list);
707
708 #define PERF_CGROUP_SWOUT       0x1 /* cgroup switch out every event */
709 #define PERF_CGROUP_SWIN        0x2 /* cgroup switch in events based on task */
710
711 /*
712  * reschedule events based on the cgroup constraint of task.
713  *
714  * mode SWOUT : schedule out everything
715  * mode SWIN : schedule in based on cgroup for next
716  */
717 static void perf_cgroup_switch(struct task_struct *task, int mode)
718 {
719         struct perf_cpu_context *cpuctx, *tmp;
720         struct list_head *list;
721         unsigned long flags;
722
723         /*
724          * Disable interrupts and preemption to prevent this CPU's
725          * cgrp_cpuctx_entry from changing under us.
726          */
727         local_irq_save(flags);
728
729         list = this_cpu_ptr(&cgrp_cpuctx_list);
730         list_for_each_entry_safe(cpuctx, tmp, list, cgrp_cpuctx_entry) {
731                 WARN_ON_ONCE(cpuctx->ctx.nr_cgroups == 0);
732
733                 perf_ctx_lock(cpuctx, cpuctx->task_ctx);
734                 perf_pmu_disable(cpuctx->ctx.pmu);
735
736                 if (mode & PERF_CGROUP_SWOUT) {
737                         cpu_ctx_sched_out(cpuctx, EVENT_ALL);
738                         /*
739                          * must not be done before ctxswout due
740                          * to event_filter_match() in event_sched_out()
741                          */
742                         cpuctx->cgrp = NULL;
743                 }
744
745                 if (mode & PERF_CGROUP_SWIN) {
746                         WARN_ON_ONCE(cpuctx->cgrp);
747                         /*
748                          * set cgrp before ctxsw in to allow
749                          * event_filter_match() to not have to pass
750                          * task around.
751                          * We pass the cpuctx->ctx to perf_cgroup_from_task()
752                          * because cgroup events are only per-cpu.
753                          */
754                         cpuctx->cgrp = perf_cgroup_from_task(task,
755                                                              &cpuctx->ctx);
756                         cpu_ctx_sched_in(cpuctx, EVENT_ALL, task);
757                 }
758                 perf_pmu_enable(cpuctx->ctx.pmu);
759                 perf_ctx_unlock(cpuctx, cpuctx->task_ctx);
760         }
761
762         local_irq_restore(flags);
763 }
764
765 static inline void perf_cgroup_sched_out(struct task_struct *task,
766                                          struct task_struct *next)
767 {
768         struct perf_cgroup *cgrp1;
769         struct perf_cgroup *cgrp2 = NULL;
770
771         rcu_read_lock();
772         /*
773          * we come here when we know perf_cgroup_events > 0
774          * we do not need to pass the ctx here because we know
775          * we are holding the rcu lock
776          */
777         cgrp1 = perf_cgroup_from_task(task, NULL);
778         cgrp2 = perf_cgroup_from_task(next, NULL);
779
780         /*
781          * only schedule out current cgroup events if we know
782          * that we are switching to a different cgroup. Otherwise,
783          * do not touch the cgroup events.
784          */
785         if (cgrp1 != cgrp2)
786                 perf_cgroup_switch(task, PERF_CGROUP_SWOUT);
787
788         rcu_read_unlock();
789 }
790
791 static inline void perf_cgroup_sched_in(struct task_struct *prev,
792                                         struct task_struct *task)
793 {
794         struct perf_cgroup *cgrp1;
795         struct perf_cgroup *cgrp2 = NULL;
796
797         rcu_read_lock();
798         /*
799          * we come here when we know perf_cgroup_events > 0
800          * we do not need to pass the ctx here because we know
801          * we are holding the rcu lock
802          */
803         cgrp1 = perf_cgroup_from_task(task, NULL);
804         cgrp2 = perf_cgroup_from_task(prev, NULL);
805
806         /*
807          * only need to schedule in cgroup events if we are changing
808          * cgroup during ctxsw. Cgroup events were not scheduled
809          * out at context-switch-out time if that was not the case.
810          */
811         if (cgrp1 != cgrp2)
812                 perf_cgroup_switch(task, PERF_CGROUP_SWIN);
813
814         rcu_read_unlock();
815 }
816
817 static inline int perf_cgroup_connect(int fd, struct perf_event *event,
818                                       struct perf_event_attr *attr,
819                                       struct perf_event *group_leader)
820 {
821         struct perf_cgroup *cgrp;
822         struct cgroup_subsys_state *css;
823         struct fd f = fdget(fd);
824         int ret = 0;
825
826         if (!f.file)
827                 return -EBADF;
828
829         css = css_tryget_online_from_dir(f.file->f_path.dentry,
830                                          &perf_event_cgrp_subsys);
831         if (IS_ERR(css)) {
832                 ret = PTR_ERR(css);
833                 goto out;
834         }
835
836         cgrp = container_of(css, struct perf_cgroup, css);
837         event->cgrp = cgrp;
838
839         /*
840          * all events in a group must monitor
841          * the same cgroup because a task belongs
842          * to only one perf cgroup at a time
843          */
844         if (group_leader && group_leader->cgrp != cgrp) {
845                 perf_detach_cgroup(event);
846                 ret = -EINVAL;
847         }
848 out:
849         fdput(f);
850         return ret;
851 }
852
853 static inline void
854 perf_cgroup_set_shadow_time(struct perf_event *event, u64 now)
855 {
856         struct perf_cgroup_info *t;
857         t = per_cpu_ptr(event->cgrp->info, event->cpu);
858         event->shadow_ctx_time = now - t->timestamp;
859 }
860
861 static inline void
862 perf_cgroup_defer_enabled(struct perf_event *event)
863 {
864         /*
865          * when the current task's perf cgroup does not match
866          * the event's, we need to remember to call the
867          * perf_cgroup_mark_enabled() function the first time a task with
868          * a matching perf cgroup is scheduled in.
869          */
870         if (is_cgroup_event(event) && !perf_cgroup_match(event))
871                 event->cgrp_defer_enabled = 1;
872 }
873
874 static inline void
875 perf_cgroup_mark_enabled(struct perf_event *event,
876                          struct perf_event_context *ctx)
877 {
878         struct perf_event *sub;
879         u64 tstamp = perf_event_time(event);
880
881         if (!event->cgrp_defer_enabled)
882                 return;
883
884         event->cgrp_defer_enabled = 0;
885
886         event->tstamp_enabled = tstamp - event->total_time_enabled;
887         list_for_each_entry(sub, &event->sibling_list, group_entry) {
888                 if (sub->state >= PERF_EVENT_STATE_INACTIVE) {
889                         sub->tstamp_enabled = tstamp - sub->total_time_enabled;
890                         sub->cgrp_defer_enabled = 0;
891                 }
892         }
893 }
894
895 /*
896  * Update cpuctx->cgrp so that it is set when the first cgroup event is added
897  * and cleared when the last cgroup event is removed.
898  */
899 static inline void
900 list_update_cgroup_event(struct perf_event *event,
901                          struct perf_event_context *ctx, bool add)
902 {
903         struct perf_cpu_context *cpuctx;
904         struct list_head *cpuctx_entry;
905
906         if (!is_cgroup_event(event))
907                 return;
908
909         /*
910          * Because cgroup events are always per-cpu events,
911          * this will always be called from the right CPU.
912          */
913         cpuctx = __get_cpu_context(ctx);
914
915         /*
916          * Since setting cpuctx->cgrp is conditional on the current @cgrp
917          * matching the event's cgroup, we must do this for every new event,
918          * because if the first would mismatch, the second would not try again
919          * and we would leave cpuctx->cgrp unset.
920          */
921         if (add && !cpuctx->cgrp) {
922                 struct perf_cgroup *cgrp = perf_cgroup_from_task(current, ctx);
923
924                 if (cgroup_is_descendant(cgrp->css.cgroup, event->cgrp->css.cgroup))
925                         cpuctx->cgrp = cgrp;
926         }
927
928         if (add && ctx->nr_cgroups++)
929                 return;
930         else if (!add && --ctx->nr_cgroups)
931                 return;
932
933         /* no cgroup running */
934         if (!add)
935                 cpuctx->cgrp = NULL;
936
937         cpuctx_entry = &cpuctx->cgrp_cpuctx_entry;
938         if (add)
939                 list_add(cpuctx_entry, this_cpu_ptr(&cgrp_cpuctx_list));
940         else
941                 list_del(cpuctx_entry);
942 }
943
944 #else /* !CONFIG_CGROUP_PERF */
945
946 static inline bool
947 perf_cgroup_match(struct perf_event *event)
948 {
949         return true;
950 }
951
952 static inline void perf_detach_cgroup(struct perf_event *event)
953 {}
954
955 static inline int is_cgroup_event(struct perf_event *event)
956 {
957         return 0;
958 }
959
960 static inline void update_cgrp_time_from_event(struct perf_event *event)
961 {
962 }
963
964 static inline void update_cgrp_time_from_cpuctx(struct perf_cpu_context *cpuctx)
965 {
966 }
967
968 static inline void perf_cgroup_sched_out(struct task_struct *task,
969                                          struct task_struct *next)
970 {
971 }
972
973 static inline void perf_cgroup_sched_in(struct task_struct *prev,
974                                         struct task_struct *task)
975 {
976 }
977
978 static inline int perf_cgroup_connect(pid_t pid, struct perf_event *event,
979                                       struct perf_event_attr *attr,
980                                       struct perf_event *group_leader)
981 {
982         return -EINVAL;
983 }
984
985 static inline void
986 perf_cgroup_set_timestamp(struct task_struct *task,
987                           struct perf_event_context *ctx)
988 {
989 }
990
991 void
992 perf_cgroup_switch(struct task_struct *task, struct task_struct *next)
993 {
994 }
995
996 static inline void
997 perf_cgroup_set_shadow_time(struct perf_event *event, u64 now)
998 {
999 }
1000
1001 static inline u64 perf_cgroup_event_time(struct perf_event *event)
1002 {
1003         return 0;
1004 }
1005
1006 static inline void
1007 perf_cgroup_defer_enabled(struct perf_event *event)
1008 {
1009 }
1010
1011 static inline void
1012 perf_cgroup_mark_enabled(struct perf_event *event,
1013                          struct perf_event_context *ctx)
1014 {
1015 }
1016
1017 static inline void
1018 list_update_cgroup_event(struct perf_event *event,
1019                          struct perf_event_context *ctx, bool add)
1020 {
1021 }
1022
1023 #endif
1024
1025 /*
1026  * set default to be dependent on timer tick just
1027  * like original code
1028  */
1029 #define PERF_CPU_HRTIMER (1000 / HZ)
1030 /*
1031  * function must be called with interrupts disabled
1032  */
1033 static enum hrtimer_restart perf_mux_hrtimer_handler(struct hrtimer *hr)
1034 {
1035         struct perf_cpu_context *cpuctx;
1036         int rotations = 0;
1037
1038         WARN_ON(!irqs_disabled());
1039
1040         cpuctx = container_of(hr, struct perf_cpu_context, hrtimer);
1041         rotations = perf_rotate_context(cpuctx);
1042
1043         raw_spin_lock(&cpuctx->hrtimer_lock);
1044         if (rotations)
1045                 hrtimer_forward_now(hr, cpuctx->hrtimer_interval);
1046         else
1047                 cpuctx->hrtimer_active = 0;
1048         raw_spin_unlock(&cpuctx->hrtimer_lock);
1049
1050         return rotations ? HRTIMER_RESTART : HRTIMER_NORESTART;
1051 }
1052
1053 static void __perf_mux_hrtimer_init(struct perf_cpu_context *cpuctx, int cpu)
1054 {
1055         struct hrtimer *timer = &cpuctx->hrtimer;
1056         struct pmu *pmu = cpuctx->ctx.pmu;
1057         u64 interval;
1058
1059         /* no multiplexing needed for SW PMU */
1060         if (pmu->task_ctx_nr == perf_sw_context)
1061                 return;
1062
1063         /*
1064          * check that the default is sane; if not set, force the
1065          * default interval (1/tick)
1066          */
1067         interval = pmu->hrtimer_interval_ms;
1068         if (interval < 1)
1069                 interval = pmu->hrtimer_interval_ms = PERF_CPU_HRTIMER;
1070
1071         cpuctx->hrtimer_interval = ns_to_ktime(NSEC_PER_MSEC * interval);
1072
1073         raw_spin_lock_init(&cpuctx->hrtimer_lock);
1074         hrtimer_init(timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_PINNED);
1075         timer->function = perf_mux_hrtimer_handler;
1076 }
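/*
 * Example of the default above (assuming HZ == 250): PERF_CPU_HRTIMER is
 * 1000 / 250 = 4, so a PMU that leaves hrtimer_interval_ms at 0 gets a 4ms
 * multiplexing interval, i.e. one rotation opportunity per scheduler tick.
 */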
1077
1078 static int perf_mux_hrtimer_restart(struct perf_cpu_context *cpuctx)
1079 {
1080         struct hrtimer *timer = &cpuctx->hrtimer;
1081         struct pmu *pmu = cpuctx->ctx.pmu;
1082         unsigned long flags;
1083
1084         /* not for SW PMU */
1085         if (pmu->task_ctx_nr == perf_sw_context)
1086                 return 0;
1087
1088         raw_spin_lock_irqsave(&cpuctx->hrtimer_lock, flags);
1089         if (!cpuctx->hrtimer_active) {
1090                 cpuctx->hrtimer_active = 1;
1091                 hrtimer_forward_now(timer, cpuctx->hrtimer_interval);
1092                 hrtimer_start_expires(timer, HRTIMER_MODE_ABS_PINNED);
1093         }
1094         raw_spin_unlock_irqrestore(&cpuctx->hrtimer_lock, flags);
1095
1096         return 0;
1097 }
1098
1099 static int perf_mux_hrtimer_restart_ipi(void *arg)
1100 {
1101         return perf_mux_hrtimer_restart(arg);
1102 }
1103
1104 void perf_pmu_disable(struct pmu *pmu)
1105 {
1106         int *count = this_cpu_ptr(pmu->pmu_disable_count);
1107         if (!(*count)++)
1108                 pmu->pmu_disable(pmu);
1109 }
1110
1111 void perf_pmu_enable(struct pmu *pmu)
1112 {
1113         int *count = this_cpu_ptr(pmu->pmu_disable_count);
1114         if (!--(*count))
1115                 pmu->pmu_enable(pmu);
1116 }
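/*
 * perf_pmu_disable()/perf_pmu_enable() nest through the per-cpu
 * pmu_disable_count: only the outermost disable and the matching final
 * enable reach the pmu callbacks, so sections can be bracketed freely, as
 * perf_cgroup_switch() above does:
 *
 *        perf_pmu_disable(cpuctx->ctx.pmu);
 *        ... reschedule events ...
 *        perf_pmu_enable(cpuctx->ctx.pmu);
 */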
1117
1118 static DEFINE_PER_CPU(struct list_head, active_ctx_list);
1119
1120 /*
1121  * perf_event_ctx_activate(), perf_event_ctx_deactivate(), and
1122  * perf_event_task_tick() are fully serialized because they're strictly cpu
1123  * affine and perf_event_ctx{activate,deactivate} are called with IRQs
1124  * disabled, while perf_event_task_tick is called from IRQ context.
1125  */
1126 static void perf_event_ctx_activate(struct perf_event_context *ctx)
1127 {
1128         struct list_head *head = this_cpu_ptr(&active_ctx_list);
1129
1130         WARN_ON(!irqs_disabled());
1131
1132         WARN_ON(!list_empty(&ctx->active_ctx_list));
1133
1134         list_add(&ctx->active_ctx_list, head);
1135 }
1136
1137 static void perf_event_ctx_deactivate(struct perf_event_context *ctx)
1138 {
1139         WARN_ON(!irqs_disabled());
1140
1141         WARN_ON(list_empty(&ctx->active_ctx_list));
1142
1143         list_del_init(&ctx->active_ctx_list);
1144 }
1145
1146 static void get_ctx(struct perf_event_context *ctx)
1147 {
1148         WARN_ON(!atomic_inc_not_zero(&ctx->refcount));
1149 }
1150
1151 static void free_ctx(struct rcu_head *head)
1152 {
1153         struct perf_event_context *ctx;
1154
1155         ctx = container_of(head, struct perf_event_context, rcu_head);
1156         kfree(ctx->task_ctx_data);
1157         kfree(ctx);
1158 }
1159
1160 static void put_ctx(struct perf_event_context *ctx)
1161 {
1162         if (atomic_dec_and_test(&ctx->refcount)) {
1163                 if (ctx->parent_ctx)
1164                         put_ctx(ctx->parent_ctx);
1165                 if (ctx->task && ctx->task != TASK_TOMBSTONE)
1166                         put_task_struct(ctx->task);
1167                 call_rcu(&ctx->rcu_head, free_ctx);
1168         }
1169 }
1170
1171 /*
1172  * Because of perf_event::ctx migration in sys_perf_event_open::move_group and
1173  * perf_pmu_migrate_context() we need some magic.
1174  *
1175  * Those places that change perf_event::ctx will hold both
1176  * perf_event_ctx::mutex of the 'old' and 'new' ctx value.
1177  *
1178  * Lock ordering is by mutex address. There are two other sites where
1179  * perf_event_context::mutex nests and those are:
1180  *
1181  *  - perf_event_exit_task_context()    [ child , 0 ]
1182  *      perf_event_exit_event()
1183  *        put_event()                   [ parent, 1 ]
1184  *
1185  *  - perf_event_init_context()         [ parent, 0 ]
1186  *      inherit_task_group()
1187  *        inherit_group()
1188  *          inherit_event()
1189  *            perf_event_alloc()
1190  *              perf_init_event()
1191  *                perf_try_init_event() [ child , 1 ]
1192  *
1193  * While it appears there is an obvious deadlock here (the parent and child
1194  * nesting levels are inverted between the two), this is in fact safe because
1195  * life-time rules separate them: an exiting task cannot fork, and a
1196  * spawning task cannot (yet) exit.
1197  *
1198  * But remember that these are parent<->child context relations, and
1199  * migration does not affect children, therefore these two orderings should not
1200  * interact.
1201  *
1202  * The change in perf_event::ctx does not affect children (as claimed above)
1203  * because the sys_perf_event_open() case will install a new event and break
1204  * the ctx parent<->child relation, and perf_pmu_migrate_context() is only
1205  * concerned with cpuctx and that doesn't have children.
1206  *
1207  * The places that change perf_event::ctx will issue:
1208  *
1209  *   perf_remove_from_context();
1210  *   synchronize_rcu();
1211  *   perf_install_in_context();
1212  *
1213  * to effect the change. The remove_from_context() + synchronize_rcu() should
1214  * quiesce the event, after which we can install it in the new location. This
1215  * means that only external vectors (perf_fops, prctl) can perturb the event
1216  * while in transit. Therefore all such accessors should also acquire
1217  * perf_event_context::mutex to serialize against this.
1218  *
1219  * However; because event->ctx can change while we're waiting to acquire
1220  * ctx->mutex we must be careful and use the below perf_event_ctx_lock()
1221  * function.
1222  *
1223  * Lock order:
1224  *    cred_guard_mutex
1225  *      task_struct::perf_event_mutex
1226  *        perf_event_context::mutex
1227  *          perf_event::child_mutex;
1228  *            perf_event_context::lock
1229  *          perf_event::mmap_mutex
1230  *          mmap_sem
1231  */
1232 static struct perf_event_context *
1233 perf_event_ctx_lock_nested(struct perf_event *event, int nesting)
1234 {
1235         struct perf_event_context *ctx;
1236
1237 again:
1238         rcu_read_lock();
1239         ctx = ACCESS_ONCE(event->ctx);
1240         if (!atomic_inc_not_zero(&ctx->refcount)) {
1241                 rcu_read_unlock();
1242                 goto again;
1243         }
1244         rcu_read_unlock();
1245
1246         mutex_lock_nested(&ctx->mutex, nesting);
1247         if (event->ctx != ctx) {
1248                 mutex_unlock(&ctx->mutex);
1249                 put_ctx(ctx);
1250                 goto again;
1251         }
1252
1253         return ctx;
1254 }
1255
1256 static inline struct perf_event_context *
1257 perf_event_ctx_lock(struct perf_event *event)
1258 {
1259         return perf_event_ctx_lock_nested(event, 0);
1260 }
1261
1262 static void perf_event_ctx_unlock(struct perf_event *event,
1263                                   struct perf_event_context *ctx)
1264 {
1265         mutex_unlock(&ctx->mutex);
1266         put_ctx(ctx);
1267 }
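/*
 * Sketch of the intended use of the two helpers above (this is how the
 * external accessors mentioned in the locking comment, e.g. the
 * perf_fops/prctl paths, are expected to serialize):
 *
 *        ctx = perf_event_ctx_lock(event);
 *        ... event->ctx can neither change nor be freed here ...
 *        perf_event_ctx_unlock(event, ctx);
 */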
1268
1269 /*
1270  * This must be done under the ctx->lock, such as to serialize against
1271  * context_equiv(), therefore we cannot call put_ctx() since that might end up
1272  * calling scheduler related locks and ctx->lock nests inside those.
1273  */
1274 static __must_check struct perf_event_context *
1275 unclone_ctx(struct perf_event_context *ctx)
1276 {
1277         struct perf_event_context *parent_ctx = ctx->parent_ctx;
1278
1279         lockdep_assert_held(&ctx->lock);
1280
1281         if (parent_ctx)
1282                 ctx->parent_ctx = NULL;
1283         ctx->generation++;
1284
1285         return parent_ctx;
1286 }
1287
1288 static u32 perf_event_pid_type(struct perf_event *event, struct task_struct *p,
1289                                 enum pid_type type)
1290 {
1291         u32 nr;
1292         /*
1293          * only top level events have the pid namespace they were created in
1294          */
1295         if (event->parent)
1296                 event = event->parent;
1297
1298         nr = __task_pid_nr_ns(p, type, event->ns);
1299         /* avoid -1 if it is idle thread or runs in another ns */
1300         if (!nr && !pid_alive(p))
1301                 nr = -1;
1302         return nr;
1303 }
1304
1305 static u32 perf_event_pid(struct perf_event *event, struct task_struct *p)
1306 {
1307         return perf_event_pid_type(event, p, __PIDTYPE_TGID);
1308 }
1309
1310 static u32 perf_event_tid(struct perf_event *event, struct task_struct *p)
1311 {
1312         return perf_event_pid_type(event, p, PIDTYPE_PID);
1313 }
1314
1315 /*
1316  * If we inherit events we want to return the parent event id
1317  * to userspace.
1318  */
1319 static u64 primary_event_id(struct perf_event *event)
1320 {
1321         u64 id = event->id;
1322
1323         if (event->parent)
1324                 id = event->parent->id;
1325
1326         return id;
1327 }
1328
1329 /*
1330  * Get the perf_event_context for a task and lock it.
1331  *
1332  * This has to cope with the fact that until it is locked,
1333  * the context could get moved to another task.
1334  */
1335 static struct perf_event_context *
1336 perf_lock_task_context(struct task_struct *task, int ctxn, unsigned long *flags)
1337 {
1338         struct perf_event_context *ctx;
1339
1340 retry:
1341         /*
1342          * One of the few rules of preemptible RCU is that one cannot do
1343          * rcu_read_unlock() while holding a scheduler (or nested) lock when
1344          * part of the read side critical section was irqs-enabled -- see
1345          * rcu_read_unlock_special().
1346          *
1347          * Since ctx->lock nests under rq->lock we must ensure the entire read
1348          * side critical section has interrupts disabled.
1349          */
1350         local_irq_save(*flags);
1351         rcu_read_lock();
1352         ctx = rcu_dereference(task->perf_event_ctxp[ctxn]);
1353         if (ctx) {
1354                 /*
1355                  * If this context is a clone of another, it might
1356                  * get swapped for another underneath us by
1357                  * perf_event_task_sched_out, though the
1358                  * rcu_read_lock() protects us from any context
1359                  * getting freed.  Lock the context and check if it
1360                  * got swapped before we could get the lock, and retry
1361                  * if so.  If we locked the right context, then it
1362                  * can't get swapped on us any more.
1363                  */
1364                 raw_spin_lock(&ctx->lock);
1365                 if (ctx != rcu_dereference(task->perf_event_ctxp[ctxn])) {
1366                         raw_spin_unlock(&ctx->lock);
1367                         rcu_read_unlock();
1368                         local_irq_restore(*flags);
1369                         goto retry;
1370                 }
1371
1372                 if (ctx->task == TASK_TOMBSTONE ||
1373                     !atomic_inc_not_zero(&ctx->refcount)) {
1374                         raw_spin_unlock(&ctx->lock);
1375                         ctx = NULL;
1376                 } else {
1377                         WARN_ON_ONCE(ctx->task != task);
1378                 }
1379         }
1380         rcu_read_unlock();
1381         if (!ctx)
1382                 local_irq_restore(*flags);
1383         return ctx;
1384 }
1385
1386 /*
1387  * Get the context for a task and increment its pin_count so it
1388  * can't get swapped to another task.  This also increments its
1389  * reference count so that the context can't get freed.
1390  */
1391 static struct perf_event_context *
1392 perf_pin_task_context(struct task_struct *task, int ctxn)
1393 {
1394         struct perf_event_context *ctx;
1395         unsigned long flags;
1396
1397         ctx = perf_lock_task_context(task, ctxn, &flags);
1398         if (ctx) {
1399                 ++ctx->pin_count;
1400                 raw_spin_unlock_irqrestore(&ctx->lock, flags);
1401         }
1402         return ctx;
1403 }
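/*
 * Sketch of the pin/unpin pairing: the pinned context also carries the
 * reference taken in perf_lock_task_context(), so a caller eventually
 * drops both:
 *
 *        ctx = perf_pin_task_context(task, ctxn);
 *        if (ctx) {
 *                ... ctx cannot be swapped to another task here ...
 *                perf_unpin_context(ctx);
 *                put_ctx(ctx);
 *        }
 */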
1404
1405 static void perf_unpin_context(struct perf_event_context *ctx)
1406 {
1407         unsigned long flags;
1408
1409         raw_spin_lock_irqsave(&ctx->lock, flags);
1410         --ctx->pin_count;
1411         raw_spin_unlock_irqrestore(&ctx->lock, flags);
1412 }
1413
1414 /*
1415  * Update the record of the current time in a context.
1416  */
1417 static void update_context_time(struct perf_event_context *ctx)
1418 {
1419         u64 now = perf_clock();
1420
1421         ctx->time += now - ctx->timestamp;
1422         ctx->timestamp = now;
1423 }
1424
1425 static u64 perf_event_time(struct perf_event *event)
1426 {
1427         struct perf_event_context *ctx = event->ctx;
1428
1429         if (is_cgroup_event(event))
1430                 return perf_cgroup_event_time(event);
1431
1432         return ctx ? ctx->time : 0;
1433 }
1434
1435 /*
1436  * Update the total_time_enabled and total_time_running fields for an event.
1437  */
1438 static void update_event_times(struct perf_event *event)
1439 {
1440         struct perf_event_context *ctx = event->ctx;
1441         u64 run_end;
1442
1443         lockdep_assert_held(&ctx->lock);
1444
1445         if (event->state < PERF_EVENT_STATE_INACTIVE ||
1446             event->group_leader->state < PERF_EVENT_STATE_INACTIVE)
1447                 return;
1448
1449         /*
1450          * in cgroup mode, time_enabled represents
1451          * the time the event was enabled AND active
1452          * tasks were in the monitored cgroup. This is
1453          * independent of the activity of the context as
1454          * there may be a mix of cgroup and non-cgroup events.
1455          *
1456          * That is why we treat cgroup events differently
1457          * here.
1458          */
1459         if (is_cgroup_event(event))
1460                 run_end = perf_cgroup_event_time(event);
1461         else if (ctx->is_active)
1462                 run_end = ctx->time;
1463         else
1464                 run_end = event->tstamp_stopped;
1465
1466         event->total_time_enabled = run_end - event->tstamp_enabled;
1467
1468         if (event->state == PERF_EVENT_STATE_INACTIVE)
1469                 run_end = event->tstamp_stopped;
1470         else
1471                 run_end = perf_event_time(event);
1472
1473         event->total_time_running = run_end - event->tstamp_running;
1474
1475 }
1476
1477 /*
1478  * Update total_time_enabled and total_time_running for all events in a group.
1479  */
1480 static void update_group_times(struct perf_event *leader)
1481 {
1482         struct perf_event *event;
1483
1484         update_event_times(leader);
1485         list_for_each_entry(event, &leader->sibling_list, group_entry)
1486                 update_event_times(event);
1487 }
1488
1489 static enum event_type_t get_event_type(struct perf_event *event)
1490 {
1491         struct perf_event_context *ctx = event->ctx;
1492         enum event_type_t event_type;
1493
1494         lockdep_assert_held(&ctx->lock);
1495
1496         /*
1497          * It's 'group type', really, because if our group leader is
1498          * pinned, so are we.
1499          */
1500         if (event->group_leader != event)
1501                 event = event->group_leader;
1502
1503         event_type = event->attr.pinned ? EVENT_PINNED : EVENT_FLEXIBLE;
1504         if (!ctx->task)
1505                 event_type |= EVENT_CPU;
1506
1507         return event_type;
1508 }
1509
1510 static struct list_head *
1511 ctx_group_list(struct perf_event *event, struct perf_event_context *ctx)
1512 {
1513         if (event->attr.pinned)
1514                 return &ctx->pinned_groups;
1515         else
1516                 return &ctx->flexible_groups;
1517 }
1518
1519 /*
1520  * Add an event to the lists for its context.
1521  * Must be called with ctx->mutex and ctx->lock held.
1522  */
1523 static void
1524 list_add_event(struct perf_event *event, struct perf_event_context *ctx)
1525 {
1526         lockdep_assert_held(&ctx->lock);
1527
1528         WARN_ON_ONCE(event->attach_state & PERF_ATTACH_CONTEXT);
1529         event->attach_state |= PERF_ATTACH_CONTEXT;
1530
1531         /*
1532          * If we're a standalone event or group leader, we go to the context
1533          * list; group events are kept attached to the group so that
1534          * perf_group_detach() can, at all times, locate all siblings.
1535          */
1536         if (event->group_leader == event) {
1537                 struct list_head *list;
1538
1539                 event->group_caps = event->event_caps;
1540
1541                 list = ctx_group_list(event, ctx);
1542                 list_add_tail(&event->group_entry, list);
1543         }
1544
1545         list_update_cgroup_event(event, ctx, true);
1546
1547         list_add_rcu(&event->event_entry, &ctx->event_list);
1548         ctx->nr_events++;
1549         if (event->attr.inherit_stat)
1550                 ctx->nr_stat++;
1551
1552         ctx->generation++;
1553 }
1554
1555 /*
1556  * Initialize event state based on the perf_event_attr::disabled.
1557  */
1558 static inline void perf_event__state_init(struct perf_event *event)
1559 {
1560         event->state = event->attr.disabled ? PERF_EVENT_STATE_OFF :
1561                                               PERF_EVENT_STATE_INACTIVE;
1562 }
1563
1564 static void __perf_event_read_size(struct perf_event *event, int nr_siblings)
1565 {
1566         int entry = sizeof(u64); /* value */
1567         int size = 0;
1568         int nr = 1;
1569
1570         if (event->attr.read_format & PERF_FORMAT_TOTAL_TIME_ENABLED)
1571                 size += sizeof(u64);
1572
1573         if (event->attr.read_format & PERF_FORMAT_TOTAL_TIME_RUNNING)
1574                 size += sizeof(u64);
1575
1576         if (event->attr.read_format & PERF_FORMAT_ID)
1577                 entry += sizeof(u64);
1578
1579         if (event->attr.read_format & PERF_FORMAT_GROUP) {
1580                 nr += nr_siblings;
1581                 size += sizeof(u64);
1582         }
1583
1584         size += entry * nr;
1585         event->read_size = size;
1586 }
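/*
 * Worked example: for a group leader with two siblings and
 * read_format = PERF_FORMAT_GROUP | PERF_FORMAT_ID, each entry is
 * sizeof(u64) value + sizeof(u64) id = 16 bytes, nr = 1 + 2 = 3 and the
 * group header adds one more u64, so read_size = 8 + 3 * 16 = 56 bytes.
 */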
1587
1588 static void __perf_event_header_size(struct perf_event *event, u64 sample_type)
1589 {
1590         struct perf_sample_data *data;
1591         u16 size = 0;
1592
1593         if (sample_type & PERF_SAMPLE_IP)
1594                 size += sizeof(data->ip);
1595
1596         if (sample_type & PERF_SAMPLE_ADDR)
1597                 size += sizeof(data->addr);
1598
1599         if (sample_type & PERF_SAMPLE_PERIOD)
1600                 size += sizeof(data->period);
1601
1602         if (sample_type & PERF_SAMPLE_WEIGHT)
1603                 size += sizeof(data->weight);
1604
1605         if (sample_type & PERF_SAMPLE_READ)
1606                 size += event->read_size;
1607
1608         if (sample_type & PERF_SAMPLE_DATA_SRC)
1609                 size += sizeof(data->data_src.val);
1610
1611         if (sample_type & PERF_SAMPLE_TRANSACTION)
1612                 size += sizeof(data->txn);
1613
1614         if (sample_type & PERF_SAMPLE_PHYS_ADDR)
1615                 size += sizeof(data->phys_addr);
1616
1617         event->header_size = size;
1618 }
1619
1620 /*
1621  * Called at perf_event creation and when events are attached/detached from a
1622  * group.
1623  */
1624 static void perf_event__header_size(struct perf_event *event)
1625 {
1626         __perf_event_read_size(event,
1627                                event->group_leader->nr_siblings);
1628         __perf_event_header_size(event, event->attr.sample_type);
1629 }
1630
1631 static void perf_event__id_header_size(struct perf_event *event)
1632 {
1633         struct perf_sample_data *data;
1634         u64 sample_type = event->attr.sample_type;
1635         u16 size = 0;
1636
1637         if (sample_type & PERF_SAMPLE_TID)
1638                 size += sizeof(data->tid_entry);
1639
1640         if (sample_type & PERF_SAMPLE_TIME)
1641                 size += sizeof(data->time);
1642
1643         if (sample_type & PERF_SAMPLE_IDENTIFIER)
1644                 size += sizeof(data->id);
1645
1646         if (sample_type & PERF_SAMPLE_ID)
1647                 size += sizeof(data->id);
1648
1649         if (sample_type & PERF_SAMPLE_STREAM_ID)
1650                 size += sizeof(data->stream_id);
1651
1652         if (sample_type & PERF_SAMPLE_CPU)
1653                 size += sizeof(data->cpu_entry);
1654
1655         event->id_header_size = size;
1656 }
1657
1658 static bool perf_event_validate_size(struct perf_event *event)
1659 {
1660         /*
1661          * The values computed here will be overwritten when we actually
1662          * attach the event.
1663          */
1664         __perf_event_read_size(event, event->group_leader->nr_siblings + 1);
1665         __perf_event_header_size(event, event->attr.sample_type & ~PERF_SAMPLE_READ);
1666         perf_event__id_header_size(event);
1667
1668         /*
1669          * Sum the lot; should not exceed the 64k limit we have on records.
1670          * Conservative limit to allow for callchains and other variable fields.
1671          */
1672         if (event->read_size + event->header_size +
1673             event->id_header_size + sizeof(struct perf_event_header) >= 16*1024)
1674                 return false;
1675
1676         return true;
1677 }
1678
1679 static void perf_group_attach(struct perf_event *event)
1680 {
1681         struct perf_event *group_leader = event->group_leader, *pos;
1682
1683         lockdep_assert_held(&event->ctx->lock);
1684
1685         /*
1686          * We can have double attach due to group movement in perf_event_open.
1687          */
1688         if (event->attach_state & PERF_ATTACH_GROUP)
1689                 return;
1690
1691         event->attach_state |= PERF_ATTACH_GROUP;
1692
1693         if (group_leader == event)
1694                 return;
1695
1696         WARN_ON_ONCE(group_leader->ctx != event->ctx);
1697
1698         group_leader->group_caps &= event->event_caps;
1699
1700         list_add_tail(&event->group_entry, &group_leader->sibling_list);
1701         group_leader->nr_siblings++;
1702
1703         perf_event__header_size(group_leader);
1704
1705         list_for_each_entry(pos, &group_leader->sibling_list, group_entry)
1706                 perf_event__header_size(pos);
1707 }
1708
1709 /*
1710  * Remove an event from the lists for its context.
1711  * Must be called with ctx->mutex and ctx->lock held.
1712  */
1713 static void
1714 list_del_event(struct perf_event *event, struct perf_event_context *ctx)
1715 {
1716         WARN_ON_ONCE(event->ctx != ctx);
1717         lockdep_assert_held(&ctx->lock);
1718
1719         /*
1720          * We can have double detach due to exit/hot-unplug + close.
1721          */
1722         if (!(event->attach_state & PERF_ATTACH_CONTEXT))
1723                 return;
1724
1725         event->attach_state &= ~PERF_ATTACH_CONTEXT;
1726
1727         list_update_cgroup_event(event, ctx, false);
1728
1729         ctx->nr_events--;
1730         if (event->attr.inherit_stat)
1731                 ctx->nr_stat--;
1732
1733         list_del_rcu(&event->event_entry);
1734
1735         if (event->group_leader == event)
1736                 list_del_init(&event->group_entry);
1737
1738         update_group_times(event);
1739
1740         /*
1741          * If event was in error state, then keep it
1742          * that way, otherwise bogus counts will be
1743          * returned on read(). The only way to get out
1744          * of error state is by explicit re-enabling
1745          * of the event
1746          */
1747         if (event->state > PERF_EVENT_STATE_OFF)
1748                 event->state = PERF_EVENT_STATE_OFF;
1749
1750         ctx->generation++;
1751 }
1752
1753 static void perf_group_detach(struct perf_event *event)
1754 {
1755         struct perf_event *sibling, *tmp;
1756         struct list_head *list = NULL;
1757
1758         lockdep_assert_held(&event->ctx->lock);
1759
1760         /*
1761          * We can have double detach due to exit/hot-unplug + close.
1762          */
1763         if (!(event->attach_state & PERF_ATTACH_GROUP))
1764                 return;
1765
1766         event->attach_state &= ~PERF_ATTACH_GROUP;
1767
1768         /*
1769          * If this is a sibling, remove it from its group.
1770          */
1771         if (event->group_leader != event) {
1772                 list_del_init(&event->group_entry);
1773                 event->group_leader->nr_siblings--;
1774                 goto out;
1775         }
1776
1777         if (!list_empty(&event->group_entry))
1778                 list = &event->group_entry;
1779
1780         /*
1781          * If this was a group event with sibling events then
1782          * upgrade the siblings to singleton events by adding them
1783          * to whatever list we are on.
1784          */
1785         list_for_each_entry_safe(sibling, tmp, &event->sibling_list, group_entry) {
1786                 if (list)
1787                         list_move_tail(&sibling->group_entry, list);
1788                 sibling->group_leader = sibling;
1789
1790                 /* Inherit group flags from the previous leader */
1791                 sibling->group_caps = event->group_caps;
1792
1793                 WARN_ON_ONCE(sibling->ctx != event->ctx);
1794         }
1795
1796 out:
1797         perf_event__header_size(event->group_leader);
1798
1799         list_for_each_entry(tmp, &event->group_leader->sibling_list, group_entry)
1800                 perf_event__header_size(tmp);
1801 }
1802
1803 static bool is_orphaned_event(struct perf_event *event)
1804 {
1805         return event->state == PERF_EVENT_STATE_DEAD;
1806 }
1807
1808 static inline int __pmu_filter_match(struct perf_event *event)
1809 {
1810         struct pmu *pmu = event->pmu;
1811         return pmu->filter_match ? pmu->filter_match(event) : 1;
1812 }
1813
1814 /*
1815  * Check whether we should attempt to schedule an event group based on
1816  * PMU-specific filtering. An event group can consist of HW and SW events,
1817  * potentially with a SW leader, so we must check all the filters, to
1818  * determine whether a group is schedulable:
1819  */
1820 static inline int pmu_filter_match(struct perf_event *event)
1821 {
1822         struct perf_event *child;
1823
1824         if (!__pmu_filter_match(event))
1825                 return 0;
1826
1827         list_for_each_entry(child, &event->sibling_list, group_entry) {
1828                 if (!__pmu_filter_match(child))
1829                         return 0;
1830         }
1831
1832         return 1;
1833 }
1834
1835 static inline int
1836 event_filter_match(struct perf_event *event)
1837 {
1838         return (event->cpu == -1 || event->cpu == smp_processor_id()) &&
1839                perf_cgroup_match(event) && pmu_filter_match(event);
1840 }
1841
1842 static void
1843 event_sched_out(struct perf_event *event,
1844                   struct perf_cpu_context *cpuctx,
1845                   struct perf_event_context *ctx)
1846 {
1847         u64 tstamp = perf_event_time(event);
1848         u64 delta;
1849
1850         WARN_ON_ONCE(event->ctx != ctx);
1851         lockdep_assert_held(&ctx->lock);
1852
1853         /*
1854          * An event which could not be activated because of
1855          * filter mismatch still needs to have its timings
1856          * maintained, otherwise bogus information is returned
1857          * via read() for time_enabled, time_running:
1858          */
1859         if (event->state == PERF_EVENT_STATE_INACTIVE &&
1860             !event_filter_match(event)) {
1861                 delta = tstamp - event->tstamp_stopped;
1862                 event->tstamp_running += delta;
1863                 event->tstamp_stopped = tstamp;
1864         }
1865
1866         if (event->state != PERF_EVENT_STATE_ACTIVE)
1867                 return;
1868
1869         perf_pmu_disable(event->pmu);
1870
1871         event->tstamp_stopped = tstamp;
1872         event->pmu->del(event, 0);
1873         event->oncpu = -1;
1874         event->state = PERF_EVENT_STATE_INACTIVE;
1875         if (event->pending_disable) {
1876                 event->pending_disable = 0;
1877                 event->state = PERF_EVENT_STATE_OFF;
1878         }
1879
1880         if (!is_software_event(event))
1881                 cpuctx->active_oncpu--;
1882         if (!--ctx->nr_active)
1883                 perf_event_ctx_deactivate(ctx);
1884         if (event->attr.freq && event->attr.sample_freq)
1885                 ctx->nr_freq--;
1886         if (event->attr.exclusive || !cpuctx->active_oncpu)
1887                 cpuctx->exclusive = 0;
1888
1889         perf_pmu_enable(event->pmu);
1890 }
1891
1892 static void
1893 group_sched_out(struct perf_event *group_event,
1894                 struct perf_cpu_context *cpuctx,
1895                 struct perf_event_context *ctx)
1896 {
1897         struct perf_event *event;
1898         int state = group_event->state;
1899
1900         perf_pmu_disable(ctx->pmu);
1901
1902         event_sched_out(group_event, cpuctx, ctx);
1903
1904         /*
1905          * Schedule out siblings (if any):
1906          */
1907         list_for_each_entry(event, &group_event->sibling_list, group_entry)
1908                 event_sched_out(event, cpuctx, ctx);
1909
1910         perf_pmu_enable(ctx->pmu);
1911
1912         if (state == PERF_EVENT_STATE_ACTIVE && group_event->attr.exclusive)
1913                 cpuctx->exclusive = 0;
1914 }
1915
1916 #define DETACH_GROUP    0x01UL
1917
1918 /*
1919  * Cross CPU call to remove a performance event
1920  *
1921  * We disable the event on the hardware level first. After that we
1922  * remove it from the context list.
1923  */
1924 static void
1925 __perf_remove_from_context(struct perf_event *event,
1926                            struct perf_cpu_context *cpuctx,
1927                            struct perf_event_context *ctx,
1928                            void *info)
1929 {
1930         unsigned long flags = (unsigned long)info;
1931
1932         event_sched_out(event, cpuctx, ctx);
1933         if (flags & DETACH_GROUP)
1934                 perf_group_detach(event);
1935         list_del_event(event, ctx);
1936
1937         if (!ctx->nr_events && ctx->is_active) {
1938                 ctx->is_active = 0;
1939                 if (ctx->task) {
1940                         WARN_ON_ONCE(cpuctx->task_ctx != ctx);
1941                         cpuctx->task_ctx = NULL;
1942                 }
1943         }
1944 }
1945
1946 /*
1947  * Remove the event from a task's (or a CPU's) list of events.
1948  *
1949  * If event->ctx is a cloned context, callers must make sure that
1950  * every task struct that event->ctx->task could possibly point to
1951  * remains valid.  This is OK when called from perf_release since
1952  * that only calls us on the top-level context, which can't be a clone.
1953  * When called from perf_event_exit_task, it's OK because the
1954  * context has been detached from its task.
1955  */
1956 static void perf_remove_from_context(struct perf_event *event, unsigned long flags)
1957 {
1958         struct perf_event_context *ctx = event->ctx;
1959
1960         lockdep_assert_held(&ctx->mutex);
1961
1962         event_function_call(event, __perf_remove_from_context, (void *)flags);
1963
1964         /*
1965          * The above event_function_call() can NO-OP when it hits
1966          * TASK_TOMBSTONE. In that case we must already have been detached
1967          * from the context (by perf_event_exit_event()) but the grouping
1968          * might still be in-tact.
1969          */
1970         WARN_ON_ONCE(event->attach_state & PERF_ATTACH_CONTEXT);
1971         if ((flags & DETACH_GROUP) &&
1972             (event->attach_state & PERF_ATTACH_GROUP)) {
1973                 /*
1974                  * Since in that case we cannot possibly be scheduled, simply
1975                  * detach now.
1976                  */
1977                 raw_spin_lock_irq(&ctx->lock);
1978                 perf_group_detach(event);
1979                 raw_spin_unlock_irq(&ctx->lock);
1980         }
1981 }
1982
1983 /*
1984  * Cross CPU call to disable a performance event
1985  */
1986 static void __perf_event_disable(struct perf_event *event,
1987                                  struct perf_cpu_context *cpuctx,
1988                                  struct perf_event_context *ctx,
1989                                  void *info)
1990 {
1991         if (event->state < PERF_EVENT_STATE_INACTIVE)
1992                 return;
1993
1994         update_context_time(ctx);
1995         update_cgrp_time_from_event(event);
1996         update_group_times(event);
1997         if (event == event->group_leader)
1998                 group_sched_out(event, cpuctx, ctx);
1999         else
2000                 event_sched_out(event, cpuctx, ctx);
2001         event->state = PERF_EVENT_STATE_OFF;
2002 }
2003
2004 /*
2005  * Disable an event.
2006  *
2007  * If event->ctx is a cloned context, callers must make sure that
2008  * every task struct that event->ctx->task could possibly point to
2009  * remains valid.  This condition is satisfied when called through
2010  * perf_event_for_each_child or perf_event_for_each because they
2011  * hold the top-level event's child_mutex, so any descendant that
2012  * goes to exit will block in perf_event_exit_event().
2013  *
2014  * When called from perf_pending_event it's OK because event->ctx
2015  * is the current context on this CPU and preemption is disabled,
2016  * hence we can't get into perf_event_task_sched_out for this context.
2017  */
2018 static void _perf_event_disable(struct perf_event *event)
2019 {
2020         struct perf_event_context *ctx = event->ctx;
2021
2022         raw_spin_lock_irq(&ctx->lock);
2023         if (event->state <= PERF_EVENT_STATE_OFF) {
2024                 raw_spin_unlock_irq(&ctx->lock);
2025                 return;
2026         }
2027         raw_spin_unlock_irq(&ctx->lock);
2028
2029         event_function_call(event, __perf_event_disable, NULL);
2030 }
2031
2032 void perf_event_disable_local(struct perf_event *event)
2033 {
2034         event_function_local(event, __perf_event_disable, NULL);
2035 }
2036
2037 /*
2038  * Strictly speaking kernel users cannot create groups and therefore this
2039  * interface does not need the perf_event_ctx_lock() magic.
2040  */
2041 void perf_event_disable(struct perf_event *event)
2042 {
2043         struct perf_event_context *ctx;
2044
2045         ctx = perf_event_ctx_lock(event);
2046         _perf_event_disable(event);
2047         perf_event_ctx_unlock(event, ctx);
2048 }
2049 EXPORT_SYMBOL_GPL(perf_event_disable);
2050
2051 void perf_event_disable_inatomic(struct perf_event *event)
2052 {
2053         event->pending_disable = 1;
2054         irq_work_queue(&event->pending);
2055 }
2056
2057 static void perf_set_shadow_time(struct perf_event *event,
2058                                  struct perf_event_context *ctx,
2059                                  u64 tstamp)
2060 {
2061         /*
2062          * use the correct time source for the time snapshot
2063          *
2064          * We could get by without this by leveraging the
2065          * fact that to get to this function, the caller
2066          * has most likely already called update_context_time()
2067          * and update_cgrp_time_xx() and thus both timestamps
2068          * are identical (or very close). Given that tstamp is
2069          * already adjusted for cgroup, we could say that:
2070          *    tstamp - ctx->timestamp
2071          * is equivalent to
2072          *    tstamp - cgrp->timestamp.
2073          *
2074          * Then, in perf_output_read(), the calculation would
2075          * work with no changes because:
2076          * - the event is guaranteed to be scheduled in
2077          * - it is not scheduled out in between
2078          * - thus the timestamp would be the same
2079          *
2080          * But this is a bit hairy.
2081          *
2082          * So instead, we have an explicit cgroup call to remain
2083          * within the time source all along. We believe it
2084          * is cleaner and simpler to understand.
2085          */
2086         if (is_cgroup_event(event))
2087                 perf_cgroup_set_shadow_time(event, tstamp);
2088         else
2089                 event->shadow_ctx_time = tstamp - ctx->timestamp;
2090 }
2091
2092 #define MAX_INTERRUPTS (~0ULL)
2093
2094 static void perf_log_throttle(struct perf_event *event, int enable);
2095 static void perf_log_itrace_start(struct perf_event *event);
2096
2097 static int
2098 event_sched_in(struct perf_event *event,
2099                  struct perf_cpu_context *cpuctx,
2100                  struct perf_event_context *ctx)
2101 {
2102         u64 tstamp = perf_event_time(event);
2103         int ret = 0;
2104
2105         lockdep_assert_held(&ctx->lock);
2106
2107         if (event->state <= PERF_EVENT_STATE_OFF)
2108                 return 0;
2109
2110         WRITE_ONCE(event->oncpu, smp_processor_id());
2111         /*
2112          * Order event::oncpu write to happen before the ACTIVE state
2113          * is visible.
2114          */
2115         smp_wmb();
2116         WRITE_ONCE(event->state, PERF_EVENT_STATE_ACTIVE);
2117
2118         /*
2119          * Unthrottle events: since we were just scheduled in, we might have
2120          * missed several ticks already, and for a task that schedules heavily
2121          * there is little guarantee it'll get a tick in a timely manner.
2122          */
2123         if (unlikely(event->hw.interrupts == MAX_INTERRUPTS)) {
2124                 perf_log_throttle(event, 1);
2125                 event->hw.interrupts = 0;
2126         }
2127
2128         /*
2129          * The new state must be visible before we turn it on in the hardware:
2130          */
2131         smp_wmb();
2132
2133         perf_pmu_disable(event->pmu);
2134
2135         perf_set_shadow_time(event, ctx, tstamp);
2136
2137         perf_log_itrace_start(event);
2138
2139         if (event->pmu->add(event, PERF_EF_START)) {
2140                 event->state = PERF_EVENT_STATE_INACTIVE;
2141                 event->oncpu = -1;
2142                 ret = -EAGAIN;
2143                 goto out;
2144         }
2145
2146         event->tstamp_running += tstamp - event->tstamp_stopped;
2147
2148         if (!is_software_event(event))
2149                 cpuctx->active_oncpu++;
2150         if (!ctx->nr_active++)
2151                 perf_event_ctx_activate(ctx);
2152         if (event->attr.freq && event->attr.sample_freq)
2153                 ctx->nr_freq++;
2154
2155         if (event->attr.exclusive)
2156                 cpuctx->exclusive = 1;
2157
2158 out:
2159         perf_pmu_enable(event->pmu);
2160
2161         return ret;
2162 }
2163
2164 static int
2165 group_sched_in(struct perf_event *group_event,
2166                struct perf_cpu_context *cpuctx,
2167                struct perf_event_context *ctx)
2168 {
2169         struct perf_event *event, *partial_group = NULL;
2170         struct pmu *pmu = ctx->pmu;
2171         u64 now = ctx->time;
2172         bool simulate = false;
2173
2174         if (group_event->state == PERF_EVENT_STATE_OFF)
2175                 return 0;
2176
2177         pmu->start_txn(pmu, PERF_PMU_TXN_ADD);
2178
2179         if (event_sched_in(group_event, cpuctx, ctx)) {
2180                 pmu->cancel_txn(pmu);
2181                 perf_mux_hrtimer_restart(cpuctx);
2182                 return -EAGAIN;
2183         }
2184
2185         /*
2186          * Schedule in siblings as one group (if any):
2187          */
2188         list_for_each_entry(event, &group_event->sibling_list, group_entry) {
2189                 if (event_sched_in(event, cpuctx, ctx)) {
2190                         partial_group = event;
2191                         goto group_error;
2192                 }
2193         }
2194
2195         if (!pmu->commit_txn(pmu))
2196                 return 0;
2197
2198 group_error:
2199         /*
2200          * Groups can be scheduled in as one unit only, so undo any
2201          * partial group before returning:
2202          * The events up to the failed event are scheduled out normally,
2203          * tstamp_stopped will be updated.
2204          *
2205          * The failed events and the remaining siblings need to have
2206          * their timings updated as if they had gone through event_sched_in()
2207          * and event_sched_out(). This is required to get consistent timings
2208          * across the group. This also takes care of the case where the group
2209          * could never be scheduled by ensuring tstamp_stopped is set to mark
2210          * the time the event was actually stopped, such that time delta
2211          * calculation in update_event_times() is correct.
2212          */
2213         list_for_each_entry(event, &group_event->sibling_list, group_entry) {
2214                 if (event == partial_group)
2215                         simulate = true;
2216
2217                 if (simulate) {
2218                         event->tstamp_running += now - event->tstamp_stopped;
2219                         event->tstamp_stopped = now;
2220                 } else {
2221                         event_sched_out(event, cpuctx, ctx);
2222                 }
2223         }
2224         event_sched_out(group_event, cpuctx, ctx);
2225
2226         pmu->cancel_txn(pmu);
2227
2228         perf_mux_hrtimer_restart(cpuctx);
2229
2230         return -EAGAIN;
2231 }
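/*
 * Example walk-through (editor's illustration, not part of the original
 * source): say the group has a leader plus two siblings but the PMU can only
 * accommodate two of them, so the second sibling's event_sched_in() fails.
 * partial_group then points at that sibling; the recovery loop schedules the
 * first sibling back out normally, only simulates the timestamp update for
 * the failed sibling (it never became ACTIVE), schedules the leader out, and
 * cancel_txn() rolls the PMU state back so the group remains all-or-nothing.
 */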
2232
2233 /*
2234  * Work out whether we can put this event group on the CPU now.
2235  */
2236 static int group_can_go_on(struct perf_event *event,
2237                            struct perf_cpu_context *cpuctx,
2238                            int can_add_hw)
2239 {
2240         /*
2241          * Groups consisting entirely of software events can always go on.
2242          */
2243         if (event->group_caps & PERF_EV_CAP_SOFTWARE)
2244                 return 1;
2245         /*
2246          * If an exclusive group is already on, no other hardware
2247          * events can go on.
2248          */
2249         if (cpuctx->exclusive)
2250                 return 0;
2251         /*
2252          * If this group is exclusive and there are already
2253          * events on the CPU, it can't go on.
2254          */
2255         if (event->attr.exclusive && cpuctx->active_oncpu)
2256                 return 0;
2257         /*
2258          * Otherwise, try to add it if all previous groups were able
2259          * to go on.
2260          */
2261         return can_add_hw;
2262 }
2263
2264 /*
2265  * Complement to update_event_times(). This computes the tstamp_* values to
2266  * continue 'enabled' state from @now, and effectively discards the time
2267  * between the prior tstamp_stopped and now (as we were in the OFF state, or
2268  * just switched (context) time base).
2269  *
2270  * This further assumes '@event->state == INACTIVE' (we just came from OFF)
2271  * and that it cannot have been scheduled in yet. Going into INACTIVE state means
2272  * '@event->tstamp_stopped = @now'.
2273  *
2274  * Thus given the rules of update_event_times():
2275  *
2276  *   total_time_enabled = tstamp_stopped - tstamp_enabled
2277  *   total_time_running = tstamp_stopped - tstamp_running
2278  *
2279  * We can insert 'tstamp_stopped == now' and reverse them to compute new
2280  * tstamp_* values.
2281  */
2282 static void __perf_event_enable_time(struct perf_event *event, u64 now)
2283 {
2284         WARN_ON_ONCE(event->state != PERF_EVENT_STATE_INACTIVE);
2285
2286         event->tstamp_stopped = now;
2287         event->tstamp_enabled = now - event->total_time_enabled;
2288         event->tstamp_running = now - event->total_time_running;
2289 }
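/*
 * Worked example (editor's illustration, not part of the original source):
 * an event that had accumulated total_time_enabled = 10 and
 * total_time_running = 4 before being turned OFF and is re-enabled at
 * now = 100 gets:
 *
 *   tstamp_stopped = 100
 *   tstamp_enabled = 100 - 10 = 90
 *   tstamp_running = 100 -  4 = 96
 *
 * so update_event_times() recomputes the same 10/4 totals and only time
 * accrued from @now onwards is added to them.
 */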
2290
2291 static void add_event_to_ctx(struct perf_event *event,
2292                                struct perf_event_context *ctx)
2293 {
2294         u64 tstamp = perf_event_time(event);
2295
2296         list_add_event(event, ctx);
2297         perf_group_attach(event);
2298         /*
2299          * We can be called with event->state == STATE_OFF when we create with
2300          * .disabled = 1. In that case the IOC_ENABLE will call this function.
2301          */
2302         if (event->state == PERF_EVENT_STATE_INACTIVE)
2303                 __perf_event_enable_time(event, tstamp);
2304 }
2305
2306 static void ctx_sched_out(struct perf_event_context *ctx,
2307                           struct perf_cpu_context *cpuctx,
2308                           enum event_type_t event_type);
2309 static void
2310 ctx_sched_in(struct perf_event_context *ctx,
2311              struct perf_cpu_context *cpuctx,
2312              enum event_type_t event_type,
2313              struct task_struct *task);
2314
2315 static void task_ctx_sched_out(struct perf_cpu_context *cpuctx,
2316                                struct perf_event_context *ctx,
2317                                enum event_type_t event_type)
2318 {
2319         if (!cpuctx->task_ctx)
2320                 return;
2321
2322         if (WARN_ON_ONCE(ctx != cpuctx->task_ctx))
2323                 return;
2324
2325         ctx_sched_out(ctx, cpuctx, event_type);
2326 }
2327
2328 static void perf_event_sched_in(struct perf_cpu_context *cpuctx,
2329                                 struct perf_event_context *ctx,
2330                                 struct task_struct *task)
2331 {
2332         cpu_ctx_sched_in(cpuctx, EVENT_PINNED, task);
2333         if (ctx)
2334                 ctx_sched_in(ctx, cpuctx, EVENT_PINNED, task);
2335         cpu_ctx_sched_in(cpuctx, EVENT_FLEXIBLE, task);
2336         if (ctx)
2337                 ctx_sched_in(ctx, cpuctx, EVENT_FLEXIBLE, task);
2338 }
2339
2340 /*
2341  * We want to maintain the following priority of scheduling:
2342  *  - CPU pinned (EVENT_CPU | EVENT_PINNED)
2343  *  - task pinned (EVENT_PINNED)
2344  *  - CPU flexible (EVENT_CPU | EVENT_FLEXIBLE)
2345  *  - task flexible (EVENT_FLEXIBLE).
2346  *
2347  * In order to avoid unscheduling and scheduling back in everything every
2348  * time an event is added, only do it for the groups of equal priority and
2349  * below.
2350  *
2351  * This can be called after a batch operation on task events, in which case
2352  * event_type is a bit mask of the types of events involved. For CPU events,
2353  * event_type is only either EVENT_PINNED or EVENT_FLEXIBLE.
2354  */
2355 static void ctx_resched(struct perf_cpu_context *cpuctx,
2356                         struct perf_event_context *task_ctx,
2357                         enum event_type_t event_type)
2358 {
2359         enum event_type_t ctx_event_type;
2360         bool cpu_event = !!(event_type & EVENT_CPU);
2361
2362         /*
2363          * If pinned groups are involved, flexible groups also need to be
2364          * scheduled out.
2365          */
2366         if (event_type & EVENT_PINNED)
2367                 event_type |= EVENT_FLEXIBLE;
2368
2369         ctx_event_type = event_type & EVENT_ALL;
2370
2371         perf_pmu_disable(cpuctx->ctx.pmu);
2372         if (task_ctx)
2373                 task_ctx_sched_out(cpuctx, task_ctx, event_type);
2374
2375         /*
2376          * Decide which cpu ctx groups to schedule out based on the types
2377          * of events that caused rescheduling:
2378          *  - EVENT_CPU: schedule out corresponding groups;
2379          *  - EVENT_PINNED task events: schedule out EVENT_FLEXIBLE groups;
2380          *  - otherwise, do nothing more.
2381          */
2382         if (cpu_event)
2383                 cpu_ctx_sched_out(cpuctx, ctx_event_type);
2384         else if (ctx_event_type & EVENT_PINNED)
2385                 cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE);
2386
2387         perf_event_sched_in(cpuctx, task_ctx, current);
2388         perf_pmu_enable(cpuctx->ctx.pmu);
2389 }
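/*
 * Example walk-through (editor's illustration, not part of the original
 * source): installing a task pinned event ends up here with
 * event_type = EVENT_PINNED. The pinned bit pulls EVENT_FLEXIBLE in as well,
 * the task context is scheduled out for both types, the CPU context only
 * drops its flexible groups (a task event never displaces a CPU pinned
 * group), and perf_event_sched_in() then walks the contexts in
 * pinned-before-flexible, CPU-before-task order, putting back whatever was
 * taken out plus the new event.
 */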
2390
2391 /*
2392  * Cross CPU call to install and enable a performance event
2393  *
2394  * Very similar to remote_function() + event_function() but cannot assume that
2395  * things like ctx->is_active and cpuctx->task_ctx are set.
2396  */
2397 static int  __perf_install_in_context(void *info)
2398 {
2399         struct perf_event *event = info;
2400         struct perf_event_context *ctx = event->ctx;
2401         struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
2402         struct perf_event_context *task_ctx = cpuctx->task_ctx;
2403         bool reprogram = true;
2404         int ret = 0;
2405
2406         raw_spin_lock(&cpuctx->ctx.lock);
2407         if (ctx->task) {
2408                 raw_spin_lock(&ctx->lock);
2409                 task_ctx = ctx;
2410
2411                 reprogram = (ctx->task == current);
2412
2413                 /*
2414                  * If the task is running, it must be running on this CPU,
2415                  * otherwise we cannot reprogram things.
2416                  *
2417                  * If it's not running, we don't care; ctx->lock will
2418                  * serialize against it becoming runnable.
2419                  */
2420                 if (task_curr(ctx->task) && !reprogram) {
2421                         ret = -ESRCH;
2422                         goto unlock;
2423                 }
2424
2425                 WARN_ON_ONCE(reprogram && cpuctx->task_ctx && cpuctx->task_ctx != ctx);
2426         } else if (task_ctx) {
2427                 raw_spin_lock(&task_ctx->lock);
2428         }
2429
2430 #ifdef CONFIG_CGROUP_PERF
2431         if (is_cgroup_event(event)) {
2432                 /*
2433                  * If the current cgroup doesn't match the event's
2434                  * cgroup, we should not try to schedule it.
2435                  */
2436                 struct perf_cgroup *cgrp = perf_cgroup_from_task(current, ctx);
2437                 reprogram = cgroup_is_descendant(cgrp->css.cgroup,
2438                                         event->cgrp->css.cgroup);
2439         }
2440 #endif
2441
2442         if (reprogram) {
2443                 ctx_sched_out(ctx, cpuctx, EVENT_TIME);
2444                 add_event_to_ctx(event, ctx);
2445                 ctx_resched(cpuctx, task_ctx, get_event_type(event));
2446         } else {
2447                 add_event_to_ctx(event, ctx);
2448         }
2449
2450 unlock:
2451         perf_ctx_unlock(cpuctx, task_ctx);
2452
2453         return ret;
2454 }
2455
2456 /*
2457  * Attach a performance event to a context.
2458  *
2459  * Very similar to event_function_call, see comment there.
2460  */
2461 static void
2462 perf_install_in_context(struct perf_event_context *ctx,
2463                         struct perf_event *event,
2464                         int cpu)
2465 {
2466         struct task_struct *task = READ_ONCE(ctx->task);
2467
2468         lockdep_assert_held(&ctx->mutex);
2469
2470         if (event->cpu != -1)
2471                 event->cpu = cpu;
2472
2473         /*
2474          * Ensures that if we can observe event->ctx, both the event and ctx
2475          * will be 'complete'. See perf_iterate_sb_cpu().
2476          */
2477         smp_store_release(&event->ctx, ctx);
2478
2479         if (!task) {
2480                 cpu_function_call(cpu, __perf_install_in_context, event);
2481                 return;
2482         }
2483
2484         /*
2485          * Should not happen, we validate the ctx is still alive before calling.
2486          */
2487         if (WARN_ON_ONCE(task == TASK_TOMBSTONE))
2488                 return;
2489
2490         /*
2491          * Installing events is tricky because we cannot rely on ctx->is_active
2492          * to be set in case this is the nr_events 0 -> 1 transition.
2493          *
2494          * Instead we use task_curr(), which tells us if the task is running.
2495          * However, since we use task_curr() outside of rq::lock, we can race
2496          * against the actual state. This means the result can be wrong.
2497          *
2498          * If we get a false positive, we retry, this is harmless.
2499          *
2500          * If we get a false negative, things are complicated. If we are after
2501          * perf_event_context_sched_in() ctx::lock will serialize us, and the
2502          * value must be correct. If we're before, it doesn't matter since
2503          * perf_event_context_sched_in() will program the counter.
2504          *
2505          * However, this hinges on the remote context switch having observed
2506          * our task->perf_event_ctxp[] store, such that it will in fact take
2507          * ctx::lock in perf_event_context_sched_in().
2508          *
2509          * We do this by task_function_call(); if the IPI fails to hit the task,
2510          * we know any future context switch of the task must see the
2511          * perf_event_ctxp[] store.
2512          */
2513
2514         /*
2515          * This smp_mb() orders the task->perf_event_ctxp[] store with the
2516          * task_cpu() load, such that if the IPI then does not find the task
2517          * running, a future context switch of that task must observe the
2518          * store.
2519          */
2520         smp_mb();
2521 again:
2522         if (!task_function_call(task, __perf_install_in_context, event))
2523                 return;
2524
2525         raw_spin_lock_irq(&ctx->lock);
2526         task = ctx->task;
2527         if (WARN_ON_ONCE(task == TASK_TOMBSTONE)) {
2528                 /*
2529                  * Cannot happen because we already checked above (which also
2530                  * cannot happen), and we hold ctx->mutex, which serializes us
2531                  * against perf_event_exit_task_context().
2532                  */
2533                 raw_spin_unlock_irq(&ctx->lock);
2534                 return;
2535         }
2536         /*
2537          * If the task is not running, ctx->lock will avoid it becoming so,
2538          * thus we can safely install the event.
2539          */
2540         if (task_curr(task)) {
2541                 raw_spin_unlock_irq(&ctx->lock);
2542                 goto again;
2543         }
2544         add_event_to_ctx(event, ctx);
2545         raw_spin_unlock_irq(&ctx->lock);
2546 }
2547
2548 /*
2549  * Put an event into inactive state and update time fields.
2550  * Enabling the leader of a group effectively enables all
2551  * the group members that aren't explicitly disabled, so we
2552  * have to update their ->tstamp_enabled also.
2553  * Note: this works for group members as well as group leaders
2554  * since the non-leader members' sibling_lists will be empty.
2555  */
2556 static void __perf_event_mark_enabled(struct perf_event *event)
2557 {
2558         struct perf_event *sub;
2559         u64 tstamp = perf_event_time(event);
2560
2561         event->state = PERF_EVENT_STATE_INACTIVE;
2562         __perf_event_enable_time(event, tstamp);
2563         list_for_each_entry(sub, &event->sibling_list, group_entry) {
2564                 /* XXX should not be > INACTIVE if event isn't */
2565                 if (sub->state >= PERF_EVENT_STATE_INACTIVE)
2566                         __perf_event_enable_time(sub, tstamp);
2567         }
2568 }
2569
2570 /*
2571  * Cross CPU call to enable a performance event
2572  */
2573 static void __perf_event_enable(struct perf_event *event,
2574                                 struct perf_cpu_context *cpuctx,
2575                                 struct perf_event_context *ctx,
2576                                 void *info)
2577 {
2578         struct perf_event *leader = event->group_leader;
2579         struct perf_event_context *task_ctx;
2580
2581         if (event->state >= PERF_EVENT_STATE_INACTIVE ||
2582             event->state <= PERF_EVENT_STATE_ERROR)
2583                 return;
2584
2585         if (ctx->is_active)
2586                 ctx_sched_out(ctx, cpuctx, EVENT_TIME);
2587
2588         __perf_event_mark_enabled(event);
2589
2590         if (!ctx->is_active)
2591                 return;
2592
2593         if (!event_filter_match(event)) {
2594                 if (is_cgroup_event(event))
2595                         perf_cgroup_defer_enabled(event);
2596                 ctx_sched_in(ctx, cpuctx, EVENT_TIME, current);
2597                 return;
2598         }
2599
2600         /*
2601          * If the event is in a group and isn't the group leader,
2602          * then don't put it on unless the group is on.
2603          */
2604         if (leader != event && leader->state != PERF_EVENT_STATE_ACTIVE) {
2605                 ctx_sched_in(ctx, cpuctx, EVENT_TIME, current);
2606                 return;
2607         }
2608
2609         task_ctx = cpuctx->task_ctx;
2610         if (ctx->task)
2611                 WARN_ON_ONCE(task_ctx != ctx);
2612
2613         ctx_resched(cpuctx, task_ctx, get_event_type(event));
2614 }
2615
2616 /*
2617  * Enable an event.
2618  *
2619  * If event->ctx is a cloned context, callers must make sure that
2620  * every task struct that event->ctx->task could possibly point to
2621  * remains valid.  This condition is satisfied when called through
2622  * perf_event_for_each_child or perf_event_for_each as described
2623  * for perf_event_disable.
2624  */
2625 static void _perf_event_enable(struct perf_event *event)
2626 {
2627         struct perf_event_context *ctx = event->ctx;
2628
2629         raw_spin_lock_irq(&ctx->lock);
2630         if (event->state >= PERF_EVENT_STATE_INACTIVE ||
2631             event->state <  PERF_EVENT_STATE_ERROR) {
2632                 raw_spin_unlock_irq(&ctx->lock);
2633                 return;
2634         }
2635
2636         /*
2637          * If the event is in error state, clear that first.
2638          *
2639          * That way, if we see the event in error state below, we know that it
2640          * has gone back into error state, as distinct from the task having
2641          * been scheduled away before the cross-call arrived.
2642          */
2643         if (event->state == PERF_EVENT_STATE_ERROR)
2644                 event->state = PERF_EVENT_STATE_OFF;
2645         raw_spin_unlock_irq(&ctx->lock);
2646
2647         event_function_call(event, __perf_event_enable, NULL);
2648 }
2649
2650 /*
2651  * See perf_event_disable();
2652  */
2653 void perf_event_enable(struct perf_event *event)
2654 {
2655         struct perf_event_context *ctx;
2656
2657         ctx = perf_event_ctx_lock(event);
2658         _perf_event_enable(event);
2659         perf_event_ctx_unlock(event, ctx);
2660 }
2661 EXPORT_SYMBOL_GPL(perf_event_enable);
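/*
 * Minimal in-kernel usage sketch (editor's illustration, not part of the
 * original source; error handling trimmed). perf_event_enable() and
 * perf_event_disable() are typically paired with
 * perf_event_create_kernel_counter() and perf_event_read_value(); starting
 * with .disabled = 1 means the counter only runs between the explicit
 * enable and disable:
 *
 *	static struct perf_event *cycles_event;
 *
 *	static int example_start(void)
 *	{
 *		struct perf_event_attr attr = {
 *			.type     = PERF_TYPE_HARDWARE,
 *			.config   = PERF_COUNT_HW_CPU_CYCLES,
 *			.size     = sizeof(attr),
 *			.disabled = 1,
 *		};
 *
 *		cycles_event = perf_event_create_kernel_counter(&attr, 0, NULL,
 *								NULL, NULL);
 *		if (IS_ERR(cycles_event))
 *			return PTR_ERR(cycles_event);
 *
 *		perf_event_enable(cycles_event);
 *		return 0;
 *	}
 *
 *	static void example_stop(void)
 *	{
 *		u64 enabled, running;
 *
 *		perf_event_disable(cycles_event);
 *		pr_info("cycles: %llu\n",
 *			perf_event_read_value(cycles_event, &enabled, &running));
 *		perf_event_release_kernel(cycles_event);
 *	}
 */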
2662
2663 struct stop_event_data {
2664         struct perf_event       *event;
2665         unsigned int            restart;
2666 };
2667
2668 static int __perf_event_stop(void *info)
2669 {
2670         struct stop_event_data *sd = info;
2671         struct perf_event *event = sd->event;
2672
2673         /* if it's already INACTIVE, do nothing */
2674         if (READ_ONCE(event->state) != PERF_EVENT_STATE_ACTIVE)
2675                 return 0;
2676
2677         /* matches smp_wmb() in event_sched_in() */
2678         smp_rmb();
2679
2680         /*
2681          * There is a window with interrupts enabled before we get here,
2682          * so we need to check again lest we try to stop another CPU's event.
2683          */
2684         if (READ_ONCE(event->oncpu) != smp_processor_id())
2685                 return -EAGAIN;
2686
2687         event->pmu->stop(event, PERF_EF_UPDATE);
2688
2689         /*
2690          * May race with the actual stop (through perf_pmu_output_stop()),
2691          * but it is only used for events with AUX ring buffer, and such
2692          * events will refuse to restart because of rb::aux_mmap_count==0,
2693          * see comments in perf_aux_output_begin().
2694          *
2695          * Since this is happening on an event-local CPU, no trace is lost
2696          * while restarting.
2697          */
2698         if (sd->restart)
2699                 event->pmu->start(event, 0);
2700
2701         return 0;
2702 }
2703
2704 static int perf_event_stop(struct perf_event *event, int restart)
2705 {
2706         struct stop_event_data sd = {
2707                 .event          = event,
2708                 .restart        = restart,
2709         };
2710         int ret = 0;
2711
2712         do {
2713                 if (READ_ONCE(event->state) != PERF_EVENT_STATE_ACTIVE)
2714                         return 0;
2715
2716                 /* matches smp_wmb() in event_sched_in() */
2717                 smp_rmb();
2718
2719                 /*
2720                  * We only want to restart ACTIVE events, so if the event goes
2721                  * inactive here (event->oncpu==-1), there's nothing more to do;
2722                  * fall through with ret==-ENXIO.
2723                  */
2724                 ret = cpu_function_call(READ_ONCE(event->oncpu),
2725                                         __perf_event_stop, &sd);
2726         } while (ret == -EAGAIN);
2727
2728         return ret;
2729 }
2730
2731 /*
2732  * In order to contain the amount of racy and tricky code in the address
2733  * filter configuration management, it is a two-part process:
2734  *
2735  * (p1) when userspace mappings change as a result of (1) or (2) or (3) below,
2736  *      we update the addresses of corresponding vmas in
2737  *      event::addr_filters_offs array and bump the event::addr_filters_gen;
2738  * (p2) when an event is scheduled in (pmu::add), it calls
2739  *      perf_event_addr_filters_sync() which calls pmu::addr_filters_sync()
2740  *      if the generation has changed since the previous call.
2741  *
2742  * If (p1) happens while the event is active, we restart it to force (p2).
2743  *
2744  * (1) perf_addr_filters_apply(): adjusting filters' offsets based on
2745  *     pre-existing mappings, called once when new filters arrive via SET_FILTER
2746  *     ioctl;
2747  * (2) perf_addr_filters_adjust(): adjusting filters' offsets based on newly
2748  *     registered mapping, called for every new mmap(), with mm::mmap_sem down
2749  *     for reading;
2750  * (3) perf_event_addr_filters_exec(): clearing filters' offsets in the process
2751  *     of exec.
2752  */
2753 void perf_event_addr_filters_sync(struct perf_event *event)
2754 {
2755         struct perf_addr_filters_head *ifh = perf_event_addr_filters(event);
2756
2757         if (!has_addr_filter(event))
2758                 return;
2759
2760         raw_spin_lock(&ifh->lock);
2761         if (event->addr_filters_gen != event->hw.addr_filters_gen) {
2762                 event->pmu->addr_filters_sync(event);
2763                 event->hw.addr_filters_gen = event->addr_filters_gen;
2764         }
2765         raw_spin_unlock(&ifh->lock);
2766 }
2767 EXPORT_SYMBOL_GPL(perf_event_addr_filters_sync);
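/*
 * Sketch of the producer side (p1) of the protocol above (editor's
 * illustration, heavily simplified; the real code lives in
 * perf_event_addr_filters_exec() and perf_addr_filters_apply() further down
 * in this file). The writer updates the offsets and bumps the generation
 * under ifh->lock, then restarts an active event so that pmu::add() sees the
 * new generation through perf_event_addr_filters_sync(). The index, offset
 * and restart flag below are placeholders:
 *
 *	raw_spin_lock_irqsave(&ifh->lock, flags);
 *	event->addr_filters_offs[i] = new_offset;
 *	event->addr_filters_gen++;
 *	raw_spin_unlock_irqrestore(&ifh->lock, flags);
 *
 *	if (restart)
 *		perf_event_stop(event, 1);
 */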
2768
2769 static int _perf_event_refresh(struct perf_event *event, int refresh)
2770 {
2771         /*
2772          * not supported on inherited events
2773          */
2774         if (event->attr.inherit || !is_sampling_event(event))
2775                 return -EINVAL;
2776
2777         atomic_add(refresh, &event->event_limit);
2778         _perf_event_enable(event);
2779
2780         return 0;
2781 }
2782
2783 /*
2784  * See perf_event_disable()
2785  */
2786 int perf_event_refresh(struct perf_event *event, int refresh)
2787 {
2788         struct perf_event_context *ctx;
2789         int ret;
2790
2791         ctx = perf_event_ctx_lock(event);
2792         ret = _perf_event_refresh(event, refresh);
2793         perf_event_ctx_unlock(event, ctx);
2794
2795         return ret;
2796 }
2797 EXPORT_SYMBOL_GPL(perf_event_refresh);
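/*
 * Userspace reaches this through the PERF_EVENT_IOC_REFRESH ioctl (editor's
 * illustration, not part of the original source). Each overflow decrements
 * event_limit and the event disables itself once the count reaches zero, so
 * for example:
 *
 *	ioctl(fd, PERF_EVENT_IOC_REFRESH, 1);
 *
 * re-arms a sampling event for exactly one more overflow notification before
 * it is automatically switched off again.
 */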
2798
2799 static void ctx_sched_out(struct perf_event_context *ctx,
2800                           struct perf_cpu_context *cpuctx,
2801                           enum event_type_t event_type)
2802 {
2803         int is_active = ctx->is_active;
2804         struct perf_event *event;
2805
2806         lockdep_assert_held(&ctx->lock);
2807
2808         if (likely(!ctx->nr_events)) {
2809                 /*
2810                  * See __perf_remove_from_context().
2811                  */
2812                 WARN_ON_ONCE(ctx->is_active);
2813                 if (ctx->task)
2814                         WARN_ON_ONCE(cpuctx->task_ctx);
2815                 return;
2816         }
2817
2818         ctx->is_active &= ~event_type;
2819         if (!(ctx->is_active & EVENT_ALL))
2820                 ctx->is_active = 0;
2821
2822         if (ctx->task) {
2823                 WARN_ON_ONCE(cpuctx->task_ctx != ctx);
2824                 if (!ctx->is_active)
2825                         cpuctx->task_ctx = NULL;
2826         }
2827
2828         /*
2829          * Always update time if it was set; not only when it changes.
2830          * Otherwise we can 'forget' to update time for any but the last
2831          * context we sched out. For example:
2832          *
2833          *   ctx_sched_out(.event_type = EVENT_FLEXIBLE)
2834          *   ctx_sched_out(.event_type = EVENT_PINNED)
2835          *
2836          * would only update time for the pinned events.
2837          */
2838         if (is_active & EVENT_TIME) {
2839                 /* update (and stop) ctx time */
2840                 update_context_time(ctx);
2841                 update_cgrp_time_from_cpuctx(cpuctx);
2842         }
2843
2844         is_active ^= ctx->is_active; /* changed bits */
2845
2846         if (!ctx->nr_active || !(is_active & EVENT_ALL))
2847                 return;
2848
2849         perf_pmu_disable(ctx->pmu);
2850         if (is_active & EVENT_PINNED) {
2851                 list_for_each_entry(event, &ctx->pinned_groups, group_entry)
2852                         group_sched_out(event, cpuctx, ctx);
2853         }
2854
2855         if (is_active & EVENT_FLEXIBLE) {
2856                 list_for_each_entry(event, &ctx->flexible_groups, group_entry)
2857                         group_sched_out(event, cpuctx, ctx);
2858         }
2859         perf_pmu_enable(ctx->pmu);
2860 }
2861
2862 /*
2863  * Test whether two contexts are equivalent, i.e. whether they have both been
2864  * cloned from the same version of the same context.
2865  *
2866  * Equivalence is measured using a generation number in the context that is
2867  * incremented on each modification to it; see unclone_ctx(), list_add_event()
2868  * and list_del_event().
2869  */
2870 static int context_equiv(struct perf_event_context *ctx1,
2871                          struct perf_event_context *ctx2)
2872 {
2873         lockdep_assert_held(&ctx1->lock);
2874         lockdep_assert_held(&ctx2->lock);
2875
2876         /* Pinning disables the swap optimization */
2877         if (ctx1->pin_count || ctx2->pin_count)
2878                 return 0;
2879
2880         /* If ctx1 is the parent of ctx2 */
2881         if (ctx1 == ctx2->parent_ctx && ctx1->generation == ctx2->parent_gen)
2882                 return 1;
2883
2884         /* If ctx2 is the parent of ctx1 */
2885         if (ctx1->parent_ctx == ctx2 && ctx1->parent_gen == ctx2->generation)
2886                 return 1;
2887
2888         /*
2889          * If ctx1 and ctx2 have the same parent, we flatten the parent
2890          * hierarchy; see perf_event_init_context().
2891          */
2892         if (ctx1->parent_ctx && ctx1->parent_ctx == ctx2->parent_ctx &&
2893                         ctx1->parent_gen == ctx2->parent_gen)
2894                 return 1;
2895
2896         /* Unmatched */
2897         return 0;
2898 }
2899
2900 static void __perf_event_sync_stat(struct perf_event *event,
2901                                      struct perf_event *next_event)
2902 {
2903         u64 value;
2904
2905         if (!event->attr.inherit_stat)
2906                 return;
2907
2908         /*
2909          * Update the event value. We cannot use perf_event_read()
2910          * because we're in the middle of a context switch and have IRQs
2911          * disabled, which upsets smp_call_function_single(); however,
2912          * we know the event must be on the current CPU, therefore we
2913          * don't need to use it.
2914          */
2915         switch (event->state) {
2916         case PERF_EVENT_STATE_ACTIVE:
2917                 event->pmu->read(event);
2918                 /* fall-through */
2919
2920         case PERF_EVENT_STATE_INACTIVE:
2921                 update_event_times(event);
2922                 break;
2923
2924         default:
2925                 break;
2926         }
2927
2928         /*
2929          * In order to keep per-task stats reliable we need to flip the event
2930          * values when we flip the contexts.
2931          */
2932         value = local64_read(&next_event->count);
2933         value = local64_xchg(&event->count, value);
2934         local64_set(&next_event->count, value);
2935
2936         swap(event->total_time_enabled, next_event->total_time_enabled);
2937         swap(event->total_time_running, next_event->total_time_running);
2938
2939         /*
2940          * Since we swizzled the values, update the user visible data too.
2941          */
2942         perf_event_update_userpage(event);
2943         perf_event_update_userpage(next_event);
2944 }
2945
2946 static void perf_event_sync_stat(struct perf_event_context *ctx,
2947                                    struct perf_event_context *next_ctx)
2948 {
2949         struct perf_event *event, *next_event;
2950
2951         if (!ctx->nr_stat)
2952                 return;
2953
2954         update_context_time(ctx);
2955
2956         event = list_first_entry(&ctx->event_list,
2957                                    struct perf_event, event_entry);
2958
2959         next_event = list_first_entry(&next_ctx->event_list,
2960                                         struct perf_event, event_entry);
2961
2962         while (&event->event_entry != &ctx->event_list &&
2963                &next_event->event_entry != &next_ctx->event_list) {
2964
2965                 __perf_event_sync_stat(event, next_event);
2966
2967                 event = list_next_entry(event, event_entry);
2968                 next_event = list_next_entry(next_event, event_entry);
2969         }
2970 }
2971
2972 static void perf_event_context_sched_out(struct task_struct *task, int ctxn,
2973                                          struct task_struct *next)
2974 {
2975         struct perf_event_context *ctx = task->perf_event_ctxp[ctxn];
2976         struct perf_event_context *next_ctx;
2977         struct perf_event_context *parent, *next_parent;
2978         struct perf_cpu_context *cpuctx;
2979         int do_switch = 1;
2980
2981         if (likely(!ctx))
2982                 return;
2983
2984         cpuctx = __get_cpu_context(ctx);
2985         if (!cpuctx->task_ctx)
2986                 return;
2987
2988         rcu_read_lock();
2989         next_ctx = next->perf_event_ctxp[ctxn];
2990         if (!next_ctx)
2991                 goto unlock;
2992
2993         parent = rcu_dereference(ctx->parent_ctx);
2994         next_parent = rcu_dereference(next_ctx->parent_ctx);
2995
2996         /* If neither context has a parent context, they cannot be clones. */
2997         if (!parent && !next_parent)
2998                 goto unlock;
2999
3000         if (next_parent == ctx || next_ctx == parent || next_parent == parent) {
3001                 /*
3002                  * Looks like the two contexts are clones, so we might be
3003                  * able to optimize the context switch.  We lock both
3004                  * contexts and check that they are clones under the
3005                  * lock (including re-checking that neither has been
3006                  * uncloned in the meantime).  It doesn't matter which
3007                  * order we take the locks because no other cpu could
3008                  * be trying to lock both of these tasks.
3009                  */
3010                 raw_spin_lock(&ctx->lock);
3011                 raw_spin_lock_nested(&next_ctx->lock, SINGLE_DEPTH_NESTING);
3012                 if (context_equiv(ctx, next_ctx)) {
3013                         WRITE_ONCE(ctx->task, next);
3014                         WRITE_ONCE(next_ctx->task, task);
3015
3016                         swap(ctx->task_ctx_data, next_ctx->task_ctx_data);
3017
3018                         /*
3019                          * RCU_INIT_POINTER here is safe because we've not
3020                          * modified the ctx and the above modifications of
3021                          * ctx->task and ctx->task_ctx_data are immaterial
3022                          * since those values are always verified under
3023                          * ctx->lock which we're now holding.
3024                          */
3025                         RCU_INIT_POINTER(task->perf_event_ctxp[ctxn], next_ctx);
3026                         RCU_INIT_POINTER(next->perf_event_ctxp[ctxn], ctx);
3027
3028                         do_switch = 0;
3029
3030                         perf_event_sync_stat(ctx, next_ctx);
3031                 }
3032                 raw_spin_unlock(&next_ctx->lock);
3033                 raw_spin_unlock(&ctx->lock);
3034         }
3035 unlock:
3036         rcu_read_unlock();
3037
3038         if (do_switch) {
3039                 raw_spin_lock(&ctx->lock);
3040                 task_ctx_sched_out(cpuctx, ctx, EVENT_ALL);
3041                 raw_spin_unlock(&ctx->lock);
3042         }
3043 }
3044
3045 static DEFINE_PER_CPU(struct list_head, sched_cb_list);
3046
3047 void perf_sched_cb_dec(struct pmu *pmu)
3048 {
3049         struct perf_cpu_context *cpuctx = this_cpu_ptr(pmu->pmu_cpu_context);
3050
3051         this_cpu_dec(perf_sched_cb_usages);
3052
3053         if (!--cpuctx->sched_cb_usage)
3054                 list_del(&cpuctx->sched_cb_entry);
3055 }
3056
3057
3058 void perf_sched_cb_inc(struct pmu *pmu)
3059 {
3060         struct perf_cpu_context *cpuctx = this_cpu_ptr(pmu->pmu_cpu_context);
3061
3062         if (!cpuctx->sched_cb_usage++)
3063                 list_add(&cpuctx->sched_cb_entry, this_cpu_ptr(&sched_cb_list));
3064
3065         this_cpu_inc(perf_sched_cb_usages);
3066 }
3067
3068 /*
3069  * This function provides the context switch callback to the lower code
3070  * layer. It is invoked ONLY when the context switch callback is enabled.
3071  *
3072  * This callback is relevant even to per-cpu events; for example multi-event
3073  * PEBS requires this to provide PID/TID information. This requires that we
3074  * flush all queued PEBS records before we context-switch to a new task.
3075  */
3076 static void perf_pmu_sched_task(struct task_struct *prev,
3077                                 struct task_struct *next,
3078                                 bool sched_in)
3079 {
3080         struct perf_cpu_context *cpuctx;
3081         struct pmu *pmu;
3082
3083         if (prev == next)
3084                 return;
3085
3086         list_for_each_entry(cpuctx, this_cpu_ptr(&sched_cb_list), sched_cb_entry) {
3087                 pmu = cpuctx->ctx.pmu; /* software PMUs will not have sched_task */
3088
3089                 if (WARN_ON_ONCE(!pmu->sched_task))
3090                         continue;
3091
3092                 perf_ctx_lock(cpuctx, cpuctx->task_ctx);
3093                 perf_pmu_disable(pmu);
3094
3095                 pmu->sched_task(cpuctx->task_ctx, sched_in);
3096
3097                 perf_pmu_enable(pmu);
3098                 perf_ctx_unlock(cpuctx, cpuctx->task_ctx);
3099         }
3100 }
3101
3102 static void perf_event_switch(struct task_struct *task,
3103                               struct task_struct *next_prev, bool sched_in);
3104
3105 #define for_each_task_context_nr(ctxn)                                  \
3106         for ((ctxn) = 0; (ctxn) < perf_nr_task_contexts; (ctxn)++)
3107
3108 /*
3109  * Called from scheduler to remove the events of the current task,
3110  * with interrupts disabled.
3111  *
3112  * We stop each event and update the event value in event->count.
3113  *
3114  * This does not protect us against NMI, but disable()
3115  * sets the disabled bit in the control field of event _before_
3116  * accessing the event control register. If an NMI hits, then it will
3117  * not restart the event.
3118  */
3119 void __perf_event_task_sched_out(struct task_struct *task,
3120                                  struct task_struct *next)
3121 {
3122         int ctxn;
3123
3124         if (__this_cpu_read(perf_sched_cb_usages))
3125                 perf_pmu_sched_task(task, next, false);
3126
3127         if (atomic_read(&nr_switch_events))
3128                 perf_event_switch(task, next, false);
3129
3130         for_each_task_context_nr(ctxn)
3131                 perf_event_context_sched_out(task, ctxn, next);
3132
3133         /*
3134          * If cgroup events exist on this CPU, then we need
3135          * to check if we have to switch out PMU state;
3136          * cgroup events only exist in system-wide mode.
3137          */
3138         if (atomic_read(this_cpu_ptr(&perf_cgroup_events)))
3139                 perf_cgroup_sched_out(task, next);
3140 }
3141
3142 /*
3143  * Called with IRQs disabled
3144  */
3145 static void cpu_ctx_sched_out(struct perf_cpu_context *cpuctx,
3146                               enum event_type_t event_type)
3147 {
3148         ctx_sched_out(&cpuctx->ctx, cpuctx, event_type);
3149 }
3150
3151 static void
3152 ctx_pinned_sched_in(struct perf_event_context *ctx,
3153                     struct perf_cpu_context *cpuctx)
3154 {
3155         struct perf_event *event;
3156
3157         list_for_each_entry(event, &ctx->pinned_groups, group_entry) {
3158                 if (event->state <= PERF_EVENT_STATE_OFF)
3159                         continue;
3160                 if (!event_filter_match(event))
3161                         continue;
3162
3163                 /* may need to reset tstamp_enabled */
3164                 if (is_cgroup_event(event))
3165                         perf_cgroup_mark_enabled(event, ctx);
3166
3167                 if (group_can_go_on(event, cpuctx, 1))
3168                         group_sched_in(event, cpuctx, ctx);
3169
3170                 /*
3171                  * If this pinned group hasn't been scheduled,
3172                  * put it in error state.
3173                  */
3174                 if (event->state == PERF_EVENT_STATE_INACTIVE) {
3175                         update_group_times(event);
3176                         event->state = PERF_EVENT_STATE_ERROR;
3177                 }
3178         }
3179 }
3180
3181 static void
3182 ctx_flexible_sched_in(struct perf_event_context *ctx,
3183                       struct perf_cpu_context *cpuctx)
3184 {
3185         struct perf_event *event;
3186         int can_add_hw = 1;
3187
3188         list_for_each_entry(event, &ctx->flexible_groups, group_entry) {
3189                 /* Ignore events in OFF or ERROR state */
3190                 if (event->state <= PERF_EVENT_STATE_OFF)
3191                         continue;
3192                 /*
3193                  * Listen to the 'cpu' scheduling filter constraint
3194                  * of events:
3195                  */
3196                 if (!event_filter_match(event))
3197                         continue;
3198
3199                 /* may need to reset tstamp_enabled */
3200                 if (is_cgroup_event(event))
3201                         perf_cgroup_mark_enabled(event, ctx);
3202
3203                 if (group_can_go_on(event, cpuctx, can_add_hw)) {
3204                         if (group_sched_in(event, cpuctx, ctx))
3205                                 can_add_hw = 0;
3206                 }
3207         }
3208 }
3209
3210 static void
3211 ctx_sched_in(struct perf_event_context *ctx,
3212              struct perf_cpu_context *cpuctx,
3213              enum event_type_t event_type,
3214              struct task_struct *task)
3215 {
3216         int is_active = ctx->is_active;
3217         u64 now;
3218
3219         lockdep_assert_held(&ctx->lock);
3220
3221         if (likely(!ctx->nr_events))
3222                 return;
3223
3224         ctx->is_active |= (event_type | EVENT_TIME);
3225         if (ctx->task) {
3226                 if (!is_active)
3227                         cpuctx->task_ctx = ctx;
3228                 else
3229                         WARN_ON_ONCE(cpuctx->task_ctx != ctx);
3230         }
3231
3232         is_active ^= ctx->is_active; /* changed bits */
3233
3234         if (is_active & EVENT_TIME) {
3235                 /* start ctx time */
3236                 now = perf_clock();
3237                 ctx->timestamp = now;
3238                 perf_cgroup_set_timestamp(task, ctx);
3239         }
3240
3241         /*
3242          * First go through the list and put on any pinned groups
3243          * in order to give them the best chance of going on.
3244          */
3245         if (is_active & EVENT_PINNED)
3246                 ctx_pinned_sched_in(ctx, cpuctx);
3247
3248         /* Then walk through the lower prio flexible groups */
3249         if (is_active & EVENT_FLEXIBLE)
3250                 ctx_flexible_sched_in(ctx, cpuctx);
3251 }
3252
3253 static void cpu_ctx_sched_in(struct perf_cpu_context *cpuctx,
3254                              enum event_type_t event_type,
3255                              struct task_struct *task)
3256 {
3257         struct perf_event_context *ctx = &cpuctx->ctx;
3258
3259         ctx_sched_in(ctx, cpuctx, event_type, task);
3260 }
3261
3262 static void perf_event_context_sched_in(struct perf_event_context *ctx,
3263                                         struct task_struct *task)
3264 {
3265         struct perf_cpu_context *cpuctx;
3266
3267         cpuctx = __get_cpu_context(ctx);
3268         if (cpuctx->task_ctx == ctx)
3269                 return;
3270
3271         perf_ctx_lock(cpuctx, ctx);
3272         /*
3273          * We must check ctx->nr_events while holding ctx->lock, such
3274          * that we serialize against perf_install_in_context().
3275          */
3276         if (!ctx->nr_events)
3277                 goto unlock;
3278
3279         perf_pmu_disable(ctx->pmu);
3280         /*
3281          * We want to keep the following priority order:
3282          * cpu pinned (that don't need to move), task pinned,
3283          * cpu flexible, task flexible.
3284          *
3285          * However, if task's ctx is not carrying any pinned
3286          * events, no need to flip the cpuctx's events around.
3287          */
3288         if (!list_empty(&ctx->pinned_groups))
3289                 cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE);
3290         perf_event_sched_in(cpuctx, ctx, task);
3291         perf_pmu_enable(ctx->pmu);
3292
3293 unlock:
3294         perf_ctx_unlock(cpuctx, ctx);
3295 }
3296
3297 /*
3298  * Called from scheduler to add the events of the current task
3299  * with interrupts disabled.
3300  *
3301  * We restore the event value and then enable it.
3302  *
3303  * This does not protect us against NMI, but enable()
3304  * sets the enabled bit in the control field of event _before_
3305  * accessing the event control register. If an NMI hits, then it will
3306  * keep the event running.
3307  */
3308 void __perf_event_task_sched_in(struct task_struct *prev,
3309                                 struct task_struct *task)
3310 {
3311         struct perf_event_context *ctx;
3312         int ctxn;
3313
3314         /*
3315          * If cgroup events exist on this CPU, then we need to check if we have
3316  * to switch in PMU state; cgroup events are system-wide mode only.
3317          *
3318          * Since cgroup events are CPU events, we must schedule these in before
3319          * we schedule in the task events.
3320          */
3321         if (atomic_read(this_cpu_ptr(&perf_cgroup_events)))
3322                 perf_cgroup_sched_in(prev, task);
3323
3324         for_each_task_context_nr(ctxn) {
3325                 ctx = task->perf_event_ctxp[ctxn];
3326                 if (likely(!ctx))
3327                         continue;
3328
3329                 perf_event_context_sched_in(ctx, task);
3330         }
3331
3332         if (atomic_read(&nr_switch_events))
3333                 perf_event_switch(task, prev, true);
3334
3335         if (__this_cpu_read(perf_sched_cb_usages))
3336                 perf_pmu_sched_task(prev, task, true);
3337 }
3338
3339 static u64 perf_calculate_period(struct perf_event *event, u64 nsec, u64 count)
3340 {
3341         u64 frequency = event->attr.sample_freq;
3342         u64 sec = NSEC_PER_SEC;
3343         u64 divisor, dividend;
3344
3345         int count_fls, nsec_fls, frequency_fls, sec_fls;
3346
3347         count_fls = fls64(count);
3348         nsec_fls = fls64(nsec);
3349         frequency_fls = fls64(frequency);
3350         sec_fls = 30;
3351
3352         /*
3353          * We got @count in @nsec; with a target of sample_freq HZ,
3354          * the target period becomes:
3355          *
3356          *             @count * 10^9
3357          * period = -------------------
3358          *          @nsec * sample_freq
3359          *
3360          */
3361
3362         /*
3363          * Reduce accuracy by one bit such that @a and @b converge
3364          * to a similar magnitude.
3365          */
3366 #define REDUCE_FLS(a, b)                \
3367 do {                                    \
3368         if (a##_fls > b##_fls) {        \
3369                 a >>= 1;                \
3370                 a##_fls--;              \
3371         } else {                        \
3372                 b >>= 1;                \
3373                 b##_fls--;              \
3374         }                               \
3375 } while (0)
3376
3377         /*
3378          * Reduce accuracy until either term fits in a u64, then proceed with
3379          * the other, so that finally we can do a u64/u64 division.
3380          */
3381         while (count_fls + sec_fls > 64 && nsec_fls + frequency_fls > 64) {
3382                 REDUCE_FLS(nsec, frequency);
3383                 REDUCE_FLS(sec, count);
3384         }
3385
3386         if (count_fls + sec_fls > 64) {
3387                 divisor = nsec * frequency;
3388
3389                 while (count_fls + sec_fls > 64) {
3390                         REDUCE_FLS(count, sec);
3391                         divisor >>= 1;
3392                 }
3393
3394                 dividend = count * sec;
3395         } else {
3396                 dividend = count * sec;
3397
3398                 while (nsec_fls + frequency_fls > 64) {
3399                         REDUCE_FLS(nsec, frequency);
3400                         dividend >>= 1;
3401                 }
3402
3403                 divisor = nsec * frequency;
3404         }
3405
3406         if (!divisor)
3407                 return dividend;
3408
3409         return div64_u64(dividend, divisor);
3410 }
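
/*
 * Worked example (editorial illustration): with @count = 4,000,000 events
 * observed over @nsec = 4,000,000 ns and attr.sample_freq = 1000 Hz:
 *
 *      period = (4,000,000 * 10^9) / (4,000,000 * 1000) = 1,000,000
 *
 * i.e. sampling once every ~1,000,000 events yields roughly 1000 samples
 * per second at the observed event rate.
 */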
3411
3412 static DEFINE_PER_CPU(int, perf_throttled_count);
3413 static DEFINE_PER_CPU(u64, perf_throttled_seq);
3414
3415 static void perf_adjust_period(struct perf_event *event, u64 nsec, u64 count, bool disable)
3416 {
3417         struct hw_perf_event *hwc = &event->hw;
3418         s64 period, sample_period;
3419         s64 delta;
3420
3421         period = perf_calculate_period(event, nsec, count);
3422
3423         delta = (s64)(period - hwc->sample_period);
3424         delta = (delta + 7) / 8; /* low pass filter */
3425
3426         sample_period = hwc->sample_period + delta;
3427
3428         if (!sample_period)
3429                 sample_period = 1;
3430
3431         hwc->sample_period = sample_period;
3432
3433         if (local64_read(&hwc->period_left) > 8*sample_period) {
3434                 if (disable)
3435                         event->pmu->stop(event, PERF_EF_UPDATE);
3436
3437                 local64_set(&hwc->period_left, 0);
3438
3439                 if (disable)
3440                         event->pmu->start(event, PERF_EF_RELOAD);
3441         }
3442 }
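
/*
 * Worked example (editorial illustration): if hwc->sample_period is
 * 1,000,000 and perf_calculate_period() now suggests 1,800,000, then
 * delta = 800,000 and the low pass filter only applies delta/8 = 100,000,
 * moving sample_period to 1,100,000. Large transient swings therefore take
 * several ticks to settle instead of making the period oscillate.
 */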
3443
3444 /*
3445  * Combine freq adjustment with unthrottling to avoid two passes over the
3446  * events. At the same time, make sure that having freq events does not change
3447  * the rate of unthrottling, as that would introduce bias.
3448  */
3449 static void perf_adjust_freq_unthr_context(struct perf_event_context *ctx,
3450                                            int needs_unthr)
3451 {
3452         struct perf_event *event;
3453         struct hw_perf_event *hwc;
3454         u64 now, period = TICK_NSEC;
3455         s64 delta;
3456
3457         /*
3458          * We only need to iterate over all events if:
3459          * - the context has events in frequency mode (needs freq adjust), or
3460          * - there are events to unthrottle on this CPU.
3461          */
3462         if (!(ctx->nr_freq || needs_unthr))
3463                 return;
3464
3465         raw_spin_lock(&ctx->lock);
3466         perf_pmu_disable(ctx->pmu);
3467
3468         list_for_each_entry_rcu(event, &ctx->event_list, event_entry) {
3469                 if (event->state != PERF_EVENT_STATE_ACTIVE)
3470                         continue;
3471
3472                 if (!event_filter_match(event))
3473                         continue;
3474
3475                 perf_pmu_disable(event->pmu);
3476
3477                 hwc = &event->hw;
3478
3479                 if (hwc->interrupts == MAX_INTERRUPTS) {
3480                         hwc->interrupts = 0;
3481                         perf_log_throttle(event, 1);
3482                         event->pmu->start(event, 0);
3483                 }
3484
3485                 if (!event->attr.freq || !event->attr.sample_freq)
3486                         goto next;
3487
3488                 /*
3489                  * stop the event and update event->count
3490                  */
3491                 event->pmu->stop(event, PERF_EF_UPDATE);
3492
3493                 now = local64_read(&event->count);
3494                 delta = now - hwc->freq_count_stamp;
3495                 hwc->freq_count_stamp = now;
3496
3497                 /*
3498                  * Restart the event: reload only if the value has
3499                  * changed.
3500                  * We have already stopped the event, so tell
3501                  * perf_adjust_period() about it to avoid stopping it
3502                  * a second time.
3503                  */
3504                 if (delta > 0)
3505                         perf_adjust_period(event, period, delta, false);
3506
3507                 event->pmu->start(event, delta > 0 ? PERF_EF_RELOAD : 0);
3508         next:
3509                 perf_pmu_enable(event->pmu);
3510         }
3511
3512         perf_pmu_enable(ctx->pmu);
3513         raw_spin_unlock(&ctx->lock);
3514 }
3515
3516 /*
3517  * Round-robin a context's events:
3518  */
3519 static void rotate_ctx(struct perf_event_context *ctx)
3520 {
3521         /*
3522          * Rotate the first entry of the non-pinned groups to the end. Rotation
3523          * might be disabled by the inheritance code.
3524          */
3525         if (!ctx->rotate_disable)
3526                 list_rotate_left(&ctx->flexible_groups);
3527 }
3528
3529 static int perf_rotate_context(struct perf_cpu_context *cpuctx)
3530 {
3531         struct perf_event_context *ctx = NULL;
3532         int rotate = 0;
3533
3534         if (cpuctx->ctx.nr_events) {
3535                 if (cpuctx->ctx.nr_events != cpuctx->ctx.nr_active)
3536                         rotate = 1;
3537         }
3538
3539         ctx = cpuctx->task_ctx;
3540         if (ctx && ctx->nr_events) {
3541                 if (ctx->nr_events != ctx->nr_active)
3542                         rotate = 1;
3543         }
3544
3545         if (!rotate)
3546                 goto done;
3547
3548         perf_ctx_lock(cpuctx, cpuctx->task_ctx);
3549         perf_pmu_disable(cpuctx->ctx.pmu);
3550
3551         cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE);
3552         if (ctx)
3553                 ctx_sched_out(ctx, cpuctx, EVENT_FLEXIBLE);
3554
3555         rotate_ctx(&cpuctx->ctx);
3556         if (ctx)
3557                 rotate_ctx(ctx);
3558
3559         perf_event_sched_in(cpuctx, ctx, current);
3560
3561         perf_pmu_enable(cpuctx->ctx.pmu);
3562         perf_ctx_unlock(cpuctx, cpuctx->task_ctx);
3563 done:
3564
3565         return rotate;
3566 }
3567
3568 void perf_event_task_tick(void)
3569 {
3570         struct list_head *head = this_cpu_ptr(&active_ctx_list);
3571         struct perf_event_context *ctx, *tmp;
3572         int throttled;
3573
3574         WARN_ON(!irqs_disabled());
3575
3576         __this_cpu_inc(perf_throttled_seq);
3577         throttled = __this_cpu_xchg(perf_throttled_count, 0);
3578         tick_dep_clear_cpu(smp_processor_id(), TICK_DEP_BIT_PERF_EVENTS);
3579
3580         list_for_each_entry_safe(ctx, tmp, head, active_ctx_list)
3581                 perf_adjust_freq_unthr_context(ctx, throttled);
3582 }
3583
3584 static int event_enable_on_exec(struct perf_event *event,
3585                                 struct perf_event_context *ctx)
3586 {
3587         if (!event->attr.enable_on_exec)
3588                 return 0;
3589
3590         event->attr.enable_on_exec = 0;
3591         if (event->state >= PERF_EVENT_STATE_INACTIVE)
3592                 return 0;
3593
3594         __perf_event_mark_enabled(event);
3595
3596         return 1;
3597 }
3598
3599 /*
3600  * Enable all of a task's events that have been marked enable-on-exec.
3601  * This expects task == current.
3602  */
3603 static void perf_event_enable_on_exec(int ctxn)
3604 {
3605         struct perf_event_context *ctx, *clone_ctx = NULL;
3606         enum event_type_t event_type = 0;
3607         struct perf_cpu_context *cpuctx;
3608         struct perf_event *event;
3609         unsigned long flags;
3610         int enabled = 0;
3611
3612         local_irq_save(flags);
3613         ctx = current->perf_event_ctxp[ctxn];
3614         if (!ctx || !ctx->nr_events)
3615                 goto out;
3616
3617         cpuctx = __get_cpu_context(ctx);
3618         perf_ctx_lock(cpuctx, ctx);
3619         ctx_sched_out(ctx, cpuctx, EVENT_TIME);
3620         list_for_each_entry(event, &ctx->event_list, event_entry) {
3621                 enabled |= event_enable_on_exec(event, ctx);
3622                 event_type |= get_event_type(event);
3623         }
3624
3625         /*
3626          * Unclone and reschedule this context if we enabled any event.
3627          */
3628         if (enabled) {
3629                 clone_ctx = unclone_ctx(ctx);
3630                 ctx_resched(cpuctx, ctx, event_type);
3631         } else {
3632                 ctx_sched_in(ctx, cpuctx, EVENT_TIME, current);
3633         }
3634         perf_ctx_unlock(cpuctx, ctx);
3635
3636 out:
3637         local_irq_restore(flags);
3638
3639         if (clone_ctx)
3640                 put_ctx(clone_ctx);
3641 }
3642
3643 struct perf_read_data {
3644         struct perf_event *event;
3645         bool group;
3646         int ret;
3647 };
3648
3649 static int __perf_event_read_cpu(struct perf_event *event, int event_cpu)
3650 {
3651         u16 local_pkg, event_pkg;
3652
3653         if (event->group_caps & PERF_EV_CAP_READ_ACTIVE_PKG) {
3654                 int local_cpu = smp_processor_id();
3655
3656                 event_pkg = topology_physical_package_id(event_cpu);
3657                 local_pkg = topology_physical_package_id(local_cpu);
3658
3659                 if (event_pkg == local_pkg)
3660                         return local_cpu;
3661         }
3662
3663         return event_cpu;
3664 }
3665
3666 /*
3667  * Cross CPU call to read the hardware event
3668  */
3669 static void __perf_event_read(void *info)
3670 {
3671         struct perf_read_data *data = info;
3672         struct perf_event *sub, *event = data->event;
3673         struct perf_event_context *ctx = event->ctx;
3674         struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
3675         struct pmu *pmu = event->pmu;
3676
3677         /*
3678          * If this is a task context, we need to check whether it is
3679          * the current task context of this CPU.  If not, it has been
3680          * scheduled out before the smp call arrived.  In that case
3681          * event->count would have been updated to a recent sample
3682          * when the event was scheduled out.
3683          */
3684         if (ctx->task && cpuctx->task_ctx != ctx)
3685                 return;
3686
3687         raw_spin_lock(&ctx->lock);
3688         if (ctx->is_active) {
3689                 update_context_time(ctx);
3690                 update_cgrp_time_from_event(event);
3691         }
3692
3693         update_event_times(event);
3694         if (event->state != PERF_EVENT_STATE_ACTIVE)
3695                 goto unlock;
3696
3697         if (!data->group) {
3698                 pmu->read(event);
3699                 data->ret = 0;
3700                 goto unlock;
3701         }
3702
3703         pmu->start_txn(pmu, PERF_PMU_TXN_READ);
3704
3705         pmu->read(event);
3706
3707         list_for_each_entry(sub, &event->sibling_list, group_entry) {
3708                 update_event_times(sub);
3709                 if (sub->state == PERF_EVENT_STATE_ACTIVE) {
3710                         /*
3711                          * Use the sibling's PMU rather than @event's, since
3712                          * the sibling could be on a different (e.g. software) PMU.
3713                          */
3714                         sub->pmu->read(sub);
3715                 }
3716         }
3717
3718         data->ret = pmu->commit_txn(pmu);
3719
3720 unlock:
3721         raw_spin_unlock(&ctx->lock);
3722 }
3723
3724 static inline u64 perf_event_count(struct perf_event *event)
3725 {
3726         return local64_read(&event->count) + atomic64_read(&event->child_count);
3727 }
3728
3729 /*
3730  * NMI-safe method to read a local event, that is, an event that:
3731  *
3732  *   - is either for the current task or for this CPU,
3733  *   - does not have inherit set (inherited task events
3734  *     will not be local and we cannot read them atomically), and
3735  *   - does not have a pmu::count method.
3736  */
3737 int perf_event_read_local(struct perf_event *event, u64 *value)
3738 {
3739         unsigned long flags;
3740         int ret = 0;
3741
3742         /*
3743          * Disabling interrupts avoids all counter scheduling (context
3744          * switches, timer based rotation and IPIs).
3745          */
3746         local_irq_save(flags);
3747
3748         /*
3749          * It must not be an event with inherit set; we cannot read
3750          * all child counters from atomic context.
3751          */
3752         if (event->attr.inherit) {
3753                 ret = -EOPNOTSUPP;
3754                 goto out;
3755         }
3756
3757         /* If this is a per-task event, it must be for current */
3758         if ((event->attach_state & PERF_ATTACH_TASK) &&
3759             event->hw.target != current) {
3760                 ret = -EINVAL;
3761                 goto out;
3762         }
3763
3764         /* If this is a per-CPU event, it must be for this CPU */
3765         if (!(event->attach_state & PERF_ATTACH_TASK) &&
3766             event->cpu != smp_processor_id()) {
3767                 ret = -EINVAL;
3768                 goto out;
3769         }
3770
3771         /* If this is a pinned event it must be running on this CPU */
3772         if (event->attr.pinned && event->oncpu != smp_processor_id()) {
3773                 ret = -EBUSY;
3774                 goto out;
3775         }
3776
3777         /*
3778          * If the event is currently on this CPU, it's either a per-task event
3779          * or local to this CPU. Furthermore, it means it's ACTIVE (otherwise
3780          * oncpu == -1).
3781          */
3782         if (event->oncpu == smp_processor_id())
3783                 event->pmu->read(event);
3784
3785         *value = local64_read(&event->count);
3786 out:
3787         local_irq_restore(flags);
3788
3789         return ret;
3790 }
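
/*
 * Usage sketch (editorial illustration, not part of the original source):
 * an in-kernel caller that already holds a reference on a local event,
 * e.g. a BPF helper, could read it like this. example_read_local() is a
 * hypothetical wrapper with minimal error handling.
 */
static u64 example_read_local(struct perf_event *event)
{
        u64 value = 0;

        if (perf_event_read_local(event, &value))
                return 0;       /* not a local/readable event */

        return value;
}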
3791
3792 static int perf_event_read(struct perf_event *event, bool group)
3793 {
3794         int event_cpu, ret = 0;
3795
3796         /*
3797          * If event is enabled and currently active on a CPU, update the
3798          * value in the event structure:
3799          */
3800         if (event->state == PERF_EVENT_STATE_ACTIVE) {
3801                 struct perf_read_data data = {
3802                         .event = event,
3803                         .group = group,
3804                         .ret = 0,
3805                 };
3806
3807                 event_cpu = READ_ONCE(event->oncpu);
3808                 if ((unsigned)event_cpu >= nr_cpu_ids)
3809                         return 0;
3810
3811                 preempt_disable();
3812                 event_cpu = __perf_event_read_cpu(event, event_cpu);
3813
3814                 /*
3815                  * Purposely ignore the smp_call_function_single() return
3816                  * value.
3817                  *
3818                  * If event_cpu isn't a valid CPU it means the event got
3819                  * scheduled out and that will have updated the event count.
3820                  *
3821                  * Therefore, either way, we'll have an up-to-date event count
3822                  * after this.
3823                  */
3824                 (void)smp_call_function_single(event_cpu, __perf_event_read, &data, 1);
3825                 preempt_enable();
3826                 ret = data.ret;
3827         } else if (event->state == PERF_EVENT_STATE_INACTIVE) {
3828                 struct perf_event_context *ctx = event->ctx;
3829                 unsigned long flags;
3830
3831                 raw_spin_lock_irqsave(&ctx->lock, flags);
3832                 /*
3833                  * We may read while the context is not active
3834                  * (e.g., the thread is blocked); in that case
3835                  * we cannot update the context time.
3836                  */
3837                 if (ctx->is_active) {
3838                         update_context_time(ctx);
3839                         update_cgrp_time_from_event(event);
3840                 }
3841                 if (group)
3842                         update_group_times(event);
3843                 else
3844                         update_event_times(event);
3845                 raw_spin_unlock_irqrestore(&ctx->lock, flags);
3846         }
3847
3848         return ret;
3849 }
3850
3851 /*
3852  * Initialize the perf_event context in a task_struct:
3853  */
3854 static void __perf_event_init_context(struct perf_event_context *ctx)
3855 {
3856         raw_spin_lock_init(&ctx->lock);
3857         mutex_init(&ctx->mutex);
3858         INIT_LIST_HEAD(&ctx->active_ctx_list);
3859         INIT_LIST_HEAD(&ctx->pinned_groups);
3860         INIT_LIST_HEAD(&ctx->flexible_groups);
3861         INIT_LIST_HEAD(&ctx->event_list);
3862         atomic_set(&ctx->refcount, 1);
3863 }
3864
3865 static struct perf_event_context *
3866 alloc_perf_context(struct pmu *pmu, struct task_struct *task)
3867 {
3868         struct perf_event_context *ctx;
3869
3870         ctx = kzalloc(sizeof(struct perf_event_context), GFP_KERNEL);
3871         if (!ctx)
3872                 return NULL;
3873
3874         __perf_event_init_context(ctx);
3875         if (task) {
3876                 ctx->task = task;
3877                 get_task_struct(task);
3878         }
3879         ctx->pmu = pmu;
3880
3881         return ctx;
3882 }
3883
3884 static struct task_struct *
3885 find_lively_task_by_vpid(pid_t vpid)
3886 {
3887         struct task_struct *task;
3888
3889         rcu_read_lock();
3890         if (!vpid)
3891                 task = current;
3892         else
3893                 task = find_task_by_vpid(vpid);
3894         if (task)
3895                 get_task_struct(task);
3896         rcu_read_unlock();
3897
3898         if (!task)
3899                 return ERR_PTR(-ESRCH);
3900
3901         return task;
3902 }
3903
3904 /*
3905  * Returns a matching context with refcount and pincount.
3906  */
3907 static struct perf_event_context *
3908 find_get_context(struct pmu *pmu, struct task_struct *task,
3909                 struct perf_event *event)
3910 {
3911         struct perf_event_context *ctx, *clone_ctx = NULL;
3912         struct perf_cpu_context *cpuctx;
3913         void *task_ctx_data = NULL;
3914         unsigned long flags;
3915         int ctxn, err;
3916         int cpu = event->cpu;
3917
3918         if (!task) {
3919                 /* Must be root to operate on a CPU event: */
3920                 if (perf_paranoid_cpu() && !capable(CAP_SYS_ADMIN))
3921                         return ERR_PTR(-EACCES);
3922
3923                 cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu);
3924                 ctx = &cpuctx->ctx;
3925                 get_ctx(ctx);
3926                 raw_spin_lock_irqsave(&ctx->lock, flags);
3927                 ++ctx->pin_count;
3928                 raw_spin_unlock_irqrestore(&ctx->lock, flags);
3929
3930                 return ctx;
3931         }
3932
3933         err = -EINVAL;
3934         ctxn = pmu->task_ctx_nr;
3935         if (ctxn < 0)
3936                 goto errout;
3937
3938         if (event->attach_state & PERF_ATTACH_TASK_DATA) {
3939                 task_ctx_data = kzalloc(pmu->task_ctx_size, GFP_KERNEL);
3940                 if (!task_ctx_data) {
3941                         err = -ENOMEM;
3942                         goto errout;
3943                 }
3944         }
3945
3946 retry:
3947         ctx = perf_lock_task_context(task, ctxn, &flags);
3948         if (ctx) {
3949                 clone_ctx = unclone_ctx(ctx);
3950                 ++ctx->pin_count;
3951
3952                 if (task_ctx_data && !ctx->task_ctx_data) {
3953                         ctx->task_ctx_data = task_ctx_data;
3954                         task_ctx_data = NULL;
3955                 }
3956                 raw_spin_unlock_irqrestore(&ctx->lock, flags);
3957
3958                 if (clone_ctx)
3959                         put_ctx(clone_ctx);
3960         } else {
3961                 ctx = alloc_perf_context(pmu, task);
3962                 err = -ENOMEM;
3963                 if (!ctx)
3964                         goto errout;
3965
3966                 if (task_ctx_data) {
3967                         ctx->task_ctx_data = task_ctx_data;
3968                         task_ctx_data = NULL;
3969                 }
3970
3971                 err = 0;
3972                 mutex_lock(&task->perf_event_mutex);
3973                 /*
3974                  * If it has already passed perf_event_exit_task(),
3975                  * we must see PF_EXITING; it takes this mutex too.
3976                  */
3977                 if (task->flags & PF_EXITING)
3978                         err = -ESRCH;
3979                 else if (task->perf_event_ctxp[ctxn])
3980                         err = -EAGAIN;
3981                 else {
3982                         get_ctx(ctx);
3983                         ++ctx->pin_count;
3984                         rcu_assign_pointer(task->perf_event_ctxp[ctxn], ctx);
3985                 }
3986                 mutex_unlock(&task->perf_event_mutex);
3987
3988                 if (unlikely(err)) {
3989                         put_ctx(ctx);
3990
3991                         if (err == -EAGAIN)
3992                                 goto retry;
3993                         goto errout;
3994                 }
3995         }
3996
3997         kfree(task_ctx_data);
3998         return ctx;
3999
4000 errout:
4001         kfree(task_ctx_data);
4002         return ERR_PTR(err);
4003 }
4004
4005 static void perf_event_free_filter(struct perf_event *event);
4006 static void perf_event_free_bpf_prog(struct perf_event *event);
4007
4008 static void free_event_rcu(struct rcu_head *head)
4009 {
4010         struct perf_event *event;
4011
4012         event = container_of(head, struct perf_event, rcu_head);
4013         if (event->ns)
4014                 put_pid_ns(event->ns);
4015         perf_event_free_filter(event);
4016         kfree(event);
4017 }
4018
4019 static void ring_buffer_attach(struct perf_event *event,
4020                                struct ring_buffer *rb);
4021
4022 static void detach_sb_event(struct perf_event *event)
4023 {
4024         struct pmu_event_list *pel = per_cpu_ptr(&pmu_sb_events, event->cpu);
4025
4026         raw_spin_lock(&pel->lock);
4027         list_del_rcu(&event->sb_list);
4028         raw_spin_unlock(&pel->lock);
4029 }
4030
4031 static bool is_sb_event(struct perf_event *event)
4032 {
4033         struct perf_event_attr *attr = &event->attr;
4034
4035         if (event->parent)
4036                 return false;
4037
4038         if (event->attach_state & PERF_ATTACH_TASK)
4039                 return false;
4040
4041         if (attr->mmap || attr->mmap_data || attr->mmap2 ||
4042             attr->comm || attr->comm_exec ||
4043             attr->task ||
4044             attr->context_switch)
4045                 return true;
4046         return false;
4047 }
4048
4049 static void unaccount_pmu_sb_event(struct perf_event *event)
4050 {
4051         if (is_sb_event(event))
4052                 detach_sb_event(event);
4053 }
4054
4055 static void unaccount_event_cpu(struct perf_event *event, int cpu)
4056 {
4057         if (event->parent)
4058                 return;
4059
4060         if (is_cgroup_event(event))
4061                 atomic_dec(&per_cpu(perf_cgroup_events, cpu));
4062 }
4063
4064 #ifdef CONFIG_NO_HZ_FULL
4065 static DEFINE_SPINLOCK(nr_freq_lock);
4066 #endif
4067
4068 static void unaccount_freq_event_nohz(void)
4069 {
4070 #ifdef CONFIG_NO_HZ_FULL
4071         spin_lock(&nr_freq_lock);
4072         if (atomic_dec_and_test(&nr_freq_events))
4073                 tick_nohz_dep_clear(TICK_DEP_BIT_PERF_EVENTS);
4074         spin_unlock(&nr_freq_lock);
4075 #endif
4076 }
4077
4078 static void unaccount_freq_event(void)
4079 {
4080         if (tick_nohz_full_enabled())
4081                 unaccount_freq_event_nohz();
4082         else
4083                 atomic_dec(&nr_freq_events);
4084 }
4085
4086 static void unaccount_event(struct perf_event *event)
4087 {
4088         bool dec = false;
4089
4090         if (event->parent)
4091                 return;
4092
4093         if (event->attach_state & PERF_ATTACH_TASK)
4094                 dec = true;
4095         if (event->attr.mmap || event->attr.mmap_data)
4096                 atomic_dec(&nr_mmap_events);
4097         if (event->attr.comm)
4098                 atomic_dec(&nr_comm_events);
4099         if (event->attr.namespaces)
4100                 atomic_dec(&nr_namespaces_events);
4101         if (event->attr.task)
4102                 atomic_dec(&nr_task_events);
4103         if (event->attr.freq)
4104                 unaccount_freq_event();
4105         if (event->attr.context_switch) {
4106                 dec = true;
4107                 atomic_dec(&nr_switch_events);
4108         }
4109         if (is_cgroup_event(event))
4110                 dec = true;
4111         if (has_branch_stack(event))
4112                 dec = true;
4113
4114         if (dec) {
4115                 if (!atomic_add_unless(&perf_sched_count, -1, 1))
4116                         schedule_delayed_work(&perf_sched_work, HZ);
4117         }
4118
4119         unaccount_event_cpu(event, event->cpu);
4120
4121         unaccount_pmu_sb_event(event);
4122 }
4123
4124 static void perf_sched_delayed(struct work_struct *work)
4125 {
4126         mutex_lock(&perf_sched_mutex);
4127         if (atomic_dec_and_test(&perf_sched_count))
4128                 static_branch_disable(&perf_sched_events);
4129         mutex_unlock(&perf_sched_mutex);
4130 }
4131
4132 /*
4133  * The following functions implement mutual exclusion of events on "exclusive" pmus
4134  * (PERF_PMU_CAP_EXCLUSIVE). Such pmus can only have one event scheduled
4135  * at a time, so we disallow creating events that might conflict, namely:
4136  *
4137  *  1) cpu-wide events in the presence of per-task events,
4138  *  2) per-task events in the presence of cpu-wide events,
4139  *  3) two matching events on the same context.
4140  *
4141  * The former two cases are handled in the allocation path (perf_event_alloc(),
4142  * _free_event()); the latter is checked before the first perf_install_in_context().
4143  */
4144 static int exclusive_event_init(struct perf_event *event)
4145 {
4146         struct pmu *pmu = event->pmu;
4147
4148         if (!(pmu->capabilities & PERF_PMU_CAP_EXCLUSIVE))
4149                 return 0;
4150
4151         /*
4152          * Prevent co-existence of per-task and cpu-wide events on the
4153          * same exclusive pmu.
4154          *
4155          * Negative pmu::exclusive_cnt means there are cpu-wide
4156          * events on this "exclusive" pmu, positive means there are
4157          * per-task events.
4158          *
4159          * Since this is called in perf_event_alloc() path, event::ctx
4160          * doesn't exist yet; it is, however, safe to use PERF_ATTACH_TASK
4161          * to mean "per-task event", because unlike other attach states it
4162          * never gets cleared.
4163          */
4164         if (event->attach_state & PERF_ATTACH_TASK) {
4165                 if (!atomic_inc_unless_negative(&pmu->exclusive_cnt))
4166                         return -EBUSY;
4167         } else {
4168                 if (!atomic_dec_unless_positive(&pmu->exclusive_cnt))
4169                         return -EBUSY;
4170         }
4171
4172         return 0;
4173 }
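
/*
 * Editorial illustration: on an exclusive PMU, two per-task events drive
 * pmu::exclusive_cnt to +2; a subsequent cpu-wide event then fails with
 * -EBUSY because atomic_dec_unless_positive() refuses to decrement a
 * positive count. Conversely, creating a cpu-wide event first
 * (exclusive_cnt == -1) makes any later per-task event fail the
 * atomic_inc_unless_negative() check.
 */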
4174
4175 static void exclusive_event_destroy(struct perf_event *event)
4176 {
4177         struct pmu *pmu = event->pmu;
4178
4179         if (!(pmu->capabilities & PERF_PMU_CAP_EXCLUSIVE))
4180                 return;
4181
4182         /* see comment in exclusive_event_init() */
4183         if (event->attach_state & PERF_ATTACH_TASK)
4184                 atomic_dec(&pmu->exclusive_cnt);
4185         else
4186                 atomic_inc(&pmu->exclusive_cnt);
4187 }
4188
4189 static bool exclusive_event_match(struct perf_event *e1, struct perf_event *e2)
4190 {
4191         if ((e1->pmu == e2->pmu) &&
4192             (e1->cpu == e2->cpu ||
4193              e1->cpu == -1 ||
4194              e2->cpu == -1))
4195                 return true;
4196         return false;
4197 }
4198
4199 /* Called under the same ctx::mutex as perf_install_in_context() */
4200 static bool exclusive_event_installable(struct perf_event *event,
4201                                         struct perf_event_context *ctx)
4202 {
4203         struct perf_event *iter_event;
4204         struct pmu *pmu = event->pmu;
4205
4206         if (!(pmu->capabilities & PERF_PMU_CAP_EXCLUSIVE))
4207                 return true;
4208
4209         list_for_each_entry(iter_event, &ctx->event_list, event_entry) {
4210                 if (exclusive_event_match(iter_event, event))
4211                         return false;
4212         }
4213
4214         return true;
4215 }
4216
4217 static void perf_addr_filters_splice(struct perf_event *event,
4218                                        struct list_head *head);
4219
4220 static void _free_event(struct perf_event *event)
4221 {
4222         irq_work_sync(&event->pending);
4223
4224         unaccount_event(event);
4225
4226         if (event->rb) {
4227                 /*
4228                  * Can happen when we close an event with re-directed output.
4229                  *
4230                  * Since we have a 0 refcount, perf_mmap_close() will skip
4231                  * over us; possibly making our ring_buffer_put() the last.
4232                  */
4233                 mutex_lock(&event->mmap_mutex);
4234                 ring_buffer_attach(event, NULL);
4235                 mutex_unlock(&event->mmap_mutex);
4236         }
4237
4238         if (is_cgroup_event(event))
4239                 perf_detach_cgroup(event);
4240
4241         if (!event->parent) {
4242                 if (event->attr.sample_type & PERF_SAMPLE_CALLCHAIN)
4243                         put_callchain_buffers();
4244         }
4245
4246         perf_event_free_bpf_prog(event);
4247         perf_addr_filters_splice(event, NULL);
4248         kfree(event->addr_filters_offs);
4249
4250         if (event->destroy)
4251                 event->destroy(event);
4252
4253         if (event->ctx)
4254                 put_ctx(event->ctx);
4255
4256         if (event->hw.target)
4257                 put_task_struct(event->hw.target);
4258
4259         exclusive_event_destroy(event);
4260         module_put(event->pmu->module);
4261
4262         call_rcu(&event->rcu_head, free_event_rcu);
4263 }
4264
4265 /*
4266  * Used to free events which have a known refcount of 1, such as in error paths
4267  * where the event isn't exposed yet, and for inherited events.
4268  */
4269 static void free_event(struct perf_event *event)
4270 {
4271         if (WARN(atomic_long_cmpxchg(&event->refcount, 1, 0) != 1,
4272                                 "unexpected event refcount: %ld; ptr=%p\n",
4273                                 atomic_long_read(&event->refcount), event)) {
4274                 /* leak to avoid use-after-free */
4275                 return;
4276         }
4277
4278         _free_event(event);
4279 }
4280
4281 /*
4282  * Remove user event from the owner task.
4283  */
4284 static void perf_remove_from_owner(struct perf_event *event)
4285 {
4286         struct task_struct *owner;
4287
4288         rcu_read_lock();
4289         /*
4290          * Matches the smp_store_release() in perf_event_exit_task(). If we
4291          * observe !owner it means the list deletion is complete and we can
4292          * indeed free this event; otherwise we need to serialize on
4293          * owner->perf_event_mutex.
4294          */
4295         owner = READ_ONCE(event->owner);
4296         if (owner) {
4297                 /*
4298                  * Since delayed_put_task_struct() also drops the last
4299                  * task reference we can safely take a new reference
4300                  * while holding the rcu_read_lock().
4301                  */
4302                 get_task_struct(owner);
4303         }
4304         rcu_read_unlock();
4305
4306         if (owner) {
4307                 /*
4308                  * If we're here through perf_event_exit_task() we're already
4309                  * holding ctx->mutex which would be an inversion wrt. the
4310                  * normal lock order.
4311                  *
4312                  * However we can safely take this lock because it's the child
4313                  * ctx->mutex.
4314                  */
4315                 mutex_lock_nested(&owner->perf_event_mutex, SINGLE_DEPTH_NESTING);
4316
4317                 /*
4318                  * We have to re-check the event->owner field; if it is cleared
4319                  * we raced with perf_event_exit_task(). Acquiring the mutex
4320                  * ensured they're done, and we can proceed with freeing the
4321                  * event.
4322                  */
4323                 if (event->owner) {
4324                         list_del_init(&event->owner_entry);
4325                         smp_store_release(&event->owner, NULL);
4326                 }
4327                 mutex_unlock(&owner->perf_event_mutex);
4328                 put_task_struct(owner);
4329         }
4330 }
4331
4332 static void put_event(struct perf_event *event)
4333 {
4334         if (!atomic_long_dec_and_test(&event->refcount))
4335                 return;
4336
4337         _free_event(event);
4338 }
4339
4340 /*
4341  * Kill an event dead; while event::refcount will preserve the event
4342  * object, it will not preserve its functionality. Once the last 'user'
4343  * gives up the object, we'll destroy the thing.
4344  */
4345 int perf_event_release_kernel(struct perf_event *event)
4346 {
4347         struct perf_event_context *ctx = event->ctx;
4348         struct perf_event *child, *tmp;
4349
4350         /*
4351          * If we got here through err_file: fput(event_file); we will not have
4352          * attached to a context yet.
4353          */
4354         if (!ctx) {
4355                 WARN_ON_ONCE(event->attach_state &
4356                                 (PERF_ATTACH_CONTEXT|PERF_ATTACH_GROUP));
4357                 goto no_ctx;
4358         }
4359
4360         if (!is_kernel_event(event))
4361                 perf_remove_from_owner(event);
4362
4363         ctx = perf_event_ctx_lock(event);
4364         WARN_ON_ONCE(ctx->parent_ctx);
4365         perf_remove_from_context(event, DETACH_GROUP);
4366
4367         raw_spin_lock_irq(&ctx->lock);
4368         /*
4369          * Mark this event as STATE_DEAD; there is no external reference to it
4370          * anymore.
4371          *
4372          * Anybody acquiring event->child_mutex after the below loop _must_
4373          * also see this, most importantly inherit_event() which will avoid
4374          * placing more children on the list.
4375          *
4376          * Thus this guarantees that we will in fact observe and kill _ALL_
4377          * child events.
4378          */
4379         event->state = PERF_EVENT_STATE_DEAD;
4380         raw_spin_unlock_irq(&ctx->lock);
4381
4382         perf_event_ctx_unlock(event, ctx);
4383
4384 again:
4385         mutex_lock(&event->child_mutex);
4386         list_for_each_entry(child, &event->child_list, child_list) {
4387
4388                 /*
4389                  * Cannot change, child events are not migrated, see the
4390                  * comment with perf_event_ctx_lock_nested().
4391                  */
4392                 ctx = READ_ONCE(child->ctx);
4393                 /*
4394                  * Since child_mutex nests inside ctx::mutex, we must jump
4395                  * through hoops. We start by grabbing a reference on the ctx.
4396                  *
4397                  * Since the event cannot get freed while we hold the
4398                  * child_mutex, the context must also exist and have a !0
4399                  * reference count.
4400                  */
4401                 get_ctx(ctx);
4402
4403                 /*
4404                  * Now that we have a ctx ref, we can drop child_mutex, and
4405                  * acquire ctx::mutex without fear of it going away. Then we
4406                  * can re-acquire child_mutex.
4407                  */
4408                 mutex_unlock(&event->child_mutex);
4409                 mutex_lock(&ctx->mutex);
4410                 mutex_lock(&event->child_mutex);
4411
4412                 /*
4413                  * Now that we hold ctx::mutex and child_mutex, revalidate our
4414                  * state; if the child is still the first entry, it didn't get freed
4415                  * and we can continue freeing it.
4416                  */
4417                 tmp = list_first_entry_or_null(&event->child_list,
4418                                                struct perf_event, child_list);
4419                 if (tmp == child) {
4420                         perf_remove_from_context(child, DETACH_GROUP);
4421                         list_del(&child->child_list);
4422                         free_event(child);
4423                         /*
4424                          * This matches the refcount bump in inherit_event();
4425                          * this can't be the last reference.
4426                          */
4427                         put_event(event);
4428                 }
4429
4430                 mutex_unlock(&event->child_mutex);
4431                 mutex_unlock(&ctx->mutex);
4432                 put_ctx(ctx);
4433                 goto again;
4434         }
4435         mutex_unlock(&event->child_mutex);
4436
4437 no_ctx:
4438         put_event(event); /* Must be the 'last' reference */
4439         return 0;
4440 }
4441 EXPORT_SYMBOL_GPL(perf_event_release_kernel);
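
/*
 * Usage sketch (editorial illustration, not part of the original source):
 * in-kernel users typically pair this with perf_event_create_kernel_counter().
 * The attribute values and the cpu argument below are arbitrary examples.
 */
static void example_kernel_counter(void)
{
        struct perf_event_attr attr = {
                .type   = PERF_TYPE_HARDWARE,
                .config = PERF_COUNT_HW_CPU_CYCLES,
                .size   = sizeof(attr),
        };
        struct perf_event *event;

        /* CPU-wide counter on CPU 0, no overflow handler */
        event = perf_event_create_kernel_counter(&attr, 0, NULL, NULL, NULL);
        if (IS_ERR(event))
                return;

        /* ... read or sample the event ... */

        perf_event_release_kernel(event);
}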
4442
4443 /*
4444  * Called when the last reference to the file is gone.
4445  */
4446 static int perf_release(struct inode *inode, struct file *file)
4447 {
4448         perf_event_release_kernel(file->private_data);
4449         return 0;
4450 }
4451
4452 u64 perf_event_read_value(struct perf_event *event, u64 *enabled, u64 *running)
4453 {
4454         struct perf_event *child;
4455         u64 total = 0;
4456
4457         *enabled = 0;
4458         *running = 0;
4459
4460         mutex_lock(&event->child_mutex);
4461
4462         (void)perf_event_read(event, false);
4463         total += perf_event_count(event);
4464
4465         *enabled += event->total_time_enabled +
4466                         atomic64_read(&event->child_total_time_enabled);
4467         *running += event->total_time_running +
4468                         atomic64_read(&event->child_total_time_running);
4469
4470         list_for_each_entry(child, &event->child_list, child_list) {
4471                 (void)perf_event_read(child, false);
4472                 total += perf_event_count(child);
4473                 *enabled += child->total_time_enabled;
4474                 *running += child->total_time_running;
4475         }
4476         mutex_unlock(&event->child_mutex);
4477
4478         return total;
4479 }
4480 EXPORT_SYMBOL_GPL(perf_event_read_value);
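
/*
 * Usage sketch (editorial illustration, not part of the original source):
 * reading an in-kernel counter together with its enabled/running times,
 * e.g. to scale the count when the event was multiplexed. Overflow of the
 * intermediate product is ignored for brevity.
 */
static u64 example_scaled_count(struct perf_event *event)
{
        u64 enabled, running, count;

        count = perf_event_read_value(event, &enabled, &running);
        if (running && running < enabled)
                count = div64_u64(count * enabled, running);

        return count;
}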
4481
4482 static int __perf_read_group_add(struct perf_event *leader,
4483                                         u64 read_format, u64 *values)
4484 {
4485         struct perf_event_context *ctx = leader->ctx;
4486         struct perf_event *sub;
4487         unsigned long flags;
4488         int n = 1; /* skip @nr */
4489         int ret;
4490
4491         ret = perf_event_read(leader, true);
4492         if (ret)
4493                 return ret;
4494
4495         raw_spin_lock_irqsave(&ctx->lock, flags);
4496
4497         /*
4498          * Since we co-schedule groups, {enabled,running} times of siblings
4499          * will be identical to those of the leader, so we only publish one
4500          * set.
4501          */
4502         if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) {
4503                 values[n++] += leader->total_time_enabled +
4504                         atomic64_read(&leader->child_total_time_enabled);
4505         }
4506
4507         if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) {
4508                 values[n++] += leader->total_time_running +
4509                         atomic64_read(&leader->child_total_time_running);
4510         }
4511
4512         /*
4513          * Write {count,id} tuples for every sibling.
4514          */
4515         values[n++] += perf_event_count(leader);
4516         if (read_format & PERF_FORMAT_ID)
4517                 values[n++] = primary_event_id(leader);
4518
4519         list_for_each_entry(sub, &leader->sibling_list, group_entry) {
4520                 values[n++] += perf_event_count(sub);
4521                 if (read_format & PERF_FORMAT_ID)
4522                         values[n++] = primary_event_id(sub);
4523         }
4524
4525         raw_spin_unlock_irqrestore(&ctx->lock, flags);
4526         return 0;
4527 }
4528
4529 static int perf_read_group(struct perf_event *event,
4530                                    u64 read_format, char __user *buf)
4531 {
4532         struct perf_event *leader = event->group_leader, *child;
4533         struct perf_event_context *ctx = leader->ctx;
4534         int ret;
4535         u64 *values;
4536
4537         lockdep_assert_held(&ctx->mutex);
4538
4539         values = kzalloc(event->read_size, GFP_KERNEL);
4540         if (!values)
4541                 return -ENOMEM;
4542
4543         values[0] = 1 + leader->nr_siblings;
4544
4545         /*
4546          * By locking the child_mutex of the leader we effectively
4547          * lock the child list of all siblings. XXX: explain how.
4548          */
4549         mutex_lock(&leader->child_mutex);
4550
4551         ret = __perf_read_group_add(leader, read_format, values);
4552         if (ret)
4553                 goto unlock;
4554
4555         list_for_each_entry(child, &leader->child_list, child_list) {
4556                 ret = __perf_read_group_add(child, read_format, values);
4557                 if (ret)
4558                         goto unlock;
4559         }
4560
4561         mutex_unlock(&leader->child_mutex);
4562
4563         ret = event->read_size;
4564         if (copy_to_user(buf, values, event->read_size))
4565                 ret = -EFAULT;
4566         goto out;
4567
4568 unlock:
4569         mutex_unlock(&leader->child_mutex);
4570 out:
4571         kfree(values);
4572         return ret;
4573 }
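
/*
 * Userspace view (editorial illustration): with PERF_FORMAT_GROUP plus
 * PERF_FORMAT_ID set in attr.read_format, a read() on the group leader's fd
 * returns, in order:
 *
 *      { u64 nr;
 *        u64 time_enabled;             // if PERF_FORMAT_TOTAL_TIME_ENABLED
 *        u64 time_running;             // if PERF_FORMAT_TOTAL_TIME_RUNNING
 *        { u64 value; u64 id; } cntr[nr];
 *      }
 *
 * which matches the values[] layout filled in above.
 */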
4574
4575 static int perf_read_one(struct perf_event *event,
4576                                  u64 read_format, char __user *buf)
4577 {
4578         u64 enabled, running;
4579         u64 values[4];
4580         int n = 0;
4581
4582         values[n++] = perf_event_read_value(event, &enabled, &running);
4583         if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED)
4584                 values[n++] = enabled;
4585         if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING)
4586                 values[n++] = running;
4587         if (read_format & PERF_FORMAT_ID)
4588                 values[n++] = primary_event_id(event);
4589
4590         if (copy_to_user(buf, values, n * sizeof(u64)))
4591                 return -EFAULT;
4592
4593         return n * sizeof(u64);
4594 }
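
/*
 * Userspace view (editorial illustration): without PERF_FORMAT_GROUP, a
 * read() on the event fd returns up to four u64 values in this order:
 *
 *      struct {
 *              u64 value;
 *              u64 time_enabled;       // if PERF_FORMAT_TOTAL_TIME_ENABLED
 *              u64 time_running;       // if PERF_FORMAT_TOTAL_TIME_RUNNING
 *              u64 id;                 // if PERF_FORMAT_ID
 *      };
 */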
4595
4596 static bool is_event_hup(struct perf_event *event)
4597 {
4598         bool no_children;
4599
4600         if (event->state > PERF_EVENT_STATE_EXIT)
4601                 return false;
4602
4603         mutex_lock(&event->child_mutex);
4604         no_children = list_empty(&event->child_list);
4605         mutex_unlock(&event->child_mutex);
4606         return no_children;
4607 }
4608
4609 /*
4610  * Read the performance event - simple non-blocking version for now
4611  */
4612 static ssize_t
4613 __perf_read(struct perf_event *event, char __user *buf, size_t count)
4614 {
4615         u64 read_format = event->attr.read_format;
4616         int ret;
4617
4618         /*
4619          * Return end-of-file for a read on an event that is in
4620          * error state (i.e. because it was pinned but it couldn't be
4621          * scheduled onto the CPU at some point).
4622          */
4623         if (event->state == PERF_EVENT_STATE_ERROR)
4624                 return 0;
4625
4626         if (count < event->read_size)
4627                 return -ENOSPC;
4628
4629         WARN_ON_ONCE(event->ctx->parent_ctx);
4630         if (read_format & PERF_FORMAT_GROUP)
4631                 ret = perf_read_group(event, read_format, buf);
4632         else
4633                 ret = perf_read_one(event, read_format, buf);
4634
4635         return ret;
4636 }
4637
4638 static ssize_t
4639 perf_read(struct file *file, char __user *buf, size_t count, loff_t *ppos)
4640 {
4641         struct perf_event *event = file->private_data;
4642         struct perf_event_context *ctx;
4643         int ret;
4644
4645         ctx = perf_event_ctx_lock(event);
4646         ret = __perf_read(event, buf, count);
4647         perf_event_ctx_unlock(event, ctx);
4648
4649         return ret;
4650 }
4651
4652 static unsigned int perf_poll(struct file *file, poll_table *wait)
4653 {
4654         struct perf_event *event = file->private_data;
4655         struct ring_buffer *rb;
4656         unsigned int events = POLLHUP;
4657
4658         poll_wait(file, &event->waitq, wait);
4659
4660         if (is_event_hup(event))
4661                 return events;
4662
4663         /*
4664          * Pin the event->rb by taking event->mmap_mutex; otherwise
4665          * perf_event_set_output() can swizzle our rb and make us miss wakeups.
4666          */
4667         mutex_lock(&event->mmap_mutex);
4668         rb = event->rb;
4669         if (rb)
4670                 events = atomic_xchg(&rb->poll, 0);
4671         mutex_unlock(&event->mmap_mutex);
4672         return events;
4673 }
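
/*
 * Userspace view (editorial illustration): waiting for ring-buffer wakeups
 * on an mmap()ed event fd; POLLHUP signals that the event (and all of its
 * children) is dead. handle_event_gone() is a hypothetical callback.
 *
 *      struct pollfd pfd = { .fd = event_fd, .events = POLLIN };
 *
 *      poll(&pfd, 1, -1);
 *      if (pfd.revents & POLLHUP)
 *              handle_event_gone();
 */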
4674
4675 static void _perf_event_reset(struct perf_event *event)
4676 {
4677         (void)perf_event_read(event, false);
4678         local64_set(&event->count, 0);
4679         perf_event_update_userpage(event);
4680 }
4681
4682 /*
4683  * Holding the top-level event's child_mutex means that any
4684  * descendant process that has inherited this event will block
4685  * in perf_event_exit_event() if it goes to exit, thus satisfying the
4686  * task existence requirements of perf_event_enable/disable.
4687  */
4688 static void perf_event_for_each_child(struct perf_event *event,
4689                                         void (*func)(struct perf_event *))
4690 {
4691         struct perf_event *child;
4692
4693         WARN_ON_ONCE(event->ctx->parent_ctx);
4694
4695         mutex_lock(&event->child_mutex);
4696         func(event);
4697         list_for_each_entry(child, &event->child_list, child_list)
4698                 func(child);
4699         mutex_unlock(&event->child_mutex);
4700 }
4701
4702 static void perf_event_for_each(struct perf_event *event,
4703                                   void (*func)(struct perf_event *))
4704 {
4705         struct perf_event_context *ctx = event->ctx;
4706         struct perf_event *sibling;
4707
4708         lockdep_assert_held(&ctx->mutex);
4709
4710         event = event->group_leader;
4711
4712         perf_event_for_each_child(event, func);
4713         list_for_each_entry(sibling, &event->sibling_list, group_entry)
4714                 perf_event_for_each_child(sibling, func);
4715 }
4716
4717 static void __perf_event_period(struct perf_event *event,
4718                                 struct perf_cpu_context *cpuctx,
4719                                 struct perf_event_context *ctx,
4720                                 void *info)
4721 {
4722         u64 value = *((u64 *)info);
4723         bool active;
4724
4725         if (event->attr.freq) {
4726                 event->attr.sample_freq = value;
4727         } else {
4728                 event->attr.sample_period = value;
4729                 event->hw.sample_period = value;
4730         }
4731
4732         active = (event->state == PERF_EVENT_STATE_ACTIVE);
4733         if (active) {
4734                 perf_pmu_disable(ctx->pmu);
4735                 /*
4736                  * We could be throttled; unthrottle now to avoid the tick
4737                  * trying to unthrottle while we already re-started the event.
4738                  */
4739                 if (event->hw.interrupts == MAX_INTERRUPTS) {
4740                         event->hw.interrupts = 0;
4741                         perf_log_throttle(event, 1);
4742                 }
4743                 event->pmu->stop(event, PERF_EF_UPDATE);
4744         }
4745
4746         local64_set(&event->hw.period_left, 0);
4747
4748         if (active) {
4749                 event->pmu->start(event, PERF_EF_RELOAD);
4750                 perf_pmu_enable(ctx->pmu);
4751         }
4752 }
4753
4754 static int perf_event_check_period(struct perf_event *event, u64 value)
4755 {
4756         return event->pmu->check_period(event, value);
4757 }
4758
4759 static int perf_event_period(struct perf_event *event, u64 __user *arg)
4760 {
4761         u64 value;
4762
4763         if (!is_sampling_event(event))
4764                 return -EINVAL;
4765
4766         if (copy_from_user(&value, arg, sizeof(value)))
4767                 return -EFAULT;
4768
4769         if (!value)
4770                 return -EINVAL;
4771
4772         if (event->attr.freq && value > sysctl_perf_event_sample_rate)
4773                 return -EINVAL;
4774
4775         if (perf_event_check_period(event, value))
4776                 return -EINVAL;
4777
4778         if (!event->attr.freq && (value & (1ULL << 63)))
4779                 return -EINVAL;
4780
4781         event_function_call(event, __perf_event_period, &value);
4782
4783         return 0;
4784 }
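
/*
 * Illustrative userspace sketch (not kernel code): the update path above
 * backs the PERF_EVENT_IOC_PERIOD ioctl.  Assuming "fd" came from
 * perf_event_open() for a sampling event, the period (or, with attr.freq
 * set, the frequency) can be retuned on the fly:
 *
 *	__u64 new_period = 100000;
 *
 *	if (ioctl(fd, PERF_EVENT_IOC_PERIOD, &new_period))
 *		perror("PERF_EVENT_IOC_PERIOD");
 *
 * A zero value, a frequency above the kernel.perf_event_max_sample_rate
 * sysctl, or a period the PMU rejects through check_period() yields
 * -EINVAL, per the checks above.
 */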
4785
4786 static const struct file_operations perf_fops;
4787
4788 static inline int perf_fget_light(int fd, struct fd *p)
4789 {
4790         struct fd f = fdget(fd);
4791         if (!f.file)
4792                 return -EBADF;
4793
4794         if (f.file->f_op != &perf_fops) {
4795                 fdput(f);
4796                 return -EBADF;
4797         }
4798         *p = f;
4799         return 0;
4800 }
4801
4802 static int perf_event_set_output(struct perf_event *event,
4803                                  struct perf_event *output_event);
4804 static int perf_event_set_filter(struct perf_event *event, void __user *arg);
4805 static int perf_event_set_bpf_prog(struct perf_event *event, u32 prog_fd);
4806
4807 static long _perf_ioctl(struct perf_event *event, unsigned int cmd, unsigned long arg)
4808 {
4809         void (*func)(struct perf_event *);
4810         u32 flags = arg;
4811
4812         switch (cmd) {
4813         case PERF_EVENT_IOC_ENABLE:
4814                 func = _perf_event_enable;
4815                 break;
4816         case PERF_EVENT_IOC_DISABLE:
4817                 func = _perf_event_disable;
4818                 break;
4819         case PERF_EVENT_IOC_RESET:
4820                 func = _perf_event_reset;
4821                 break;
4822
4823         case PERF_EVENT_IOC_REFRESH:
4824                 return _perf_event_refresh(event, arg);
4825
4826         case PERF_EVENT_IOC_PERIOD:
4827                 return perf_event_period(event, (u64 __user *)arg);
4828
4829         case PERF_EVENT_IOC_ID:
4830         {
4831                 u64 id = primary_event_id(event);
4832
4833                 if (copy_to_user((void __user *)arg, &id, sizeof(id)))
4834                         return -EFAULT;
4835                 return 0;
4836         }
4837
4838         case PERF_EVENT_IOC_SET_OUTPUT:
4839         {
4840                 int ret;
4841                 if (arg != -1) {
4842                         struct perf_event *output_event;
4843                         struct fd output;
4844                         ret = perf_fget_light(arg, &output);
4845                         if (ret)
4846                                 return ret;
4847                         output_event = output.file->private_data;
4848                         ret = perf_event_set_output(event, output_event);
4849                         fdput(output);
4850                 } else {
4851                         ret = perf_event_set_output(event, NULL);
4852                 }
4853                 return ret;
4854         }
4855
4856         case PERF_EVENT_IOC_SET_FILTER:
4857                 return perf_event_set_filter(event, (void __user *)arg);
4858
4859         case PERF_EVENT_IOC_SET_BPF:
4860                 return perf_event_set_bpf_prog(event, arg);
4861
4862         case PERF_EVENT_IOC_PAUSE_OUTPUT: {
4863                 struct ring_buffer *rb;
4864
4865                 rcu_read_lock();
4866                 rb = rcu_dereference(event->rb);
4867                 if (!rb || !rb->nr_pages) {
4868                         rcu_read_unlock();
4869                         return -EINVAL;
4870                 }
4871                 rb_toggle_paused(rb, !!arg);
4872                 rcu_read_unlock();
4873                 return 0;
4874         }
4875         default:
4876                 return -ENOTTY;
4877         }
4878
4879         if (flags & PERF_IOC_FLAG_GROUP)
4880                 perf_event_for_each(event, func);
4881         else
4882                 perf_event_for_each_child(event, func);
4883
4884         return 0;
4885 }
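
/*
 * Illustrative userspace sketch (not kernel code): ENABLE, DISABLE and
 * RESET accept the PERF_IOC_FLAG_GROUP argument, which routes them through
 * perf_event_for_each() so the whole group is affected rather than just
 * this event and its inherited children.  Assuming "group_fd" is the fd of
 * the group leader:
 *
 *	ioctl(group_fd, PERF_EVENT_IOC_RESET,   PERF_IOC_FLAG_GROUP);
 *	ioctl(group_fd, PERF_EVENT_IOC_ENABLE,  PERF_IOC_FLAG_GROUP);
 *	run_workload();
 *	ioctl(group_fd, PERF_EVENT_IOC_DISABLE, PERF_IOC_FLAG_GROUP);
 *
 * run_workload() is only a placeholder for the code being measured.
 */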
4886
4887 static long perf_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
4888 {
4889         struct perf_event *event = file->private_data;
4890         struct perf_event_context *ctx;
4891         long ret;
4892
4893         ctx = perf_event_ctx_lock(event);
4894         ret = _perf_ioctl(event, cmd, arg);
4895         perf_event_ctx_unlock(event, ctx);
4896
4897         return ret;
4898 }
4899
4900 #ifdef CONFIG_COMPAT
4901 static long perf_compat_ioctl(struct file *file, unsigned int cmd,
4902                                 unsigned long arg)
4903 {
4904         switch (_IOC_NR(cmd)) {
4905         case _IOC_NR(PERF_EVENT_IOC_SET_FILTER):
4906         case _IOC_NR(PERF_EVENT_IOC_ID):
4907                 /* Fix up pointer size (usually 4 -> 8 in 32-on-64-bit case) */
4908                 if (_IOC_SIZE(cmd) == sizeof(compat_uptr_t)) {
4909                         cmd &= ~IOCSIZE_MASK;
4910                         cmd |= sizeof(void *) << IOCSIZE_SHIFT;
4911                 }
4912                 break;
4913         }
4914         return perf_ioctl(file, cmd, arg);
4915 }
4916 #else
4917 # define perf_compat_ioctl NULL
4918 #endif
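
/*
 * A concrete example of the fixup above: PERF_EVENT_IOC_SET_FILTER is
 * _IOW('$', 6, char *), so a 64-bit kernel expects _IOC_SIZE() == 8, while
 * a 32-bit task encodes the same ioctl with a 4-byte compat_uptr_t.
 * Rewriting the size field to sizeof(void *) lets the native switch in
 * _perf_ioctl() recognise the command.
 */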
4919
4920 int perf_event_task_enable(void)
4921 {
4922         struct perf_event_context *ctx;
4923         struct perf_event *event;
4924
4925         mutex_lock(&current->perf_event_mutex);
4926         list_for_each_entry(event, &current->perf_event_list, owner_entry) {
4927                 ctx = perf_event_ctx_lock(event);
4928                 perf_event_for_each_child(event, _perf_event_enable);
4929                 perf_event_ctx_unlock(event, ctx);
4930         }
4931         mutex_unlock(&current->perf_event_mutex);
4932
4933         return 0;
4934 }
4935
4936 int perf_event_task_disable(void)
4937 {
4938         struct perf_event_context *ctx;
4939         struct perf_event *event;
4940
4941         mutex_lock(&current->perf_event_mutex);
4942         list_for_each_entry(event, &current->perf_event_list, owner_entry) {
4943                 ctx = perf_event_ctx_lock(event);
4944                 perf_event_for_each_child(event, _perf_event_disable);
4945                 perf_event_ctx_unlock(event, ctx);
4946         }
4947         mutex_unlock(&current->perf_event_mutex);
4948
4949         return 0;
4950 }
4951
4952 static int perf_event_index(struct perf_event *event)
4953 {
4954         if (event->hw.state & PERF_HES_STOPPED)
4955                 return 0;
4956
4957         if (event->state != PERF_EVENT_STATE_ACTIVE)
4958                 return 0;
4959
4960         return event->pmu->event_idx(event);
4961 }
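
/*
 * Note: the index returned here is what ends up in
 * perf_event_mmap_page::index via perf_event_update_userpage() below;
 * zero means "no counter that userspace can read directly", and PMUs that
 * support self-monitoring reads typically return the hardware counter
 * index biased by one.
 */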
4962
4963 static void calc_timer_values(struct perf_event *event,
4964                                 u64 *now,
4965                                 u64 *enabled,
4966                                 u64 *running)
4967 {
4968         u64 ctx_time;
4969
4970         *now = perf_clock();
4971         ctx_time = event->shadow_ctx_time + *now;
4972         *enabled = ctx_time - event->tstamp_enabled;
4973         *running = ctx_time - event->tstamp_running;
4974 }
4975
4976 static void perf_event_init_userpage(struct perf_event *event)
4977 {
4978         struct perf_event_mmap_page *userpg;
4979         struct ring_buffer *rb;
4980
4981         rcu_read_lock();
4982         rb = rcu_dereference(event->rb);
4983         if (!rb)
4984                 goto unlock;
4985
4986         userpg = rb->user_page;
4987
4988         /* Allow new userspace to detect that bit 0 is deprecated */
4989         userpg->cap_bit0_is_deprecated = 1;
4990         userpg->size = offsetof(struct perf_event_mmap_page, __reserved);
4991         userpg->data_offset = PAGE_SIZE;
4992         userpg->data_size = perf_data_size(rb);
4993
4994 unlock:
4995         rcu_read_unlock();
4996 }
4997
4998 void __weak arch_perf_update_userpage(
4999         struct perf_event *event, struct perf_event_mmap_page *userpg, u64 now)
5000 {
5001 }
5002
5003 /*
5004  * Callers need to ensure there can be no nesting of this function, otherwise
5005  * the seqlock logic goes bad. We cannot serialize this because the arch
5006  * code calls this from NMI context.
5007  */
5008 void perf_event_update_userpage(struct perf_event *event)
5009 {
5010         struct perf_event_mmap_page *userpg;
5011         struct ring_buffer *rb;
5012         u64 enabled, running, now;
5013
5014         rcu_read_lock();
5015         rb = rcu_dereference(event->rb);
5016         if (!rb)
5017                 goto unlock;
5018
5019         /*
5020          * compute total_time_enabled, total_time_running
5021          * based on snapshot values taken when the event
5022          * was last scheduled in.
5023          *
5024          * we cannot simply call update_context_time()
5025          * because of locking issues, as we can be called in
5026          * NMI context
5027          */
5028         calc_timer_values(event, &now, &enabled, &running);
5029
5030         userpg = rb->user_page;
5031         /*
5032          * Disable preemption so as to not let the corresponding user-space
5033          * spin too long if we get preempted.
5034          */
5035         preempt_disable();
5036         ++userpg->lock;
5037         barrier();
5038         userpg->index = perf_event_index(event);
5039         userpg->offset = perf_event_count(event);
5040         if (userpg->index)
5041                 userpg->offset -= local64_read(&event->hw.prev_count);
5042
5043         userpg->time_enabled = enabled +
5044                         atomic64_read(&event->child_total_time_enabled);
5045
5046         userpg->time_running = running +
5047                         atomic64_read(&event->child_total_time_running);
5048
5049         arch_perf_update_userpage(event, userpg, now);
5050
5051         barrier();
5052         ++userpg->lock;
5053         preempt_enable();
5054 unlock:
5055         rcu_read_unlock();
5056 }
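
/*
 * Illustrative userspace sketch (not kernel code): the lock increments and
 * barriers above form a seqcount that readers of the mmap()ed control page
 * are expected to mirror.  Assuming "pc" points at the first mapped page
 * (struct perf_event_mmap_page):
 *
 *	__u32 seq, idx;
 *	__s64 count;
 *
 *	do {
 *		seq = pc->lock;
 *		barrier();
 *
 *		idx = pc->index;
 *		count = pc->offset;
 *		if (pc->cap_user_rdpmc && idx)
 *			count += read_pmc(idx - 1);
 *
 *		barrier();
 *	} while (pc->lock != seq);
 *
 * read_pmc() stands in for an architecture-specific counter read (RDPMC on
 * x86) and is only a placeholder here; without it, a read(2) on the event
 * fd returns the same value at higher cost.
 */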
5057
5058 static int perf_mmap_fault(struct vm_fault *vmf)
5059 {
5060         struct perf_event *event = vmf->vma->vm_file->private_data;
5061         struct ring_buffer *rb;
5062         int ret = VM_FAULT_SIGBUS;
5063
5064         if (vmf->flags & FAULT_FLAG_MKWRITE) {
5065                 if (vmf->pgoff == 0)
5066                         ret = 0;
5067                 return ret;
5068         }
5069
5070         rcu_read_lock();
5071         rb = rcu_dereference(event->rb);
5072         if (!rb)
5073                 goto unlock;
5074
5075         if (vmf->pgoff && (vmf->flags & FAULT_FLAG_WRITE))
5076                 goto unlock;
5077
5078         vmf->page = perf_mmap_to_page(rb, vmf->pgoff);
5079         if (!vmf->page)
5080                 goto unlock;
5081
5082         get_page(vmf->page);
5083         vmf->page->mapping = vmf->vma->vm_file->f_mapping;
5084         vmf->page->index   = vmf->pgoff;
5085
5086         ret = 0;
5087 unlock:
5088         rcu_read_unlock();
5089
5090         return ret;
5091 }
5092
5093 static void ring_buffer_attach(struct perf_event *event,
5094                                struct ring_buffer *rb)
5095 {
5096         struct ring_buffer *old_rb = NULL;
5097         unsigned long flags;
5098
5099         if (event->rb) {
5100                 /*
5101                  * Should be impossible, we set this when removing
5102                  * event->rb_entry and wait/clear when adding event->rb_entry.
5103                  */
5104                 WARN_ON_ONCE(event->rcu_pending);
5105
5106                 old_rb = event->rb;
5107                 spin_lock_irqsave(&old_rb->event_lock, flags);
5108                 list_del_rcu(&event->rb_entry);
5109                 spin_unlock_irqrestore(&old_rb->event_lock, flags);
5110
5111                 event->rcu_batches = get_state_synchronize_rcu();
5112                 event->rcu_pending = 1;
5113         }
5114
5115         if (rb) {
5116                 if (event->rcu_pending) {
5117                         cond_synchronize_rcu(event->rcu_batches);
5118                         event->rcu_pending = 0;
5119                 }
5120
5121                 spin_lock_irqsave(&rb->event_lock, flags);
5122                 list_add_rcu(&event->rb_entry, &rb->event_list);
5123                 spin_unlock_irqrestore(&rb->event_lock, flags);
5124         }
5125
5126         /*
5127          * Avoid racing with perf_mmap_close(AUX): stop the event
5128          * before swizzling the event::rb pointer; if it's getting
5129          * unmapped, its aux_mmap_count will be 0 and it won't
5130          * restart. See the comment in __perf_pmu_output_stop().
5131          *
5132          * Data will inevitably be lost when set_output is done in
5133          * mid-air, but then again, whoever does it like this is
5134          * not in for the data anyway.
5135          */
5136         if (has_aux(event))
5137                 perf_event_stop(event, 0);
5138
5139         rcu_assign_pointer(event->rb, rb);
5140
5141         if (old_rb) {
5142                 ring_buffer_put(old_rb);
5143                 /*
5144                  * Since we had to detach the old rb before we could attach
5145                  * the new one, we could have missed a wakeup.
5146                  * Provide it now.
5147                  */
5148                 wake_up_all(&event->waitq);
5149         }
5150 }
5151
5152 static void ring_buffer_wakeup(struct perf_event *event)
5153 {
5154         struct ring_buffer *rb;
5155
5156         rcu_read_lock();
5157         rb = rcu_dereference(event->rb);
5158         if (rb) {
5159                 list_for_each_entry_rcu(event, &rb->event_list, rb_entry)
5160                         wake_up_all(&event->waitq);
5161         }
5162         rcu_read_unlock();
5163 }
5164
5165 struct ring_buffer *ring_buffer_get(struct perf_event *event)
5166 {
5167         struct ring_buffer *rb;
5168
5169         rcu_read_lock();
5170         rb = rcu_dereference(event->rb);
5171         if (rb) {
5172                 if (!atomic_inc_not_zero(&rb->refcount))
5173                         rb = NULL;
5174         }
5175         rcu_read_unlock();
5176
5177         return rb;
5178 }
5179
5180 void ring_buffer_put(struct ring_buffer *rb)
5181 {
5182         if (!atomic_dec_and_test(&rb->refcount))
5183                 return;
5184
5185         WARN_ON_ONCE(!list_empty(&rb->event_list));
5186
5187         call_rcu(&rb->rcu_head, rb_free_rcu);
5188 }
5189
5190 static void perf_mmap_open(struct vm_area_struct *vma)
5191 {
5192         struct perf_event *event = vma->vm_file->private_data;
5193
5194         atomic_inc(&event->mmap_count);
5195         atomic_inc(&event->rb->mmap_count);
5196
5197         if (vma->vm_pgoff)
5198                 atomic_inc(&event->rb->aux_mmap_count);
5199
5200         if (event->pmu->event_mapped)
5201                 event->pmu->event_mapped(event, vma->vm_mm);
5202 }
5203
5204 static void perf_pmu_output_stop(struct perf_event *event);
5205
5206 /*
5207  * A buffer can be mmap()ed multiple times; either directly through the same
5208  * event, or through other events by use of perf_event_set_output().
5209  *
5210  * In order to undo the VM accounting done by perf_mmap() we need to destroy
5211  * the buffer here, where we still have a VM context. This means we need
5212  * to detach all events redirecting to us.
5213  */
5214 static void perf_mmap_close(struct vm_area_struct *vma)
5215 {
5216         struct perf_event *event = vma->vm_file->private_data;
5217         struct ring_buffer *rb = ring_buffer_get(event);
5218         struct user_struct *mmap_user = rb->mmap_user;
5219         int mmap_locked = rb->mmap_locked;
5220         unsigned long size = perf_data_size(rb);
5221         bool detach_rest = false;
5222
5223         if (event->pmu->event_unmapped)
5224                 event->pmu->event_unmapped(event, vma->vm_mm);
5225
5226         /*
5227          * rb->aux_mmap_count will always drop before rb->mmap_count and
5228          * event->mmap_count, so it is ok to use event->mmap_mutex to
5229          * serialize with perf_mmap here.
5230          */
5231         if (rb_has_aux(rb) && vma->vm_pgoff == rb->aux_pgoff &&
5232             atomic_dec_and_mutex_lock(&rb->aux_mmap_count, &event->mmap_mutex)) {
5233                 /*
5234                  * Stop all AUX events that are writing to this buffer,
5235                  * so that we can free its AUX pages and corresponding PMU
5236                  * data. Note that after rb::aux_mmap_count dropped to zero,
5237                  * they won't start any more (see perf_aux_output_begin()).
5238                  */
5239                 perf_pmu_output_stop(event);
5240
5241                 /* now it's safe to free the pages */
5242                 atomic_long_sub(rb->aux_nr_pages, &mmap_user->locked_vm);
5243                 vma->vm_mm->pinned_vm -= rb->aux_mmap_locked;
5244
5245                 /* this has to be the last one */
5246                 rb_free_aux(rb);
5247                 WARN_ON_ONCE(atomic_read(&rb->aux_refcount));
5248
5249                 mutex_unlock(&event->mmap_mutex);
5250         }
5251
5252         if (atomic_dec_and_test(&rb->mmap_count))
5253                 detach_rest = true;
5254
5255         if (!atomic_dec_and_mutex_lock(&event->mmap_count, &event->mmap_mutex))
5256                 goto out_put;
5257
5258         ring_buffer_attach(event, NULL);
5259         mutex_unlock(&event->mmap_mutex);
5260
5261         /* If there's still other mmap()s of this buffer, we're done. */
5262         if (!detach_rest)
5263                 goto out_put;
5264
5265         /*
5266          * No other mmap()s, detach from all other events that might redirect
5267          * into the now unreachable buffer. Somewhat complicated by the
5268          * fact that rb::event_lock otherwise nests inside mmap_mutex.
5269          */
5270 again:
5271         rcu_read_lock();
5272         list_for_each_entry_rcu(event, &rb->event_list, rb_entry) {
5273                 if (!atomic_long_inc_not_zero(&event->refcount)) {
5274                         /*
5275                          * This event is en-route to free_event() which will
5276                          * detach it and remove it from the list.
5277                          */
5278                         continue;
5279                 }
5280                 rcu_read_unlock();
5281
5282                 mutex_lock(&event->mmap_mutex);
5283                 /*
5284                  * Check we didn't race with perf_event_set_output() which can
5285                  * swizzle the rb from under us while we were waiting to
5286                  * acquire mmap_mutex.
5287                  *
5288                  * If we find a different rb, ignore this event; a later
5289                  * iteration will no longer find it on the list. We still
5290                  * have to restart the iteration to make sure we're not now
5291                  * iterating the wrong list.
5292                  */
5293                 if (event->rb == rb)
5294                         ring_buffer_attach(event, NULL);
5295
5296                 mutex_unlock(&event->mmap_mutex);
5297                 put_event(event);
5298
5299                 /*
5300                  * Restart the iteration; either we're on the wrong list or
5301                  * we destroyed its integrity by doing a deletion.
5302                  */
5303                 goto again;
5304         }
5305         rcu_read_unlock();
5306
5307         /*
5308          * It could be that there are still a few 0-ref events on the list; they'll
5309          * get cleaned up by free_event() -- they'll also still have their
5310          * ref on the rb and will free it whenever they are done with it.
5311          *
5312          * Aside from that, this buffer is 'fully' detached and unmapped,
5313          * undo the VM accounting.
5314          */
5315
5316         atomic_long_sub((size >> PAGE_SHIFT) + 1, &mmap_user->locked_vm);
5317         vma->vm_mm->pinned_vm -= mmap_locked;
5318         free_uid(mmap_user);
5319
5320 out_put:
5321         ring_buffer_put(rb); /* could be last */
5322 }
5323
5324 static const struct vm_operations_struct perf_mmap_vmops = {
5325         .open           = perf_mmap_open,
5326         .close          = perf_mmap_close, /* non mergable */
5327         .fault          = perf_mmap_fault,
5328         .page_mkwrite   = perf_mmap_fault,
5329 };
5330
5331 static int perf_mmap(struct file *file, struct vm_area_struct *vma)
5332 {
5333         struct perf_event *event = file->private_data;
5334         unsigned long user_locked, user_lock_limit;
5335         struct user_struct *user = current_user();
5336         unsigned long locked, lock_limit;
5337         struct ring_buffer *rb = NULL;
5338         unsigned long vma_size;
5339         unsigned long nr_pages;
5340         long user_extra = 0, extra = 0;
5341         int ret = 0, flags = 0;
5342
5343         /*
5344          * Don't allow mmap() of inherited per-task counters. This would
5345          * create a performance issue due to all children writing to the
5346          * same rb.
5347          */
5348         if (event->cpu == -1 && event->attr.inherit)
5349                 return -EINVAL;
5350
5351         if (!(vma->vm_flags & VM_SHARED))
5352                 return -EINVAL;
5353
5354         vma_size = vma->vm_end - vma->vm_start;
5355
5356         if (vma->vm_pgoff == 0) {
5357                 nr_pages = (vma_size / PAGE_SIZE) - 1;
5358         } else {
5359                 /*
5360                  * AUX area mapping: if rb->aux_nr_pages != 0, it's already
5361                  * mapped; all subsequent mappings should have the same size
5362                  * and offset, and must be above the normal perf buffer.
5363                  */
5364                 u64 aux_offset, aux_size;
5365
5366                 if (!event->rb)
5367                         return -EINVAL;
5368
5369                 nr_pages = vma_size / PAGE_SIZE;
5370
5371                 mutex_lock(&event->mmap_mutex);
5372                 ret = -EINVAL;
5373
5374                 rb = event->rb;
5375                 if (!rb)
5376                         goto aux_unlock;
5377
5378                 aux_offset = ACCESS_ONCE(rb->user_page->aux_offset);
5379                 aux_size = ACCESS_ONCE(rb->user_page->aux_size);
5380
5381                 if (aux_offset < perf_data_size(rb) + PAGE_SIZE)
5382                         goto aux_unlock;
5383
5384                 if (aux_offset != vma->vm_pgoff << PAGE_SHIFT)
5385                         goto aux_unlock;
5386
5387                 /* already mapped with a different offset */
5388                 if (rb_has_aux(rb) && rb->aux_pgoff != vma->vm_pgoff)
5389                         goto aux_unlock;
5390
5391                 if (aux_size != vma_size || aux_size != nr_pages * PAGE_SIZE)
5392                         goto aux_unlock;
5393
5394                 /* already mapped with a different size */
5395                 if (rb_has_aux(rb) && rb->aux_nr_pages != nr_pages)
5396                         goto aux_unlock;
5397
5398                 if (!is_power_of_2(nr_pages))
5399                         goto aux_unlock;
5400
5401                 if (!atomic_inc_not_zero(&rb->mmap_count))
5402                         goto aux_unlock;
5403
5404                 if (rb_has_aux(rb)) {
5405                         atomic_inc(&rb->aux_mmap_count);
5406                         ret = 0;
5407                         goto unlock;
5408                 }
5409
5410                 atomic_set(&rb->aux_mmap_count, 1);
5411                 user_extra = nr_pages;
5412
5413                 goto accounting;
5414         }
5415
5416         /*
5417          * If we have rb pages, ensure they're a power-of-two number, so we
5418          * can use bitmasks instead of modulo.
5419          */
5420         if (nr_pages != 0 && !is_power_of_2(nr_pages))
5421                 return -EINVAL;
5422
5423         if (vma_size != PAGE_SIZE * (1 + nr_pages))
5424                 return -EINVAL;
5425
5426         WARN_ON_ONCE(event->ctx->parent_ctx);
5427 again:
5428         mutex_lock(&event->mmap_mutex);
5429         if (event->rb) {
5430                 if (event->rb->nr_pages != nr_pages) {
5431                         ret = -EINVAL;
5432                         goto unlock;
5433                 }
5434
5435                 if (!atomic_inc_not_zero(&event->rb->mmap_count)) {
5436                         /*
5437                          * Raced against perf_mmap_close(); remove the
5438                          * event and try again.
5439                          */
5440                         ring_buffer_attach(event, NULL);
5441                         mutex_unlock(&event->mmap_mutex);
5442                         goto again;
5443                 }
5444
5445                 goto unlock;
5446         }
5447
5448         user_extra = nr_pages + 1;
5449
5450 accounting:
5451         user_lock_limit = sysctl_perf_event_mlock >> (PAGE_SHIFT - 10);
5452
5453         /*
5454          * Increase the limit linearly with more CPUs:
5455          */
5456         user_lock_limit *= num_online_cpus();
5457
5458         user_locked = atomic_long_read(&user->locked_vm);
5459
5460         /*
5461          * sysctl_perf_event_mlock may have changed, so that
5462          *     user->locked_vm > user_lock_limit
5463          */
5464         if (user_locked > user_lock_limit)
5465                 user_locked = user_lock_limit;
5466         user_locked += user_extra;
5467
5468         if (user_locked > user_lock_limit)
5469                 extra = user_locked - user_lock_limit;
5470
5471         lock_limit = rlimit(RLIMIT_MEMLOCK);
5472         lock_limit >>= PAGE_SHIFT;
5473         locked = vma->vm_mm->pinned_vm + extra;
5474
5475         if ((locked > lock_limit) && perf_paranoid_tracepoint_raw() &&
5476                 !capable(CAP_IPC_LOCK)) {
5477                 ret = -EPERM;
5478                 goto unlock;
5479         }
5480
5481         WARN_ON(!rb && event->rb);
5482
5483         if (vma->vm_flags & VM_WRITE)
5484                 flags |= RING_BUFFER_WRITABLE;
5485
5486         if (!rb) {
5487                 rb = rb_alloc(nr_pages,
5488                               event->attr.watermark ? event->attr.wakeup_watermark : 0,
5489                               event->cpu, flags);
5490
5491                 if (!rb) {
5492                         ret = -ENOMEM;
5493                         goto unlock;
5494                 }
5495
5496                 atomic_set(&rb->mmap_count, 1);
5497                 rb->mmap_user = get_current_user();
5498                 rb->mmap_locked = extra;
5499
5500                 ring_buffer_attach(event, rb);
5501
5502                 perf_event_init_userpage(event);
5503                 perf_event_update_userpage(event);
5504         } else {
5505                 ret = rb_alloc_aux(rb, event, vma->vm_pgoff, nr_pages,
5506                                    event->attr.aux_watermark, flags);
5507                 if (!ret)
5508                         rb->aux_mmap_locked = extra;
5509         }
5510
5511 unlock:
5512         if (!ret) {
5513                 atomic_long_add(user_extra, &user->locked_vm);
5514                 vma->vm_mm->pinned_vm += extra;
5515
5516                 atomic_inc(&event->mmap_count);
5517         } else if (rb) {
5518                 atomic_dec(&rb->mmap_count);
5519         }
5520 aux_unlock:
5521         mutex_unlock(&event->mmap_mutex);
5522
5523         /*
5524          * Since pinned accounting is per-vm, we cannot allow fork() to copy
5525          * our vma.
5526          */
5527         vma->vm_flags |= VM_DONTCOPY | VM_DONTEXPAND | VM_DONTDUMP;
5528         vma->vm_ops = &perf_mmap_vmops;
5529
5530         if (event->pmu->event_mapped)
5531                 event->pmu->event_mapped(event, vma->vm_mm);
5532
5533         return ret;
5534 }
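
/*
 * Illustrative userspace sketch (not kernel code): the size and offset
 * rules enforced above translate into the following mapping layout.  The
 * first mapping must cover the control page plus a power-of-two number of
 * data pages, at file offset 0:
 *
 *	long psz = sysconf(_SC_PAGESIZE);
 *	int data_pages = 8;
 *	void *base;
 *
 *	base = mmap(NULL, (1 + data_pages) * psz,
 *		    PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
 *
 * Here data_pages may be any power of two.  An AUX area, if used, is
 * mapped afterwards: userspace writes aux_offset and aux_size into the
 * control page (the offset must lie above the data area) and then mmap()s
 * exactly that range, again with a power-of-two number of pages.
 */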
5535
5536 static int perf_fasync(int fd, struct file *filp, int on)
5537 {
5538         struct inode *inode = file_inode(filp);
5539         struct perf_event *event = filp->private_data;
5540         int retval;
5541
5542         inode_lock(inode);
5543         retval = fasync_helper(fd, filp, on, &event->fasync);
5544         inode_unlock(inode);
5545
5546         if (retval < 0)
5547                 return retval;
5548
5549         return 0;
5550 }
5551
5552 static const struct file_operations perf_fops = {
5553         .llseek                 = no_llseek,
5554         .release                = perf_release,
5555         .read                   = perf_read,
5556         .poll                   = perf_poll,
5557         .unlocked_ioctl         = perf_ioctl,
5558         .compat_ioctl           = perf_compat_ioctl,
5559         .mmap                   = perf_mmap,
5560         .fasync                 = perf_fasync,
5561 };
5562
5563 /*
5564  * Perf event wakeup
5565  *
5566  * If there's data, ensure we set the poll() state and publish everything
5567  * to user-space before waking everybody up.
5568  */
5569
5570 static inline struct fasync_struct **perf_event_fasync(struct perf_event *event)
5571 {
5572         /* only the parent has fasync state */
5573         if (event->parent)
5574                 event = event->parent;
5575         return &event->fasync;
5576 }
5577
5578 void perf_event_wakeup(struct perf_event *event)
5579 {
5580         ring_buffer_wakeup(event);
5581
5582         if (event->pending_kill) {
5583                 kill_fasync(perf_event_fasync(event), SIGIO, event->pending_kill);
5584                 event->pending_kill = 0;
5585         }
5586 }
5587
5588 static void perf_pending_event(struct irq_work *entry)
5589 {
5590         struct perf_event *event = container_of(entry,
5591                         struct perf_event, pending);
5592         int rctx;
5593
5594         rctx = perf_swevent_get_recursion_context();
5595         /*
5596          * If we 'fail' here, that's OK; it means recursion is already disabled
5597          * and we won't recurse 'further'.
5598          */
5599
5600         if (event->pending_disable) {
5601                 event->pending_disable = 0;
5602                 perf_event_disable_local(event);
5603         }
5604
5605         if (event->pending_wakeup) {
5606                 event->pending_wakeup = 0;
5607                 perf_event_wakeup(event);
5608         }
5609
5610         if (rctx >= 0)
5611                 perf_swevent_put_recursion_context(rctx);
5612 }
5613
5614 /*
5615  * We assume there is only KVM supporting the callbacks.
5616  * Later on, we might change it to a list if there is
5617  * another virtualization implementation supporting the callbacks.
5618  */
5619 struct perf_guest_info_callbacks *perf_guest_cbs;
5620
5621 int perf_register_guest_info_callbacks(struct perf_guest_info_callbacks *cbs)
5622 {
5623         perf_guest_cbs = cbs;
5624         return 0;
5625 }
5626 EXPORT_SYMBOL_GPL(perf_register_guest_info_callbacks);
5627
5628 int perf_unregister_guest_info_callbacks(struct perf_guest_info_callbacks *cbs)
5629 {
5630         perf_guest_cbs = NULL;
5631         return 0;
5632 }
5633 EXPORT_SYMBOL_GPL(perf_unregister_guest_info_callbacks);
5634
5635 static void
5636 perf_output_sample_regs(struct perf_output_handle *handle,
5637                         struct pt_regs *regs, u64 mask)
5638 {
5639         int bit;
5640         DECLARE_BITMAP(_mask, 64);
5641
5642         bitmap_from_u64(_mask, mask);
5643         for_each_set_bit(bit, _mask, sizeof(mask) * BITS_PER_BYTE) {
5644                 u64 val;
5645
5646                 val = perf_reg_value(regs, bit);
5647                 perf_output_put(handle, val);
5648         }
5649 }
5650
5651 static void perf_sample_regs_user(struct perf_regs *regs_user,
5652                                   struct pt_regs *regs,
5653                                   struct pt_regs *regs_user_copy)
5654 {
5655         if (user_mode(regs)) {
5656                 regs_user->abi = perf_reg_abi(current);
5657                 regs_user->regs = regs;
5658         } else if (!(current->flags & PF_KTHREAD)) {
5659                 perf_get_regs_user(regs_user, regs, regs_user_copy);
5660         } else {
5661                 regs_user->abi = PERF_SAMPLE_REGS_ABI_NONE;
5662                 regs_user->regs = NULL;
5663         }
5664 }
5665
5666 static void perf_sample_regs_intr(struct perf_regs *regs_intr,
5667                                   struct pt_regs *regs)
5668 {
5669         regs_intr->regs = regs;
5670         regs_intr->abi  = perf_reg_abi(current);
5671 }
5672
5673
5674 /*
5675  * Get remaining task size from user stack pointer.
5676  *
5677  * It'd be better to take the stack VMA into account and limit this more
5678  * precisely, but there's no way to do that safely under interrupt,
5679  * so use TASK_SIZE as the limit.
5680  */
5681 static u64 perf_ustack_task_size(struct pt_regs *regs)
5682 {
5683         unsigned long addr = perf_user_stack_pointer(regs);
5684
5685         if (!addr || addr >= TASK_SIZE)
5686                 return 0;
5687
5688         return TASK_SIZE - addr;
5689 }
5690
5691 static u16
5692 perf_sample_ustack_size(u16 stack_size, u16 header_size,
5693                         struct pt_regs *regs)
5694 {
5695         u64 task_size;
5696
5697         /* No regs, no stack pointer, no dump. */
5698         if (!regs)
5699                 return 0;
5700
5701         /*
5702          * Check whether the requested stack size fits into:
5703          * - TASK_SIZE
5704          *   If it doesn't, limit the size to TASK_SIZE.
5705          *
5706          * - the remaining sample size
5707          *   If it doesn't, shrink the stack size to fit
5708          *   into the remaining sample size.
5709          */
5710
5711         task_size  = min((u64) USHRT_MAX, perf_ustack_task_size(regs));
5712         stack_size = min(stack_size, (u16) task_size);
5713
5714         /* Current header size plus static size and dynamic size. */
5715         header_size += 2 * sizeof(u64);
5716
5717         /* Do we fit in with the current stack dump size? */
5718         if ((u16) (header_size + stack_size) < header_size) {
5719                 /*
5720                  * If we overflow the maximum size for the sample,
5721                  * we customize the stack dump size to fit in.
5722                  */
5723                 stack_size = USHRT_MAX - header_size - sizeof(u64);
5724                 stack_size = round_up(stack_size, sizeof(u64));
5725         }
5726
5727         return stack_size;
5728 }
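
/*
 * A worked example of the clamping above: with a running header size of 80
 * bytes (after adding the two bookkeeping words) and a requested dump of
 * 65504 bytes, the u16 sum wraps (80 + 65504 = 65584), so the dump is
 * clamped to USHRT_MAX - 80 - 8 = 65447 bytes and rounded up to 65448,
 * leaving room for the trailing dynamic-size field.
 */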
5729
5730 static void
5731 perf_output_sample_ustack(struct perf_output_handle *handle, u64 dump_size,
5732                           struct pt_regs *regs)
5733 {
5734         /* Case of a kernel thread, nothing to dump */
5735         if (!regs) {
5736                 u64 size = 0;
5737                 perf_output_put(handle, size);
5738         } else {
5739                 unsigned long sp;
5740                 unsigned int rem;
5741                 u64 dyn_size;
5742                 mm_segment_t fs;
5743
5744                 /*
5745                  * We dump:
5746                  * static size
5747                  *   - the size requested by user or the best one we can fit
5748                  *     in to the sample max size
5749                  * data
5750                  *   - user stack dump data
5751                  * dynamic size
5752                  *   - the actual dumped size
5753                  */
5754
5755                 /* Static size. */
5756                 perf_output_put(handle, dump_size);
5757
5758                 /* Data. */
5759                 sp = perf_user_stack_pointer(regs);
5760                 fs = get_fs();
5761                 set_fs(USER_DS);
5762                 rem = __output_copy_user(handle, (void *) sp, dump_size);
5763                 set_fs(fs);
5764                 dyn_size = dump_size - rem;
5765
5766                 perf_output_skip(handle, rem);
5767
5768                 /* Dynamic size. */
5769                 perf_output_put(handle, dyn_size);
5770         }
5771 }
5772
5773 static void __perf_event_header__init_id(struct perf_event_header *header,
5774                                          struct perf_sample_data *data,
5775                                          struct perf_event *event,
5776                                          u64 sample_type)
5777 {
5778         data->type = event->attr.sample_type;
5779         header->size += event->id_header_size;
5780
5781         if (sample_type & PERF_SAMPLE_TID) {
5782                 /* namespace issues */
5783                 data->tid_entry.pid = perf_event_pid(event, current);
5784                 data->tid_entry.tid = perf_event_tid(event, current);
5785         }
5786
5787         if (sample_type & PERF_SAMPLE_TIME)
5788                 data->time = perf_event_clock(event);
5789
5790         if (sample_type & (PERF_SAMPLE_ID | PERF_SAMPLE_IDENTIFIER))
5791                 data->id = primary_event_id(event);
5792
5793         if (sample_type & PERF_SAMPLE_STREAM_ID)
5794                 data->stream_id = event->id;
5795
5796         if (sample_type & PERF_SAMPLE_CPU) {
5797                 data->cpu_entry.cpu      = raw_smp_processor_id();
5798                 data->cpu_entry.reserved = 0;
5799         }
5800 }
5801
5802 void perf_event_header__init_id(struct perf_event_header *header,
5803                                 struct perf_sample_data *data,
5804                                 struct perf_event *event)
5805 {
5806         if (event->attr.sample_id_all)
5807                 __perf_event_header__init_id(header, data, event, event->attr.sample_type);
5808 }
5809
5810 static void __perf_event__output_id_sample(struct perf_output_handle *handle,
5811                                            struct perf_sample_data *data)
5812 {
5813         u64 sample_type = data->type;
5814
5815         if (sample_type & PERF_SAMPLE_TID)
5816                 perf_output_put(handle, data->tid_entry);
5817
5818         if (sample_type & PERF_SAMPLE_TIME)
5819                 perf_output_put(handle, data->time);
5820
5821         if (sample_type & PERF_SAMPLE_ID)
5822                 perf_output_put(handle, data->id);
5823
5824         if (sample_type & PERF_SAMPLE_STREAM_ID)
5825                 perf_output_put(handle, data->stream_id);
5826
5827         if (sample_type & PERF_SAMPLE_CPU)
5828                 perf_output_put(handle, data->cpu_entry);
5829
5830         if (sample_type & PERF_SAMPLE_IDENTIFIER)
5831                 perf_output_put(handle, data->id);
5832 }
5833
5834 void perf_event__output_id_sample(struct perf_event *event,
5835                                   struct perf_output_handle *handle,
5836                                   struct perf_sample_data *sample)
5837 {
5838         if (event->attr.sample_id_all)
5839                 __perf_event__output_id_sample(handle, sample);
5840 }
5841
5842 static void perf_output_read_one(struct perf_output_handle *handle,
5843                                  struct perf_event *event,
5844                                  u64 enabled, u64 running)
5845 {
5846         u64 read_format = event->attr.read_format;
5847         u64 values[4];
5848         int n = 0;
5849
5850         values[n++] = perf_event_count(event);
5851         if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) {
5852                 values[n++] = enabled +
5853                         atomic64_read(&event->child_total_time_enabled);
5854         }
5855         if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) {
5856                 values[n++] = running +
5857                         atomic64_read(&event->child_total_time_running);
5858         }
5859         if (read_format & PERF_FORMAT_ID)
5860                 values[n++] = primary_event_id(event);
5861
5862         __output_copy(handle, values, n * sizeof(u64));
5863 }
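
/*
 * For reference, the layout written above matches what userspace gets from
 * read(2) on a non-group event (sketch; each optional field is present
 * only when the corresponding PERF_FORMAT_* bit is set):
 *
 *	struct read_format {
 *		__u64 value;
 *		__u64 time_enabled;	(PERF_FORMAT_TOTAL_TIME_ENABLED)
 *		__u64 time_running;	(PERF_FORMAT_TOTAL_TIME_RUNNING)
 *		__u64 id;		(PERF_FORMAT_ID)
 *	};
 */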
5864
5865 static void perf_output_read_group(struct perf_output_handle *handle,
5866                             struct perf_event *event,
5867                             u64 enabled, u64 running)
5868 {
5869         struct perf_event *leader = event->group_leader, *sub;
5870         u64 read_format = event->attr.read_format;
5871         u64 values[5];
5872         int n = 0;
5873
5874         values[n++] = 1 + leader->nr_siblings;
5875
5876         if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED)
5877                 values[n++] = enabled;
5878
5879         if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING)
5880                 values[n++] = running;
5881
5882         if ((leader != event) &&
5883             (leader->state == PERF_EVENT_STATE_ACTIVE))
5884                 leader->pmu->read(leader);
5885
5886         values[n++] = perf_event_count(leader);
5887         if (read_format & PERF_FORMAT_ID)
5888                 values[n++] = primary_event_id(leader);
5889
5890         __output_copy(handle, values, n * sizeof(u64));
5891
5892         list_for_each_entry(sub, &leader->sibling_list, group_entry) {
5893                 n = 0;
5894
5895                 if ((sub != event) &&
5896                     (sub->state == PERF_EVENT_STATE_ACTIVE))
5897                         sub->pmu->read(sub);
5898
5899                 values[n++] = perf_event_count(sub);
5900                 if (read_format & PERF_FORMAT_ID)
5901                         values[n++] = primary_event_id(sub);
5902
5903                 __output_copy(handle, values, n * sizeof(u64));
5904         }
5905 }
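
/*
 * Likewise, the group layout above is what a read(2) with PERF_FORMAT_GROUP
 * returns (sketch; the time fields depend on the PERF_FORMAT_TOTAL_TIME_*
 * bits and "id" on PERF_FORMAT_ID):
 *
 *	struct read_format {
 *		__u64 nr;
 *		__u64 time_enabled;
 *		__u64 time_running;
 *		struct {
 *			__u64 value;
 *			__u64 id;
 *		} values[nr];
 *	};
 */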
5906
5907 #define PERF_FORMAT_TOTAL_TIMES (PERF_FORMAT_TOTAL_TIME_ENABLED|\
5908                                  PERF_FORMAT_TOTAL_TIME_RUNNING)
5909
5910 /*
5911  * XXX PERF_SAMPLE_READ vs inherited events seems difficult.
5912  *
5913  * The problem is that it's both hard and excessively expensive to iterate the
5914  * child list, not to mention that it's impossible to IPI the children running
5915  * on another CPU, from interrupt/NMI context.
5916  */
5917 static void perf_output_read(struct perf_output_handle *handle,
5918                              struct perf_event *event)
5919 {
5920         u64 enabled = 0, running = 0, now;
5921         u64 read_format = event->attr.read_format;
5922
5923         /*
5924          * compute total_time_enabled, total_time_running
5925          * based on snapshot values taken when the event
5926          * was last scheduled in.
5927          *
5928          * we cannot simply call update_context_time()
5929          * because of locking issues, as we are called in
5930          * NMI context
5931          */
5932         if (read_format & PERF_FORMAT_TOTAL_TIMES)
5933                 calc_timer_values(event, &now, &enabled, &running);
5934
5935         if (event->attr.read_format & PERF_FORMAT_GROUP)
5936                 perf_output_read_group(handle, event, enabled, running);
5937         else
5938                 perf_output_read_one(handle, event, enabled, running);
5939 }
5940
5941 void perf_output_sample(struct perf_output_handle *handle,
5942                         struct perf_event_header *header,
5943                         struct perf_sample_data *data,
5944                         struct perf_event *event)
5945 {
5946         u64 sample_type = data->type;
5947
5948         perf_output_put(handle, *header);
5949
5950         if (sample_type & PERF_SAMPLE_IDENTIFIER)
5951                 perf_output_put(handle, data->id);
5952
5953         if (sample_type & PERF_SAMPLE_IP)
5954                 perf_output_put(handle, data->ip);
5955
5956         if (sample_type & PERF_SAMPLE_TID)
5957                 perf_output_put(handle, data->tid_entry);
5958
5959         if (sample_type & PERF_SAMPLE_TIME)
5960                 perf_output_put(handle, data->time);
5961
5962         if (sample_type & PERF_SAMPLE_ADDR)
5963                 perf_output_put(handle, data->addr);
5964
5965         if (sample_type & PERF_SAMPLE_ID)
5966                 perf_output_put(handle, data->id);
5967
5968         if (sample_type & PERF_SAMPLE_STREAM_ID)
5969                 perf_output_put(handle, data->stream_id);
5970
5971         if (sample_type & PERF_SAMPLE_CPU)
5972                 perf_output_put(handle, data->cpu_entry);
5973
5974         if (sample_type & PERF_SAMPLE_PERIOD)
5975                 perf_output_put(handle, data->period);
5976
5977         if (sample_type & PERF_SAMPLE_READ)
5978                 perf_output_read(handle, event);
5979
5980         if (sample_type & PERF_SAMPLE_CALLCHAIN) {
5981                 if (data->callchain) {
5982                         int size = 1;
5983
5984                         if (data->callchain)
5985                                 size += data->callchain->nr;
5986
5987                         size *= sizeof(u64);
5988
5989                         __output_copy(handle, data->callchain, size);
5990                 } else {
5991                         u64 nr = 0;
5992                         perf_output_put(handle, nr);
5993                 }
5994         }
5995
5996         if (sample_type & PERF_SAMPLE_RAW) {
5997                 struct perf_raw_record *raw = data->raw;
5998
5999                 if (raw) {
6000                         struct perf_raw_frag *frag = &raw->frag;
6001
6002                         perf_output_put(handle, raw->size);
6003                         do {
6004                                 if (frag->copy) {
6005                                         __output_custom(handle, frag->copy,
6006                                                         frag->data, frag->size);
6007                                 } else {
6008                                         __output_copy(handle, frag->data,
6009                                                       frag->size);
6010                                 }
6011                                 if (perf_raw_frag_last(frag))
6012                                         break;
6013                                 frag = frag->next;
6014                         } while (1);
6015                         if (frag->pad)
6016                                 __output_skip(handle, NULL, frag->pad);
6017                 } else {
6018                         struct {
6019                                 u32     size;
6020                                 u32     data;
6021                         } raw = {
6022                                 .size = sizeof(u32),
6023                                 .data = 0,
6024                         };
6025                         perf_output_put(handle, raw);
6026                 }
6027         }
6028
6029         if (sample_type & PERF_SAMPLE_BRANCH_STACK) {
6030                 if (data->br_stack) {
6031                         size_t size;
6032
6033                         size = data->br_stack->nr
6034                              * sizeof(struct perf_branch_entry);
6035
6036                         perf_output_put(handle, data->br_stack->nr);
6037                         perf_output_copy(handle, data->br_stack->entries, size);
6038                 } else {
6039                         /*
6040                          * we always store at least the value of nr
6041                          */
6042                         u64 nr = 0;
6043                         perf_output_put(handle, nr);
6044                 }
6045         }
6046
6047         if (sample_type & PERF_SAMPLE_REGS_USER) {
6048                 u64 abi = data->regs_user.abi;
6049
6050                 /*
6051                  * If there are no regs to dump, notice it through
6052                  * the first u64 being zero (PERF_SAMPLE_REGS_ABI_NONE).
6053                  */
6054                 perf_output_put(handle, abi);
6055
6056                 if (abi) {
6057                         u64 mask = event->attr.sample_regs_user;
6058                         perf_output_sample_regs(handle,
6059                                                 data->regs_user.regs,
6060                                                 mask);
6061                 }
6062         }
6063
6064         if (sample_type & PERF_SAMPLE_STACK_USER) {
6065                 perf_output_sample_ustack(handle,
6066                                           data->stack_user_size,
6067                                           data->regs_user.regs);
6068         }
6069
6070         if (sample_type & PERF_SAMPLE_WEIGHT)
6071                 perf_output_put(handle, data->weight);
6072
6073         if (sample_type & PERF_SAMPLE_DATA_SRC)
6074                 perf_output_put(handle, data->data_src.val);
6075
6076         if (sample_type & PERF_SAMPLE_TRANSACTION)
6077                 perf_output_put(handle, data->txn);
6078
6079         if (sample_type & PERF_SAMPLE_REGS_INTR) {
6080                 u64 abi = data->regs_intr.abi;
6081                 /*
6082                  * If there are no regs to dump, notice it through
6083                  * the first u64 being zero (PERF_SAMPLE_REGS_ABI_NONE).
6084                  */
6085                 perf_output_put(handle, abi);
6086
6087                 if (abi) {
6088                         u64 mask = event->attr.sample_regs_intr;
6089
6090                         perf_output_sample_regs(handle,
6091                                                 data->regs_intr.regs,
6092                                                 mask);
6093                 }
6094         }
6095
6096         if (sample_type & PERF_SAMPLE_PHYS_ADDR)
6097                 perf_output_put(handle, data->phys_addr);
6098
6099         if (!event->attr.watermark) {
6100                 int wakeup_events = event->attr.wakeup_events;
6101
6102                 if (wakeup_events) {
6103                         struct ring_buffer *rb = handle->rb;
6104                         int events = local_inc_return(&rb->events);
6105
6106                         if (events >= wakeup_events) {
6107                                 local_sub(wakeup_events, &rb->events);
6108                                 local_inc(&rb->wakeup);
6109                         }
6110                 }
6111         }
6112 }
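
/*
 * Note on the wakeup accounting just above: with attr.watermark == 0 and
 * attr.wakeup_events == N, a consumer blocked in poll() on the event fd is
 * woken roughly once every N sample records instead of on every record.
 */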
6113
6114 static u64 perf_virt_to_phys(u64 virt)
6115 {
6116         u64 phys_addr = 0;
6117
6118         if (!virt)
6119                 return 0;
6120
6121         if (virt >= TASK_SIZE) {
6122                 /* If it's vmalloc()d memory, leave phys_addr as 0 */
6123                 if (virt_addr_valid((void *)(uintptr_t)virt) &&
6124                     !(virt >= VMALLOC_START && virt < VMALLOC_END))
6125                         phys_addr = (u64)virt_to_phys((void *)(uintptr_t)virt);
6126         } else {
6127                 /*
6128                  * Walk the page tables for a user address.
6129                  * Interrupts are disabled, which prevents any tear-down
6130                  * of the page tables.
6131                  * Try the IRQ-safe __get_user_pages_fast() first;
6132                  * if that fails, leave phys_addr as 0.
6133                  */
6134                 if (current->mm != NULL) {
6135                         struct page *p;
6136
6137                         pagefault_disable();
6138                         if (__get_user_pages_fast(virt, 1, 0, &p) == 1) {
6139                                 phys_addr = page_to_phys(p) + virt % PAGE_SIZE;
6140                                 put_page(p);
6141                         }
6142                         pagefault_enable();
6143                 }
6144         }
6145
6146         return phys_addr;
6147 }
6148
6149 void perf_prepare_sample(struct perf_event_header *header,
6150                          struct perf_sample_data *data,
6151                          struct perf_event *event,
6152                          struct pt_regs *regs)
6153 {
6154         u64 sample_type = event->attr.sample_type;
6155         u64 filtered_sample_type;
6156
6157         header->type = PERF_RECORD_SAMPLE;
6158         header->size = sizeof(*header) + event->header_size;
6159
6160         header->misc = 0;
6161         header->misc |= perf_misc_flags(regs);
6162
6163         /*
6164          * Clear the sample flags that have already been filled in by
6165          * the PMU driver.
6166          */
6167         filtered_sample_type = sample_type & ~data->sample_flags;
6168         __perf_event_header__init_id(header, data, event, filtered_sample_type);
6169
6170         if (sample_type & PERF_SAMPLE_IP)
6171                 data->ip = perf_instruction_pointer(regs);
6172
6173         if (sample_type & PERF_SAMPLE_CALLCHAIN) {
6174                 int size = 1;
6175
6176                 data->callchain = perf_callchain(event, regs);
6177
6178                 if (data->callchain)
6179                         size += data->callchain->nr;
6180
6181                 header->size += size * sizeof(u64);
6182         }
6183
6184         if (sample_type & PERF_SAMPLE_RAW) {
6185                 struct perf_raw_record *raw = data->raw;
6186                 int size;
6187
6188                 if (raw) {
6189                         struct perf_raw_frag *frag = &raw->frag;
6190                         u32 sum = 0;
6191
6192                         do {
6193                                 sum += frag->size;
6194                                 if (perf_raw_frag_last(frag))
6195                                         break;
6196                                 frag = frag->next;
6197                         } while (1);
6198
6199                         size = round_up(sum + sizeof(u32), sizeof(u64));
6200                         raw->size = size - sizeof(u32);
6201                         frag->pad = raw->size - sum;
6202                 } else {
6203                         size = sizeof(u64);
6204                 }
6205
6206                 header->size += size;
6207         }
6208
6209         if (sample_type & PERF_SAMPLE_BRANCH_STACK) {
6210                 int size = sizeof(u64); /* nr */
6211                 if (data->br_stack) {
6212                         size += data->br_stack->nr
6213                               * sizeof(struct perf_branch_entry);
6214                 }
6215                 header->size += size;
6216         }
6217
6218         if (sample_type & (PERF_SAMPLE_REGS_USER | PERF_SAMPLE_STACK_USER))
6219                 perf_sample_regs_user(&data->regs_user, regs,
6220                                       &data->regs_user_copy);
6221
6222         if (sample_type & PERF_SAMPLE_REGS_USER) {
6223                 /* regs dump ABI info */
6224                 int size = sizeof(u64);
6225
6226                 if (data->regs_user.regs) {
6227                         u64 mask = event->attr.sample_regs_user;
6228                         size += hweight64(mask) * sizeof(u64);
6229                 }
6230
6231                 header->size += size;
6232         }
6233
6234         if (sample_type & PERF_SAMPLE_STACK_USER) {
6235                 /*
6236                  * Either the PERF_SAMPLE_STACK_USER bit needs to always be
6237                  * processed as the last one, or an additional check must be
6238                  * added whenever a new sample type is introduced, because we
6239                  * could eat up the rest of the sample size.
6240                  */
6241                 u16 stack_size = event->attr.sample_stack_user;
6242                 u16 size = sizeof(u64);
6243
6244                 stack_size = perf_sample_ustack_size(stack_size, header->size,
6245                                                      data->regs_user.regs);
6246
6247                 /*
6248                  * If there is something to dump, add space for the dump
6249                  * itself and for the field that tells the dynamic size,
6250          * which is how many bytes were actually dumped.
6251                  */
6252                 if (stack_size)
6253                         size += sizeof(u64) + stack_size;
6254
6255                 data->stack_user_size = stack_size;
6256                 header->size += size;
6257         }
6258
6259         if (sample_type & PERF_SAMPLE_REGS_INTR) {
6260                 /* regs dump ABI info */
6261                 int size = sizeof(u64);
6262
6263                 perf_sample_regs_intr(&data->regs_intr, regs);
6264
6265                 if (data->regs_intr.regs) {
6266                         u64 mask = event->attr.sample_regs_intr;
6267
6268                         size += hweight64(mask) * sizeof(u64);
6269                 }
6270
6271                 header->size += size;
6272         }
6273
6274         if (sample_type & PERF_SAMPLE_PHYS_ADDR)
6275                 data->phys_addr = perf_virt_to_phys(data->addr);
6276 }
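
/*
 * Worked example for the size accounting above (values are made up): with
 * PERF_SAMPLE_CALLCHAIN set and a captured callchain of nr == 3 entries,
 * header->size grows by (1 + 3) * sizeof(u64) == 32 bytes -- one u64 for
 * the entry count plus one per stored ip.  Each branch follows the same
 * pattern: account the space here so that perf_output_begin() can reserve
 * the whole record before perf_output_sample() fills it in.
 */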
6277
6278 static __always_inline void
6279 __perf_event_output(struct perf_event *event,
6280                     struct perf_sample_data *data,
6281                     struct pt_regs *regs,
6282                     int (*output_begin)(struct perf_output_handle *,
6283                                         struct perf_event *,
6284                                         unsigned int))
6285 {
6286         struct perf_output_handle handle;
6287         struct perf_event_header header;
6288
6289         /* protect the callchain buffers */
6290         rcu_read_lock();
6291
6292         perf_prepare_sample(&header, data, event, regs);
6293
6294         if (output_begin(&handle, event, header.size))
6295                 goto exit;
6296
6297         perf_output_sample(&handle, &header, data, event);
6298
6299         perf_output_end(&handle);
6300
6301 exit:
6302         rcu_read_unlock();
6303 }
6304
6305 void
6306 perf_event_output_forward(struct perf_event *event,
6307                          struct perf_sample_data *data,
6308                          struct pt_regs *regs)
6309 {
6310         __perf_event_output(event, data, regs, perf_output_begin_forward);
6311 }
6312
6313 void
6314 perf_event_output_backward(struct perf_event *event,
6315                            struct perf_sample_data *data,
6316                            struct pt_regs *regs)
6317 {
6318         __perf_event_output(event, data, regs, perf_output_begin_backward);
6319 }
6320
6321 void
6322 perf_event_output(struct perf_event *event,
6323                   struct perf_sample_data *data,
6324                   struct pt_regs *regs)
6325 {
6326         __perf_event_output(event, data, regs, perf_output_begin);
6327 }
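
/*
 * perf_event_output() is also the natural call to make from a custom
 * overflow handler installed through perf_event_create_kernel_counter().
 * A minimal sketch (my_overflow_handler is a made-up name, not used
 * anywhere in this file):
 *
 *	static void my_overflow_handler(struct perf_event *event,
 *					struct perf_sample_data *data,
 *					struct pt_regs *regs)
 *	{
 *		perf_event_output(event, data, regs);
 *	}
 */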
6328
6329 /*
6330  * read event_id
6331  */
6332
6333 struct perf_read_event {
6334         struct perf_event_header        header;
6335
6336         u32                             pid;
6337         u32                             tid;
6338 };
6339
6340 static void
6341 perf_event_read_event(struct perf_event *event,
6342                         struct task_struct *task)
6343 {
6344         struct perf_output_handle handle;
6345         struct perf_sample_data sample;
6346         struct perf_read_event read_event = {
6347                 .header = {
6348                         .type = PERF_RECORD_READ,
6349                         .misc = 0,
6350                         .size = sizeof(read_event) + event->read_size,
6351                 },
6352                 .pid = perf_event_pid(event, task),
6353                 .tid = perf_event_tid(event, task),
6354         };
6355         int ret;
6356
6357         perf_event_header__init_id(&read_event.header, &sample, event);
6358         ret = perf_output_begin(&handle, event, read_event.header.size);
6359         if (ret)
6360                 return;
6361
6362         perf_output_put(&handle, read_event);
6363         perf_output_read(&handle, event);
6364         perf_event__output_id_sample(event, &handle, &sample);
6365
6366         perf_output_end(&handle);
6367 }
6368
6369 typedef void (perf_iterate_f)(struct perf_event *event, void *data);
6370
6371 static void
6372 perf_iterate_ctx(struct perf_event_context *ctx,
6373                    perf_iterate_f output,
6374                    void *data, bool all)
6375 {
6376         struct perf_event *event;
6377
6378         list_for_each_entry_rcu(event, &ctx->event_list, event_entry) {
6379                 if (!all) {
6380                         if (event->state < PERF_EVENT_STATE_INACTIVE)
6381                                 continue;
6382                         if (!event_filter_match(event))
6383                                 continue;
6384                 }
6385
6386                 output(event, data);
6387         }
6388 }
6389
6390 static void perf_iterate_sb_cpu(perf_iterate_f output, void *data)
6391 {
6392         struct pmu_event_list *pel = this_cpu_ptr(&pmu_sb_events);
6393         struct perf_event *event;
6394
6395         list_for_each_entry_rcu(event, &pel->list, sb_list) {
6396                 /*
6397                  * Skip events that are not fully formed yet; ensure that
6398                  * if we observe event->ctx, both event and ctx will be
6399                  * complete enough. See perf_install_in_context().
6400                  */
6401                 if (!smp_load_acquire(&event->ctx))
6402                         continue;
6403
6404                 if (event->state < PERF_EVENT_STATE_INACTIVE)
6405                         continue;
6406                 if (!event_filter_match(event))
6407                         continue;
6408                 output(event, data);
6409         }
6410 }
6411
6412 /*
6413  * Iterate all events that need to receive side-band events.
6414  *
6415  * For new callers: ensure that account_pmu_sb_event() includes
6416  * your event, otherwise it might not get delivered.
6417  */
6418 static void
6419 perf_iterate_sb(perf_iterate_f output, void *data,
6420                struct perf_event_context *task_ctx)
6421 {
6422         struct perf_event_context *ctx;
6423         int ctxn;
6424
6425         rcu_read_lock();
6426         preempt_disable();
6427
6428         /*
6429          * If we have task_ctx != NULL we only notify the task context itself.
6430          * The task_ctx is set only for EXIT events, before the task
6431          * context is released.
6432          */
6433         if (task_ctx) {
6434                 perf_iterate_ctx(task_ctx, output, data, false);
6435                 goto done;
6436         }
6437
6438         perf_iterate_sb_cpu(output, data);
6439
6440         for_each_task_context_nr(ctxn) {
6441                 ctx = rcu_dereference(current->perf_event_ctxp[ctxn]);
6442                 if (ctx)
6443                         perf_iterate_ctx(ctx, output, data, false);
6444         }
6445 done:
6446         preempt_enable();
6447         rcu_read_unlock();
6448 }
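
/*
 * The usual pattern for a side-band record, as the emitters below follow
 * it: fill in a local struct with a perf_event_header plus the record
 * payload, set header.size to the static part, and hand the output
 * callback together with that struct to perf_iterate_sb(); the callback
 * then runs perf_event_header__init_id()/perf_output_begin()/
 * perf_output_put() for every matching event.  See perf_event_comm_event()
 * below for a compact instance of this pattern.
 */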
6449
6450 /*
6451  * Clear all file-based filters at exec; they'll have to be
6452  * reinstated when/if these objects are mmapped again.
6453  */
6454 static void perf_event_addr_filters_exec(struct perf_event *event, void *data)
6455 {
6456         struct perf_addr_filters_head *ifh = perf_event_addr_filters(event);
6457         struct perf_addr_filter *filter;
6458         unsigned int restart = 0, count = 0;
6459         unsigned long flags;
6460
6461         if (!has_addr_filter(event))
6462                 return;
6463
6464         raw_spin_lock_irqsave(&ifh->lock, flags);
6465         list_for_each_entry(filter, &ifh->list, entry) {
6466                 if (filter->path.dentry) {
6467                         event->addr_filters_offs[count] = 0;
6468                         restart++;
6469                 }
6470
6471                 count++;
6472         }
6473
6474         if (restart)
6475                 event->addr_filters_gen++;
6476         raw_spin_unlock_irqrestore(&ifh->lock, flags);
6477
6478         if (restart)
6479                 perf_event_stop(event, 1);
6480 }
6481
6482 void perf_event_exec(void)
6483 {
6484         struct perf_event_context *ctx;
6485         int ctxn;
6486
6487         rcu_read_lock();
6488         for_each_task_context_nr(ctxn) {
6489                 ctx = current->perf_event_ctxp[ctxn];
6490                 if (!ctx)
6491                         continue;
6492
6493                 perf_event_enable_on_exec(ctxn);
6494
6495                 perf_iterate_ctx(ctx, perf_event_addr_filters_exec, NULL,
6496                                    true);
6497         }
6498         rcu_read_unlock();
6499 }
6500
6501 struct remote_output {
6502         struct ring_buffer      *rb;
6503         int                     err;
6504 };
6505
6506 static void __perf_event_output_stop(struct perf_event *event, void *data)
6507 {
6508         struct perf_event *parent = event->parent;
6509         struct remote_output *ro = data;
6510         struct ring_buffer *rb = ro->rb;
6511         struct stop_event_data sd = {
6512                 .event  = event,
6513         };
6514
6515         if (!has_aux(event))
6516                 return;
6517
6518         if (!parent)
6519                 parent = event;
6520
6521         /*
6522          * In case of inheritance, it will be the parent that links to the
6523          * ring-buffer, but it will be the child that's actually using it.
6524          *
6525          * We are using event::rb to determine if the event should be stopped,
6526          * however this may race with ring_buffer_attach() (through set_output),
6527          * which will make us skip the event that actually needs to be stopped.
6528          * So ring_buffer_attach() has to stop an aux event before re-assigning
6529          * its rb pointer.
6530          */
6531         if (rcu_dereference(parent->rb) == rb)
6532                 ro->err = __perf_event_stop(&sd);
6533 }
6534
6535 static int __perf_pmu_output_stop(void *info)
6536 {
6537         struct perf_event *event = info;
6538         struct pmu *pmu = event->pmu;
6539         struct perf_cpu_context *cpuctx = this_cpu_ptr(pmu->pmu_cpu_context);
6540         struct remote_output ro = {
6541                 .rb     = event->rb,
6542         };
6543
6544         rcu_read_lock();
6545         perf_iterate_ctx(&cpuctx->ctx, __perf_event_output_stop, &ro, false);
6546         if (cpuctx->task_ctx)
6547                 perf_iterate_ctx(cpuctx->task_ctx, __perf_event_output_stop,
6548                                    &ro, false);
6549         rcu_read_unlock();
6550
6551         return ro.err;
6552 }
6553
6554 static void perf_pmu_output_stop(struct perf_event *event)
6555 {
6556         struct perf_event *iter;
6557         int err, cpu;
6558
6559 restart:
6560         rcu_read_lock();
6561         list_for_each_entry_rcu(iter, &event->rb->event_list, rb_entry) {
6562                 /*
6563                  * For per-CPU events, we need to make sure that neither they
6564                  * nor their children are running; for cpu==-1 events it's
6565                  * sufficient to stop the event itself if it's active, since
6566                  * it can't have children.
6567                  */
6568                 cpu = iter->cpu;
6569                 if (cpu == -1)
6570                         cpu = READ_ONCE(iter->oncpu);
6571
6572                 if (cpu == -1)
6573                         continue;
6574
6575                 err = cpu_function_call(cpu, __perf_pmu_output_stop, event);
6576                 if (err == -EAGAIN) {
6577                         rcu_read_unlock();
6578                         goto restart;
6579                 }
6580         }
6581         rcu_read_unlock();
6582 }
6583
6584 /*
6585  * task tracking -- fork/exit
6586  *
6587  * enabled by: attr.comm | attr.mmap | attr.mmap2 | attr.mmap_data | attr.task
6588  */
6589
6590 struct perf_task_event {
6591         struct task_struct              *task;
6592         struct perf_event_context       *task_ctx;
6593
6594         struct {
6595                 struct perf_event_header        header;
6596
6597                 u32                             pid;
6598                 u32                             ppid;
6599                 u32                             tid;
6600                 u32                             ptid;
6601                 u64                             time;
6602         } event_id;
6603 };
6604
6605 static int perf_event_task_match(struct perf_event *event)
6606 {
6607         return event->attr.comm  || event->attr.mmap ||
6608                event->attr.mmap2 || event->attr.mmap_data ||
6609                event->attr.task;
6610 }
6611
6612 static void perf_event_task_output(struct perf_event *event,
6613                                    void *data)
6614 {
6615         struct perf_task_event *task_event = data;
6616         struct perf_output_handle handle;
6617         struct perf_sample_data sample;
6618         struct task_struct *task = task_event->task;
6619         int ret, size = task_event->event_id.header.size;
6620
6621         if (!perf_event_task_match(event))
6622                 return;
6623
6624         perf_event_header__init_id(&task_event->event_id.header, &sample, event);
6625
6626         ret = perf_output_begin(&handle, event,
6627                                 task_event->event_id.header.size);
6628         if (ret)
6629                 goto out;
6630
6631         task_event->event_id.pid = perf_event_pid(event, task);
6632         task_event->event_id.tid = perf_event_tid(event, task);
6633
6634         if (task_event->event_id.header.type == PERF_RECORD_EXIT) {
6635                 task_event->event_id.ppid = perf_event_pid(event,
6636                                                         task->real_parent);
6637                 task_event->event_id.ptid = perf_event_pid(event,
6638                                                         task->real_parent);
6639         } else {  /* PERF_RECORD_FORK */
6640                 task_event->event_id.ppid = perf_event_pid(event, current);
6641                 task_event->event_id.ptid = perf_event_tid(event, current);
6642         }
6643
6644         task_event->event_id.time = perf_event_clock(event);
6645
6646         perf_output_put(&handle, task_event->event_id);
6647
6648         perf_event__output_id_sample(event, &handle, &sample);
6649
6650         perf_output_end(&handle);
6651 out:
6652         task_event->event_id.header.size = size;
6653 }
6654
6655 static void perf_event_task(struct task_struct *task,
6656                               struct perf_event_context *task_ctx,
6657                               int new)
6658 {
6659         struct perf_task_event task_event;
6660
6661         if (!atomic_read(&nr_comm_events) &&
6662             !atomic_read(&nr_mmap_events) &&
6663             !atomic_read(&nr_task_events))
6664                 return;
6665
6666         task_event = (struct perf_task_event){
6667                 .task     = task,
6668                 .task_ctx = task_ctx,
6669                 .event_id    = {
6670                         .header = {
6671                                 .type = new ? PERF_RECORD_FORK : PERF_RECORD_EXIT,
6672                                 .misc = 0,
6673                                 .size = sizeof(task_event.event_id),
6674                         },
6675                         /* .pid  */
6676                         /* .ppid */
6677                         /* .tid  */
6678                         /* .ptid */
6679                         /* .time */
6680                 },
6681         };
6682
6683         perf_iterate_sb(perf_event_task_output,
6684                        &task_event,
6685                        task_ctx);
6686 }
6687
6688 void perf_event_fork(struct task_struct *task)
6689 {
6690         perf_event_task(task, NULL, 1);
6691         perf_event_namespaces(task);
6692 }
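
/*
 * The record emitted above is struct perf_task_event::event_id followed by
 * whatever sample_id fields perf_event_header__init_id() appends.  As an
 * illustration (values invented): a PERF_RECORD_FORK for a single-threaded
 * child with pid 1234 created by pid 1000 carries pid=1234, tid=1234,
 * ppid=1000, ptid=1000 and time=perf_event_clock() taken at the fork.
 */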
6693
6694 /*
6695  * comm tracking
6696  */
6697
6698 struct perf_comm_event {
6699         struct task_struct      *task;
6700         char                    *comm;
6701         int                     comm_size;
6702
6703         struct {
6704                 struct perf_event_header        header;
6705
6706                 u32                             pid;
6707                 u32                             tid;
6708         } event_id;
6709 };
6710
6711 static int perf_event_comm_match(struct perf_event *event)
6712 {
6713         return event->attr.comm;
6714 }
6715
6716 static void perf_event_comm_output(struct perf_event *event,
6717                                    void *data)
6718 {
6719         struct perf_comm_event *comm_event = data;
6720         struct perf_output_handle handle;
6721         struct perf_sample_data sample;
6722         int size = comm_event->event_id.header.size;
6723         int ret;
6724
6725         if (!perf_event_comm_match(event))
6726                 return;
6727
6728         perf_event_header__init_id(&comm_event->event_id.header, &sample, event);
6729         ret = perf_output_begin(&handle, event,
6730                                 comm_event->event_id.header.size);
6731
6732         if (ret)
6733                 goto out;
6734
6735         comm_event->event_id.pid = perf_event_pid(event, comm_event->task);
6736         comm_event->event_id.tid = perf_event_tid(event, comm_event->task);
6737
6738         perf_output_put(&handle, comm_event->event_id);
6739         __output_copy(&handle, comm_event->comm,
6740                                    comm_event->comm_size);
6741
6742         perf_event__output_id_sample(event, &handle, &sample);
6743
6744         perf_output_end(&handle);
6745 out:
6746         comm_event->event_id.header.size = size;
6747 }
6748
6749 static void perf_event_comm_event(struct perf_comm_event *comm_event)
6750 {
6751         char comm[TASK_COMM_LEN];
6752         unsigned int size;
6753
6754         memset(comm, 0, sizeof(comm));
6755         strlcpy(comm, comm_event->task->comm, sizeof(comm));
6756         size = ALIGN(strlen(comm)+1, sizeof(u64));
6757
6758         comm_event->comm = comm;
6759         comm_event->comm_size = size;
6760
6761         comm_event->event_id.header.size = sizeof(comm_event->event_id) + size;
6762
6763         perf_iterate_sb(perf_event_comm_output,
6764                        comm_event,
6765                        NULL);
6766 }
6767
6768 void perf_event_comm(struct task_struct *task, bool exec)
6769 {
6770         struct perf_comm_event comm_event;
6771
6772         if (!atomic_read(&nr_comm_events))
6773                 return;
6774
6775         comm_event = (struct perf_comm_event){
6776                 .task   = task,
6777                 /* .comm      */
6778                 /* .comm_size */
6779                 .event_id  = {
6780                         .header = {
6781                                 .type = PERF_RECORD_COMM,
6782                                 .misc = exec ? PERF_RECORD_MISC_COMM_EXEC : 0,
6783                                 /* .size */
6784                         },
6785                         /* .pid */
6786                         /* .tid */
6787                 },
6788         };
6789
6790         perf_event_comm_event(&comm_event);
6791 }
6792
6793 /*
6794  * namespaces tracking
6795  */
6796
6797 struct perf_namespaces_event {
6798         struct task_struct              *task;
6799
6800         struct {
6801                 struct perf_event_header        header;
6802
6803                 u32                             pid;
6804                 u32                             tid;
6805                 u64                             nr_namespaces;
6806                 struct perf_ns_link_info        link_info[NR_NAMESPACES];
6807         } event_id;
6808 };
6809
6810 static int perf_event_namespaces_match(struct perf_event *event)
6811 {
6812         return event->attr.namespaces;
6813 }
6814
6815 static void perf_event_namespaces_output(struct perf_event *event,
6816                                          void *data)
6817 {
6818         struct perf_namespaces_event *namespaces_event = data;
6819         struct perf_output_handle handle;
6820         struct perf_sample_data sample;
6821         u16 header_size = namespaces_event->event_id.header.size;
6822         int ret;
6823
6824         if (!perf_event_namespaces_match(event))
6825                 return;
6826
6827         perf_event_header__init_id(&namespaces_event->event_id.header,
6828                                    &sample, event);
6829         ret = perf_output_begin(&handle, event,
6830                                 namespaces_event->event_id.header.size);
6831         if (ret)
6832                 goto out;
6833
6834         namespaces_event->event_id.pid = perf_event_pid(event,
6835                                                         namespaces_event->task);
6836         namespaces_event->event_id.tid = perf_event_tid(event,
6837                                                         namespaces_event->task);
6838
6839         perf_output_put(&handle, namespaces_event->event_id);
6840
6841         perf_event__output_id_sample(event, &handle, &sample);
6842
6843         perf_output_end(&handle);
6844 out:
6845         namespaces_event->event_id.header.size = header_size;
6846 }
6847
6848 static void perf_fill_ns_link_info(struct perf_ns_link_info *ns_link_info,
6849                                    struct task_struct *task,
6850                                    const struct proc_ns_operations *ns_ops)
6851 {
6852         struct path ns_path;
6853         struct inode *ns_inode;
6854         void *error;
6855
6856         error = ns_get_path(&ns_path, task, ns_ops);
6857         if (!error) {
6858                 ns_inode = ns_path.dentry->d_inode;
6859                 ns_link_info->dev = new_encode_dev(ns_inode->i_sb->s_dev);
6860                 ns_link_info->ino = ns_inode->i_ino;
6861                 path_put(&ns_path);
6862         }
6863 }
6864
6865 void perf_event_namespaces(struct task_struct *task)
6866 {
6867         struct perf_namespaces_event namespaces_event;
6868         struct perf_ns_link_info *ns_link_info;
6869
6870         if (!atomic_read(&nr_namespaces_events))
6871                 return;
6872
6873         namespaces_event = (struct perf_namespaces_event){
6874                 .task   = task,
6875                 .event_id  = {
6876                         .header = {
6877                                 .type = PERF_RECORD_NAMESPACES,
6878                                 .misc = 0,
6879                                 .size = sizeof(namespaces_event.event_id),
6880                         },
6881                         /* .pid */
6882                         /* .tid */
6883                         .nr_namespaces = NR_NAMESPACES,
6884                         /* .link_info[NR_NAMESPACES] */
6885                 },
6886         };
6887
6888         ns_link_info = namespaces_event.event_id.link_info;
6889
6890         perf_fill_ns_link_info(&ns_link_info[MNT_NS_INDEX],
6891                                task, &mntns_operations);
6892
6893 #ifdef CONFIG_USER_NS
6894         perf_fill_ns_link_info(&ns_link_info[USER_NS_INDEX],
6895                                task, &userns_operations);
6896 #endif
6897 #ifdef CONFIG_NET_NS
6898         perf_fill_ns_link_info(&ns_link_info[NET_NS_INDEX],
6899                                task, &netns_operations);
6900 #endif
6901 #ifdef CONFIG_UTS_NS
6902         perf_fill_ns_link_info(&ns_link_info[UTS_NS_INDEX],
6903                                task, &utsns_operations);
6904 #endif
6905 #ifdef CONFIG_IPC_NS
6906         perf_fill_ns_link_info(&ns_link_info[IPC_NS_INDEX],
6907                                task, &ipcns_operations);
6908 #endif
6909 #ifdef CONFIG_PID_NS
6910         perf_fill_ns_link_info(&ns_link_info[PID_NS_INDEX],
6911                                task, &pidns_operations);
6912 #endif
6913 #ifdef CONFIG_CGROUPS
6914         perf_fill_ns_link_info(&ns_link_info[CGROUP_NS_INDEX],
6915                                task, &cgroupns_operations);
6916 #endif
6917
6918         perf_iterate_sb(perf_event_namespaces_output,
6919                         &namespaces_event,
6920                         NULL);
6921 }
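
/*
 * Note that namespaces_event is built with a designated initializer, so
 * the link_info[] slots for namespace types that are compiled out above
 * simply stay zeroed; userspace still sees nr_namespaces == NR_NAMESPACES
 * entries either way.
 */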
6922
6923 /*
6924  * mmap tracking
6925  */
6926
6927 struct perf_mmap_event {
6928         struct vm_area_struct   *vma;
6929
6930         const char              *file_name;
6931         int                     file_size;
6932         int                     maj, min;
6933         u64                     ino;
6934         u64                     ino_generation;
6935         u32                     prot, flags;
6936
6937         struct {
6938                 struct perf_event_header        header;
6939
6940                 u32                             pid;
6941                 u32                             tid;
6942                 u64                             start;
6943                 u64                             len;
6944                 u64                             pgoff;
6945         } event_id;
6946 };
6947
6948 static int perf_event_mmap_match(struct perf_event *event,
6949                                  void *data)
6950 {
6951         struct perf_mmap_event *mmap_event = data;
6952         struct vm_area_struct *vma = mmap_event->vma;
6953         int executable = vma->vm_flags & VM_EXEC;
6954
6955         return (!executable && event->attr.mmap_data) ||
6956                (executable && (event->attr.mmap || event->attr.mmap2));
6957 }
6958
6959 static void perf_event_mmap_output(struct perf_event *event,
6960                                    void *data)
6961 {
6962         struct perf_mmap_event *mmap_event = data;
6963         struct perf_output_handle handle;
6964         struct perf_sample_data sample;
6965         int size = mmap_event->event_id.header.size;
6966         u32 type = mmap_event->event_id.header.type;
6967         int ret;
6968
6969         if (!perf_event_mmap_match(event, data))
6970                 return;
6971
6972         if (event->attr.mmap2) {
6973                 mmap_event->event_id.header.type = PERF_RECORD_MMAP2;
6974                 mmap_event->event_id.header.size += sizeof(mmap_event->maj);
6975                 mmap_event->event_id.header.size += sizeof(mmap_event->min);
6976                 mmap_event->event_id.header.size += sizeof(mmap_event->ino);
6977                 mmap_event->event_id.header.size += sizeof(mmap_event->ino_generation);
6978                 mmap_event->event_id.header.size += sizeof(mmap_event->prot);
6979                 mmap_event->event_id.header.size += sizeof(mmap_event->flags);
6980         }
6981
6982         perf_event_header__init_id(&mmap_event->event_id.header, &sample, event);
6983         ret = perf_output_begin(&handle, event,
6984                                 mmap_event->event_id.header.size);
6985         if (ret)
6986                 goto out;
6987
6988         mmap_event->event_id.pid = perf_event_pid(event, current);
6989         mmap_event->event_id.tid = perf_event_tid(event, current);
6990
6991         perf_output_put(&handle, mmap_event->event_id);
6992
6993         if (event->attr.mmap2) {
6994                 perf_output_put(&handle, mmap_event->maj);
6995                 perf_output_put(&handle, mmap_event->min);
6996                 perf_output_put(&handle, mmap_event->ino);
6997                 perf_output_put(&handle, mmap_event->ino_generation);
6998                 perf_output_put(&handle, mmap_event->prot);
6999                 perf_output_put(&handle, mmap_event->flags);
7000         }
7001
7002         __output_copy(&handle, mmap_event->file_name,
7003                                    mmap_event->file_size);
7004
7005         perf_event__output_id_sample(event, &handle, &sample);
7006
7007         perf_output_end(&handle);
7008 out:
7009         mmap_event->event_id.header.size = size;
7010         mmap_event->event_id.header.type = type;
7011 }
7012
7013 static void perf_event_mmap_event(struct perf_mmap_event *mmap_event)
7014 {
7015         struct vm_area_struct *vma = mmap_event->vma;
7016         struct file *file = vma->vm_file;
7017         int maj = 0, min = 0;
7018         u64 ino = 0, gen = 0;
7019         u32 prot = 0, flags = 0;
7020         unsigned int size;
7021         char tmp[16];
7022         char *buf = NULL;
7023         char *name;
7024
7025         if (vma->vm_flags & VM_READ)
7026                 prot |= PROT_READ;
7027         if (vma->vm_flags & VM_WRITE)
7028                 prot |= PROT_WRITE;
7029         if (vma->vm_flags & VM_EXEC)
7030                 prot |= PROT_EXEC;
7031
7032         if (vma->vm_flags & VM_MAYSHARE)
7033                 flags = MAP_SHARED;
7034         else
7035                 flags = MAP_PRIVATE;
7036
7037         if (vma->vm_flags & VM_DENYWRITE)
7038                 flags |= MAP_DENYWRITE;
7039         if (vma->vm_flags & VM_MAYEXEC)
7040                 flags |= MAP_EXECUTABLE;
7041         if (vma->vm_flags & VM_LOCKED)
7042                 flags |= MAP_LOCKED;
7043         if (vma->vm_flags & VM_HUGETLB)
7044                 flags |= MAP_HUGETLB;
7045
7046         if (file) {
7047                 struct inode *inode;
7048                 dev_t dev;
7049
7050                 buf = kmalloc(PATH_MAX, GFP_KERNEL);
7051                 if (!buf) {
7052                         name = "//enomem";
7053                         goto cpy_name;
7054                 }
7055                 /*
7056                  * d_path() fills the buffer from the end backwards, so we
7057                  * need to add enough zero bytes after the string to handle
7058                  * the 64bit alignment we do later.
7059                  */
7060                 name = file_path(file, buf, PATH_MAX - sizeof(u64));
7061                 if (IS_ERR(name)) {
7062                         name = "//toolong";
7063                         goto cpy_name;
7064                 }
7065                 inode = file_inode(vma->vm_file);
7066                 dev = inode->i_sb->s_dev;
7067                 ino = inode->i_ino;
7068                 gen = inode->i_generation;
7069                 maj = MAJOR(dev);
7070                 min = MINOR(dev);
7071
7072                 goto got_name;
7073         } else {
7074                 if (vma->vm_ops && vma->vm_ops->name) {
7075                         name = (char *) vma->vm_ops->name(vma);
7076                         if (name)
7077                                 goto cpy_name;
7078                 }
7079
7080                 name = (char *)arch_vma_name(vma);
7081                 if (name)
7082                         goto cpy_name;
7083
7084                 if (vma->vm_start <= vma->vm_mm->start_brk &&
7085                                 vma->vm_end >= vma->vm_mm->brk) {
7086                         name = "[heap]";
7087                         goto cpy_name;
7088                 }
7089                 if (vma->vm_start <= vma->vm_mm->start_stack &&
7090                                 vma->vm_end >= vma->vm_mm->start_stack) {
7091                         name = "[stack]";
7092                         goto cpy_name;
7093                 }
7094
7095                 name = "//anon";
7096                 goto cpy_name;
7097         }
7098
7099 cpy_name:
7100         strlcpy(tmp, name, sizeof(tmp));
7101         name = tmp;
7102 got_name:
7103         /*
7104          * Since our buffer works in 8-byte units we need to align our string
7105          * size to a multiple of 8. However, we must guarantee the tail end is
7106          * zeroed out to avoid leaking random bits to userspace.
7107          */
7108         size = strlen(name)+1;
7109         while (!IS_ALIGNED(size, sizeof(u64)))
7110                 name[size++] = '\0';
7111
7112         mmap_event->file_name = name;
7113         mmap_event->file_size = size;
7114         mmap_event->maj = maj;
7115         mmap_event->min = min;
7116         mmap_event->ino = ino;
7117         mmap_event->ino_generation = gen;
7118         mmap_event->prot = prot;
7119         mmap_event->flags = flags;
7120
7121         if (!(vma->vm_flags & VM_EXEC))
7122                 mmap_event->event_id.header.misc |= PERF_RECORD_MISC_MMAP_DATA;
7123
7124         mmap_event->event_id.header.size = sizeof(mmap_event->event_id) + size;
7125
7126         perf_iterate_sb(perf_event_mmap_output,
7127                        mmap_event,
7128                        NULL);
7129
7130         kfree(buf);
7131 }
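
/*
 * Worked example of the padding above (path invented): for a mapping of
 * "/usr/lib/libfoo.so" strlen()+1 is 19, so five extra NUL bytes bring
 * file_size to 24 and keep the following sample_id fields 8-byte aligned
 * in the output record.
 */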
7132
7133 /*
7134  * Check whether inode and address range match filter criteria.
7135  */
7136 static bool perf_addr_filter_match(struct perf_addr_filter *filter,
7137                                      struct file *file, unsigned long offset,
7138                                      unsigned long size)
7139 {
7140         /* d_inode(NULL) won't be equal to any mapped user-space file */
7141         if (!filter->path.dentry)
7142                 return false;
7143
7144         if (d_inode(filter->path.dentry) != file_inode(file))
7145                 return false;
7146
7147         if (filter->offset > offset + size)
7148                 return false;
7149
7150         if (filter->offset + filter->size < offset)
7151                 return false;
7152
7153         return true;
7154 }
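
/*
 * The two range checks above are a plain interval-overlap test between the
 * filter's file range and the file range covered by the mapping.
 * Illustration (numbers invented): a filter at file offset 0x1000 with
 * size 0x100 matches a vma mapping file offsets [0x0, 0x2000), but not one
 * mapping [0x2000, 0x3000), since the filter ends before that mapping
 * starts.
 */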
7155
7156 static void __perf_addr_filters_adjust(struct perf_event *event, void *data)
7157 {
7158         struct perf_addr_filters_head *ifh = perf_event_addr_filters(event);
7159         struct vm_area_struct *vma = data;
7160         unsigned long off = vma->vm_pgoff << PAGE_SHIFT, flags;
7161         struct file *file = vma->vm_file;
7162         struct perf_addr_filter *filter;
7163         unsigned int restart = 0, count = 0;
7164
7165         if (!has_addr_filter(event))
7166                 return;
7167
7168         if (!file)
7169                 return;
7170
7171         raw_spin_lock_irqsave(&ifh->lock, flags);
7172         list_for_each_entry(filter, &ifh->list, entry) {
7173                 if (perf_addr_filter_match(filter, file, off,
7174                                              vma->vm_end - vma->vm_start)) {
7175                         event->addr_filters_offs[count] = vma->vm_start;
7176                         restart++;
7177                 }
7178
7179                 count++;
7180         }
7181
7182         if (restart)
7183                 event->addr_filters_gen++;
7184         raw_spin_unlock_irqrestore(&ifh->lock, flags);
7185
7186         if (restart)
7187                 perf_event_stop(event, 1);
7188 }
7189
7190 /*
7191  * Adjust all task's events' filters to the new vma
7192  */
7193 static void perf_addr_filters_adjust(struct vm_area_struct *vma)
7194 {
7195         struct perf_event_context *ctx;
7196         int ctxn;
7197
7198         /*
7199          * Data tracing isn't supported yet, so there is no need to keep
7200          * track of anything that isn't related to executable code.
7201          */
7202         if (!(vma->vm_flags & VM_EXEC))
7203                 return;
7204
7205         rcu_read_lock();
7206         for_each_task_context_nr(ctxn) {
7207                 ctx = rcu_dereference(current->perf_event_ctxp[ctxn]);
7208                 if (!ctx)
7209                         continue;
7210
7211                 perf_iterate_ctx(ctx, __perf_addr_filters_adjust, vma, true);
7212         }
7213         rcu_read_unlock();
7214 }
7215
7216 void perf_event_mmap(struct vm_area_struct *vma)
7217 {
7218         struct perf_mmap_event mmap_event;
7219
7220         if (!atomic_read(&nr_mmap_events))
7221                 return;
7222
7223         mmap_event = (struct perf_mmap_event){
7224                 .vma    = vma,
7225                 /* .file_name */
7226                 /* .file_size */
7227                 .event_id  = {
7228                         .header = {
7229                                 .type = PERF_RECORD_MMAP,
7230                                 .misc = PERF_RECORD_MISC_USER,
7231                                 /* .size */
7232                         },
7233                         /* .pid */
7234                         /* .tid */
7235                         .start  = vma->vm_start,
7236                         .len    = vma->vm_end - vma->vm_start,
7237                         .pgoff  = (u64)vma->vm_pgoff << PAGE_SHIFT,
7238                 },
7239                 /* .maj (attr_mmap2 only) */
7240                 /* .min (attr_mmap2 only) */
7241                 /* .ino (attr_mmap2 only) */
7242                 /* .ino_generation (attr_mmap2 only) */
7243                 /* .prot (attr_mmap2 only) */
7244                 /* .flags (attr_mmap2 only) */
7245         };
7246
7247         perf_addr_filters_adjust(vma);
7248         perf_event_mmap_event(&mmap_event);
7249 }
7250
7251 void perf_event_aux_event(struct perf_event *event, unsigned long head,
7252                           unsigned long size, u64 flags)
7253 {
7254         struct perf_output_handle handle;
7255         struct perf_sample_data sample;
7256         struct perf_aux_event {
7257                 struct perf_event_header        header;
7258                 u64                             offset;
7259                 u64                             size;
7260                 u64                             flags;
7261         } rec = {
7262                 .header = {
7263                         .type = PERF_RECORD_AUX,
7264                         .misc = 0,
7265                         .size = sizeof(rec),
7266                 },
7267                 .offset         = head,
7268                 .size           = size,
7269                 .flags          = flags,
7270         };
7271         int ret;
7272
7273         perf_event_header__init_id(&rec.header, &sample, event);
7274         ret = perf_output_begin(&handle, event, rec.header.size);
7275
7276         if (ret)
7277                 return;
7278
7279         perf_output_put(&handle, rec);
7280         perf_event__output_id_sample(event, &handle, &sample);
7281
7282         perf_output_end(&handle);
7283 }
7284
7285 /*
7286  * Lost/dropped samples logging
7287  */
7288 void perf_log_lost_samples(struct perf_event *event, u64 lost)
7289 {
7290         struct perf_output_handle handle;
7291         struct perf_sample_data sample;
7292         int ret;
7293
7294         struct {
7295                 struct perf_event_header        header;
7296                 u64                             lost;
7297         } lost_samples_event = {
7298                 .header = {
7299                         .type = PERF_RECORD_LOST_SAMPLES,
7300                         .misc = 0,
7301                         .size = sizeof(lost_samples_event),
7302                 },
7303                 .lost           = lost,
7304         };
7305
7306         perf_event_header__init_id(&lost_samples_event.header, &sample, event);
7307
7308         ret = perf_output_begin(&handle, event,
7309                                 lost_samples_event.header.size);
7310         if (ret)
7311                 return;
7312
7313         perf_output_put(&handle, lost_samples_event);
7314         perf_event__output_id_sample(event, &handle, &sample);
7315         perf_output_end(&handle);
7316 }
7317
7318 /*
7319  * context_switch tracking
7320  */
7321
7322 struct perf_switch_event {
7323         struct task_struct      *task;
7324         struct task_struct      *next_prev;
7325
7326         struct {
7327                 struct perf_event_header        header;
7328                 u32                             next_prev_pid;
7329                 u32                             next_prev_tid;
7330         } event_id;
7331 };
7332
7333 static int perf_event_switch_match(struct perf_event *event)
7334 {
7335         return event->attr.context_switch;
7336 }
7337
7338 static void perf_event_switch_output(struct perf_event *event, void *data)
7339 {
7340         struct perf_switch_event *se = data;
7341         struct perf_output_handle handle;
7342         struct perf_sample_data sample;
7343         int ret;
7344
7345         if (!perf_event_switch_match(event))
7346                 return;
7347
7348         /* Only CPU-wide events are allowed to see next/prev pid/tid */
7349         if (event->ctx->task) {
7350                 se->event_id.header.type = PERF_RECORD_SWITCH;
7351                 se->event_id.header.size = sizeof(se->event_id.header);
7352         } else {
7353                 se->event_id.header.type = PERF_RECORD_SWITCH_CPU_WIDE;
7354                 se->event_id.header.size = sizeof(se->event_id);
7355                 se->event_id.next_prev_pid =
7356                                         perf_event_pid(event, se->next_prev);
7357                 se->event_id.next_prev_tid =
7358                                         perf_event_tid(event, se->next_prev);
7359         }
7360
7361         perf_event_header__init_id(&se->event_id.header, &sample, event);
7362
7363         ret = perf_output_begin(&handle, event, se->event_id.header.size);
7364         if (ret)
7365                 return;
7366
7367         if (event->ctx->task)
7368                 perf_output_put(&handle, se->event_id.header);
7369         else
7370                 perf_output_put(&handle, se->event_id);
7371
7372         perf_event__output_id_sample(event, &handle, &sample);
7373
7374         perf_output_end(&handle);
7375 }
7376
7377 static void perf_event_switch(struct task_struct *task,
7378                               struct task_struct *next_prev, bool sched_in)
7379 {
7380         struct perf_switch_event switch_event;
7381
7382         /* N.B. caller checks nr_switch_events != 0 */
7383
7384         switch_event = (struct perf_switch_event){
7385                 .task           = task,
7386                 .next_prev      = next_prev,
7387                 .event_id       = {
7388                         .header = {
7389                                 /* .type */
7390                                 .misc = sched_in ? 0 : PERF_RECORD_MISC_SWITCH_OUT,
7391                                 /* .size */
7392                         },
7393                         /* .next_prev_pid */
7394                         /* .next_prev_tid */
7395                 },
7396         };
7397
7398         perf_iterate_sb(perf_event_switch_output,
7399                        &switch_event,
7400                        NULL);
7401 }
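
/*
 * Record size example for the two cases above: a per-task event only ever
 * sees the bare PERF_RECORD_SWITCH header (plus sample_id), while a
 * CPU-wide event gets PERF_RECORD_SWITCH_CPU_WIDE with next_prev_pid and
 * next_prev_tid filled from the task being switched in or out, so the
 * CPU-wide record is 2 * sizeof(u32) == 8 bytes larger.
 */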
7402
7403 /*
7404  * IRQ throttle logging
7405  */
7406
7407 static void perf_log_throttle(struct perf_event *event, int enable)
7408 {
7409         struct perf_output_handle handle;
7410         struct perf_sample_data sample;
7411         int ret;
7412
7413         struct {
7414                 struct perf_event_header        header;
7415                 u64                             time;
7416                 u64                             id;
7417                 u64                             stream_id;
7418         } throttle_event = {
7419                 .header = {
7420                         .type = PERF_RECORD_THROTTLE,
7421                         .misc = 0,
7422                         .size = sizeof(throttle_event),
7423                 },
7424                 .time           = perf_event_clock(event),
7425                 .id             = primary_event_id(event),
7426                 .stream_id      = event->id,
7427         };
7428
7429         if (enable)
7430                 throttle_event.header.type = PERF_RECORD_UNTHROTTLE;
7431
7432         perf_event_header__init_id(&throttle_event.header, &sample, event);
7433
7434         ret = perf_output_begin(&handle, event,
7435                                 throttle_event.header.size);
7436         if (ret)
7437                 return;
7438
7439         perf_output_put(&handle, throttle_event);
7440         perf_event__output_id_sample(event, &handle, &sample);
7441         perf_output_end(&handle);
7442 }
7443
7444 void perf_event_itrace_started(struct perf_event *event)
7445 {
7446         event->attach_state |= PERF_ATTACH_ITRACE;
7447 }
7448
7449 static void perf_log_itrace_start(struct perf_event *event)
7450 {
7451         struct perf_output_handle handle;
7452         struct perf_sample_data sample;
7453         struct perf_aux_event {
7454                 struct perf_event_header        header;
7455                 u32                             pid;
7456                 u32                             tid;
7457         } rec;
7458         int ret;
7459
7460         if (event->parent)
7461                 event = event->parent;
7462
7463         if (!(event->pmu->capabilities & PERF_PMU_CAP_ITRACE) ||
7464             event->attach_state & PERF_ATTACH_ITRACE)
7465                 return;
7466
7467         rec.header.type = PERF_RECORD_ITRACE_START;
7468         rec.header.misc = 0;
7469         rec.header.size = sizeof(rec);
7470         rec.pid = perf_event_pid(event, current);
7471         rec.tid = perf_event_tid(event, current);
7472
7473         perf_event_header__init_id(&rec.header, &sample, event);
7474         ret = perf_output_begin(&handle, event, rec.header.size);
7475
7476         if (ret)
7477                 return;
7478
7479         perf_output_put(&handle, rec);
7480         perf_event__output_id_sample(event, &handle, &sample);
7481
7482         perf_output_end(&handle);
7483 }
7484
7485 static int
7486 __perf_event_account_interrupt(struct perf_event *event, int throttle)
7487 {
7488         struct hw_perf_event *hwc = &event->hw;
7489         int ret = 0;
7490         u64 seq;
7491
7492         seq = __this_cpu_read(perf_throttled_seq);
7493         if (seq != hwc->interrupts_seq) {
7494                 hwc->interrupts_seq = seq;
7495                 hwc->interrupts = 1;
7496         } else {
7497                 hwc->interrupts++;
7498                 if (unlikely(throttle &&
7499                              hwc->interrupts > max_samples_per_tick)) {
7500                         __this_cpu_inc(perf_throttled_count);
7501                         tick_dep_set_cpu(smp_processor_id(), TICK_DEP_BIT_PERF_EVENTS);
7502                         hwc->interrupts = MAX_INTERRUPTS;
7503                         perf_log_throttle(event, 0);
7504                         ret = 1;
7505                 }
7506         }
7507
7508         if (event->attr.freq) {
7509                 u64 now = perf_clock();
7510                 s64 delta = now - hwc->freq_time_stamp;
7511
7512                 hwc->freq_time_stamp = now;
7513
7514                 if (delta > 0 && delta < 2*TICK_NSEC)
7515                         perf_adjust_period(event, delta, hwc->last_period, true);
7516         }
7517
7518         return ret;
7519 }
7520
7521 int perf_event_account_interrupt(struct perf_event *event)
7522 {
7523         return __perf_event_account_interrupt(event, 1);
7524 }
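
/*
 * Rough throttling arithmetic for the check above (default values assumed):
 * max_samples_per_tick is the perf_event_max_sample_rate sysctl spread
 * across ticks, i.e. roughly 100000 / HZ interrupts per tick with the
 * default limit.  An event that exceeds it within one throttle period
 * (roughly one tick) is parked at MAX_INTERRUPTS, a PERF_RECORD_THROTTLE
 * is logged, and the tick-side code unthrottles it again later.
 */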
7525
7526 /*
7527  * Generic event overflow handling, sampling.
7528  */
7529
7530 static int __perf_event_overflow(struct perf_event *event,
7531                                    int throttle, struct perf_sample_data *data,
7532                                    struct pt_regs *regs)
7533 {
7534         int events = atomic_read(&event->event_limit);
7535         int ret = 0;
7536
7537         /*
7538          * Non-sampling counters might still use the PMI to fold short
7539          * hardware counters; ignore those.
7540          */
7541         if (unlikely(!is_sampling_event(event)))
7542                 return 0;
7543
7544         ret = __perf_event_account_interrupt(event, throttle);
7545
7546         /*
7547          * XXX event_limit might not quite work as expected on inherited
7548          * events
7549          */
7550
7551         event->pending_kill = POLL_IN;
7552         if (events && atomic_dec_and_test(&event->event_limit)) {
7553                 ret = 1;
7554                 event->pending_kill = POLL_HUP;
7555
7556                 perf_event_disable_inatomic(event);
7557         }
7558
7559         READ_ONCE(event->overflow_handler)(event, data, regs);
7560
7561         if (*perf_event_fasync(event) && event->pending_kill) {
7562                 event->pending_wakeup = 1;
7563                 irq_work_queue(&event->pending);
7564         }
7565
7566         return ret;
7567 }
7568
7569 int perf_event_overflow(struct perf_event *event,
7570                           struct perf_sample_data *data,
7571                           struct pt_regs *regs)
7572 {
7573         return __perf_event_overflow(event, 1, data, regs);
7574 }
7575
7576 /*
7577  * Generic software event infrastructure
7578  */
7579
7580 struct swevent_htable {
7581         struct swevent_hlist            *swevent_hlist;
7582         struct mutex                    hlist_mutex;
7583         int                             hlist_refcount;
7584
7585         /* Recursion avoidance in each contexts */
7586         int                             recursion[PERF_NR_CONTEXTS];
7587 };
7588
7589 static DEFINE_PER_CPU(struct swevent_htable, swevent_htable);
7590
7591 /*
7592  * We directly increment event->count and keep a second value in
7593  * event->hw.period_left to count intervals. This period counter is
7594  * kept in the range [-sample_period, 0] so that we can use its sign
7595  * as the overflow trigger.
7596  */
7597
7598 u64 perf_swevent_set_period(struct perf_event *event)
7599 {
7600         struct hw_perf_event *hwc = &event->hw;
7601         u64 period = hwc->last_period;
7602         u64 nr, offset;
7603         s64 old, val;
7604
7605         hwc->last_period = hwc->sample_period;
7606
7607 again:
7608         old = val = local64_read(&hwc->period_left);
7609         if (val < 0)
7610                 return 0;
7611
7612         nr = div64_u64(period + val, period);
7613         offset = nr * period;
7614         val -= offset;
7615         if (local64_cmpxchg(&hwc->period_left, old, val) != old)
7616                 goto again;
7617
7618         return nr;
7619 }
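
/*
 * Worked example of the math above (numbers invented): with
 * hwc->sample_period == 4 and period_left having climbed to +5 since the
 * last overflow, nr = (4 + 5) / 4 = 2, offset = 8, and period_left is
 * rewound to -3 -- back inside [-sample_period, 0] -- while the caller is
 * told to account two overflows.
 */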
7620
7621 static void perf_swevent_overflow(struct perf_event *event, u64 overflow,
7622                                     struct perf_sample_data *data,
7623                                     struct pt_regs *regs)
7624 {
7625         struct hw_perf_event *hwc = &event->hw;
7626         int throttle = 0;
7627
7628         if (!overflow)
7629                 overflow = perf_swevent_set_period(event);
7630
7631         if (hwc->interrupts == MAX_INTERRUPTS)
7632                 return;
7633
7634         for (; overflow; overflow--) {
7635                 if (__perf_event_overflow(event, throttle,
7636                                             data, regs)) {
7637                         /*
7638                          * We inhibit the overflow from happening when
7639                          * hwc->interrupts == MAX_INTERRUPTS.
7640                          */
7641                         break;
7642                 }
7643                 throttle = 1;
7644         }
7645 }
7646
7647 static void perf_swevent_event(struct perf_event *event, u64 nr,
7648                                struct perf_sample_data *data,
7649                                struct pt_regs *regs)
7650 {
7651         struct hw_perf_event *hwc = &event->hw;
7652
7653         local64_add(nr, &event->count);
7654
7655         if (!regs)
7656                 return;
7657
7658         if (!is_sampling_event(event))
7659                 return;
7660
7661         if ((event->attr.sample_type & PERF_SAMPLE_PERIOD) && !event->attr.freq) {
7662                 data->period = nr;
7663                 return perf_swevent_overflow(event, 1, data, regs);
7664         } else
7665                 data->period = event->hw.last_period;
7666
7667         if (nr == 1 && hwc->sample_period == 1 && !event->attr.freq)
7668                 return perf_swevent_overflow(event, 1, data, regs);
7669
7670         if (local64_add_negative(nr, &hwc->period_left))
7671                 return;
7672
7673         perf_swevent_overflow(event, 0, data, regs);
7674 }
7675
7676 static int perf_exclude_event(struct perf_event *event,
7677                               struct pt_regs *regs)
7678 {
7679         if (event->hw.state & PERF_HES_STOPPED)
7680                 return 1;
7681
7682         if (regs) {
7683                 if (event->attr.exclude_user && user_mode(regs))
7684                         return 1;
7685
7686                 if (event->attr.exclude_kernel && !user_mode(regs))
7687                         return 1;
7688         }
7689
7690         return 0;
7691 }
7692
7693 static int perf_swevent_match(struct perf_event *event,
7694                                 enum perf_type_id type,
7695                                 u32 event_id,
7696                                 struct perf_sample_data *data,
7697                                 struct pt_regs *regs)
7698 {
7699         if (event->attr.type != type)
7700                 return 0;
7701
7702         if (event->attr.config != event_id)
7703                 return 0;
7704
7705         if (perf_exclude_event(event, regs))
7706                 return 0;
7707
7708         return 1;
7709 }
7710
7711 static inline u64 swevent_hash(u64 type, u32 event_id)
7712 {
7713         u64 val = event_id | (type << 32);
7714
7715         return hash_64(val, SWEVENT_HLIST_BITS);
7716 }
7717
7718 static inline struct hlist_head *
7719 __find_swevent_head(struct swevent_hlist *hlist, u64 type, u32 event_id)
7720 {
7721         u64 hash = swevent_hash(type, event_id);
7722
7723         return &hlist->heads[hash];
7724 }
7725
7726 /* For the read side: events when they trigger */
7727 static inline struct hlist_head *
7728 find_swevent_head_rcu(struct swevent_htable *swhash, u64 type, u32 event_id)
7729 {
7730         struct swevent_hlist *hlist;
7731
7732         hlist = rcu_dereference(swhash->swevent_hlist);
7733         if (!hlist)
7734                 return NULL;
7735
7736         return __find_swevent_head(hlist, type, event_id);
7737 }
7738
7739 /* For the event head insertion and removal in the hlist */
7740 static inline struct hlist_head *
7741 find_swevent_head(struct swevent_htable *swhash, struct perf_event *event)
7742 {
7743         struct swevent_hlist *hlist;
7744         u32 event_id = event->attr.config;
7745         u64 type = event->attr.type;
7746
7747         /*
7748          * Event scheduling is always serialized against hlist allocation
7749          * and release, which makes the protected version suitable here;
7750          * the context lock guarantees that.
7751          */
7752         hlist = rcu_dereference_protected(swhash->swevent_hlist,
7753                                           lockdep_is_held(&event->ctx->lock));
7754         if (!hlist)
7755                 return NULL;
7756
7757         return __find_swevent_head(hlist, type, event_id);
7758 }
7759
7760 static void do_perf_sw_event(enum perf_type_id type, u32 event_id,
7761                                     u64 nr,
7762                                     struct perf_sample_data *data,
7763                                     struct pt_regs *regs)
7764 {
7765         struct swevent_htable *swhash = this_cpu_ptr(&swevent_htable);
7766         struct perf_event *event;
7767         struct hlist_head *head;
7768
7769         rcu_read_lock();
7770         head = find_swevent_head_rcu(swhash, type, event_id);
7771         if (!head)
7772                 goto end;
7773
7774         hlist_for_each_entry_rcu(event, head, hlist_entry) {
7775                 if (perf_swevent_match(event, type, event_id, data, regs))
7776                         perf_swevent_event(event, nr, data, regs);
7777         }
7778 end:
7779         rcu_read_unlock();
7780 }
7781
7782 DEFINE_PER_CPU(struct pt_regs, __perf_regs[4]);
7783
7784 int perf_swevent_get_recursion_context(void)
7785 {
7786         struct swevent_htable *swhash = this_cpu_ptr(&swevent_htable);
7787
7788         return get_recursion_context(swhash->recursion);
7789 }
7790 EXPORT_SYMBOL_GPL(perf_swevent_get_recursion_context);
7791
7792 void perf_swevent_put_recursion_context(int rctx)
7793 {
7794         struct swevent_htable *swhash = this_cpu_ptr(&swevent_htable);
7795
7796         put_recursion_context(swhash->recursion, rctx);
7797 }
7798
7799 void ___perf_sw_event(u32 event_id, u64 nr, struct pt_regs *regs, u64 addr)
7800 {
7801         struct perf_sample_data data;
7802
7803         if (WARN_ON_ONCE(!regs))
7804                 return;
7805
7806         perf_sample_data_init(&data, addr, 0);
7807         do_perf_sw_event(PERF_TYPE_SOFTWARE, event_id, nr, &data, regs);
7808 }
7809
7810 void __perf_sw_event(u32 event_id, u64 nr, struct pt_regs *regs, u64 addr)
7811 {
7812         int rctx;
7813
7814         preempt_disable_notrace();
7815         rctx = perf_swevent_get_recursion_context();
7816         if (unlikely(rctx < 0))
7817                 goto fail;
7818
7819         ___perf_sw_event(event_id, nr, regs, addr);
7820
7821         perf_swevent_put_recursion_context(rctx);
7822 fail:
7823         preempt_enable_notrace();
7824 }
7825
7826 static void perf_swevent_read(struct perf_event *event)
7827 {
7828 }
7829
7830 static int perf_swevent_add(struct perf_event *event, int flags)
7831 {
7832         struct swevent_htable *swhash = this_cpu_ptr(&swevent_htable);
7833         struct hw_perf_event *hwc = &event->hw;
7834         struct hlist_head *head;
7835
7836         if (is_sampling_event(event)) {
7837                 hwc->last_period = hwc->sample_period;
7838                 perf_swevent_set_period(event);
7839         }
7840
7841         hwc->state = !(flags & PERF_EF_START);
7842
7843         head = find_swevent_head(swhash, event);
7844         if (WARN_ON_ONCE(!head))
7845                 return -EINVAL;
7846
7847         hlist_add_head_rcu(&event->hlist_entry, head);
7848         perf_event_update_userpage(event);
7849
7850         return 0;
7851 }
7852
7853 static void perf_swevent_del(struct perf_event *event, int flags)
7854 {
7855         hlist_del_rcu(&event->hlist_entry);
7856 }
7857
7858 static void perf_swevent_start(struct perf_event *event, int flags)
7859 {
7860         event->hw.state = 0;
7861 }
7862
7863 static void perf_swevent_stop(struct perf_event *event, int flags)
7864 {
7865         event->hw.state = PERF_HES_STOPPED;
7866 }
7867
7868 /* Deref the hlist from the update side */
7869 static inline struct swevent_hlist *
7870 swevent_hlist_deref(struct swevent_htable *swhash)
7871 {
7872         return rcu_dereference_protected(swhash->swevent_hlist,
7873                                          lockdep_is_held(&swhash->hlist_mutex));
7874 }
7875
7876 static void swevent_hlist_release(struct swevent_htable *swhash)
7877 {
7878         struct swevent_hlist *hlist = swevent_hlist_deref(swhash);
7879
7880         if (!hlist)
7881                 return;
7882
7883         RCU_INIT_POINTER(swhash->swevent_hlist, NULL);
7884         kfree_rcu(hlist, rcu_head);
7885 }
7886
7887 static void swevent_hlist_put_cpu(int cpu)
7888 {
7889         struct swevent_htable *swhash = &per_cpu(swevent_htable, cpu);
7890
7891         mutex_lock(&swhash->hlist_mutex);
7892
7893         if (!--swhash->hlist_refcount)
7894                 swevent_hlist_release(swhash);
7895
7896         mutex_unlock(&swhash->hlist_mutex);
7897 }
7898
7899 static void swevent_hlist_put(void)
7900 {
7901         int cpu;
7902
7903         for_each_possible_cpu(cpu)
7904                 swevent_hlist_put_cpu(cpu);
7905 }
7906
7907 static int swevent_hlist_get_cpu(int cpu)
7908 {
7909         struct swevent_htable *swhash = &per_cpu(swevent_htable, cpu);
7910         int err = 0;
7911
7912         mutex_lock(&swhash->hlist_mutex);
7913         if (!swevent_hlist_deref(swhash) &&
7914             cpumask_test_cpu(cpu, perf_online_mask)) {
7915                 struct swevent_hlist *hlist;
7916
7917                 hlist = kzalloc(sizeof(*hlist), GFP_KERNEL);
7918                 if (!hlist) {
7919                         err = -ENOMEM;
7920                         goto exit;
7921                 }
7922                 rcu_assign_pointer(swhash->swevent_hlist, hlist);
7923         }
7924         swhash->hlist_refcount++;
7925 exit:
7926         mutex_unlock(&swhash->hlist_mutex);
7927
7928         return err;
7929 }
7930
7931 static int swevent_hlist_get(void)
7932 {
7933         int err, cpu, failed_cpu;
7934
7935         mutex_lock(&pmus_lock);
7936         for_each_possible_cpu(cpu) {
7937                 err = swevent_hlist_get_cpu(cpu);
7938                 if (err) {
7939                         failed_cpu = cpu;
7940                         goto fail;
7941                 }
7942         }
7943         mutex_unlock(&pmus_lock);
7944         return 0;
7945 fail:
7946         for_each_possible_cpu(cpu) {
7947                 if (cpu == failed_cpu)
7948                         break;
7949                 swevent_hlist_put_cpu(cpu);
7950         }
7951         mutex_unlock(&pmus_lock);
7952         return err;
7953 }
7954
7955 struct static_key perf_swevent_enabled[PERF_COUNT_SW_MAX];
7956
7957 static void sw_perf_event_destroy(struct perf_event *event)
7958 {
7959         u64 event_id = event->attr.config;
7960
7961         WARN_ON(event->parent);
7962
7963         static_key_slow_dec(&perf_swevent_enabled[event_id]);
7964         swevent_hlist_put();
7965 }
7966
7967 static int perf_swevent_init(struct perf_event *event)
7968 {
7969         u64 event_id = event->attr.config;
7970
7971         if (event->attr.type != PERF_TYPE_SOFTWARE)
7972                 return -ENOENT;
7973
7974         /*
7975          * no branch sampling for software events
7976          */
7977         if (has_branch_stack(event))
7978                 return -EOPNOTSUPP;
7979
7980         switch (event_id) {
7981         case PERF_COUNT_SW_CPU_CLOCK:
7982         case PERF_COUNT_SW_TASK_CLOCK:
7983                 return -ENOENT;
7984
7985         default:
7986                 break;
7987         }
7988
7989         if (event_id >= PERF_COUNT_SW_MAX)
7990                 return -ENOENT;
7991
7992         if (!event->parent) {
7993                 int err;
7994
7995                 err = swevent_hlist_get();
7996                 if (err)
7997                         return err;
7998
7999                 static_key_slow_inc(&perf_swevent_enabled[event_id]);
8000                 event->destroy = sw_perf_event_destroy;
8001         }
8002
8003         return 0;
8004 }
8005
8006 static struct pmu perf_swevent = {
8007         .task_ctx_nr    = perf_sw_context,
8008
8009         .capabilities   = PERF_PMU_CAP_NO_NMI,
8010
8011         .event_init     = perf_swevent_init,
8012         .add            = perf_swevent_add,
8013         .del            = perf_swevent_del,
8014         .start          = perf_swevent_start,
8015         .stop           = perf_swevent_stop,
8016         .read           = perf_swevent_read,
8017 };
8018
8019 #ifdef CONFIG_EVENT_TRACING
8020
8021 static int perf_tp_filter_match(struct perf_event *event,
8022                                 struct perf_sample_data *data)
8023 {
8024         void *record = data->raw->frag.data;
8025
8026         /* only top level events have filters set */
8027         if (event->parent)
8028                 event = event->parent;
8029
8030         if (likely(!event->filter) || filter_match_preds(event->filter, record))
8031                 return 1;
8032         return 0;
8033 }
8034
8035 static int perf_tp_event_match(struct perf_event *event,
8036                                 struct perf_sample_data *data,
8037                                 struct pt_regs *regs)
8038 {
8039         if (event->hw.state & PERF_HES_STOPPED)
8040                 return 0;
8041         /*
8042          * All tracepoints are from kernel-space.
8043          */
8044         if (event->attr.exclude_kernel)
8045                 return 0;
8046
8047         if (!perf_tp_filter_match(event, data))
8048                 return 0;
8049
8050         return 1;
8051 }
8052
8053 void perf_trace_run_bpf_submit(void *raw_data, int size, int rctx,
8054                                struct trace_event_call *call, u64 count,
8055                                struct pt_regs *regs, struct hlist_head *head,
8056                                struct task_struct *task)
8057 {
8058         struct bpf_prog *prog = call->prog;
8059
8060         if (prog) {
8061                 *(struct pt_regs **)raw_data = regs;
8062                 if (!trace_call_bpf(prog, raw_data) || hlist_empty(head)) {
8063                         perf_swevent_put_recursion_context(rctx);
8064                         return;
8065                 }
8066         }
8067         perf_tp_event(call->event.type, count, raw_data, size, regs, head,
8068                       rctx, task, NULL);
8069 }
8070 EXPORT_SYMBOL_GPL(perf_trace_run_bpf_submit);
8071
8072 void perf_tp_event(u16 event_type, u64 count, void *record, int entry_size,
8073                    struct pt_regs *regs, struct hlist_head *head, int rctx,
8074                    struct task_struct *task, struct perf_event *event)
8075 {
8076         struct perf_sample_data data;
8077
8078         struct perf_raw_record raw = {
8079                 .frag = {
8080                         .size = entry_size,
8081                         .data = record,
8082                 },
8083         };
8084
8085         perf_sample_data_init(&data, 0, 0);
8086         data.raw = &raw;
8087
8088         perf_trace_buf_update(record, event_type);
8089
8090         /* Use the given event instead of the hlist */
8091         if (event) {
8092                 if (perf_tp_event_match(event, &data, regs))
8093                         perf_swevent_event(event, count, &data, regs);
8094         } else {
8095                 hlist_for_each_entry_rcu(event, head, hlist_entry) {
8096                         if (perf_tp_event_match(event, &data, regs))
8097                                 perf_swevent_event(event, count, &data, regs);
8098                 }
8099         }
8100
8101         /*
8102          * If a target task was specified, also iterate its context and
8103          * deliver this event there too.
8104          */
8105         if (task && task != current) {
8106                 struct perf_event_context *ctx;
8107                 struct trace_entry *entry = record;
8108
8109                 rcu_read_lock();
8110                 ctx = rcu_dereference(task->perf_event_ctxp[perf_sw_context]);
8111                 if (!ctx)
8112                         goto unlock;
8113
8114                 list_for_each_entry_rcu(event, &ctx->event_list, event_entry) {
8115                         if (event->cpu != smp_processor_id())
8116                                 continue;
8117                         if (event->attr.type != PERF_TYPE_TRACEPOINT)
8118                                 continue;
8119                         if (event->attr.config != entry->type)
8120                                 continue;
8121                         if (perf_tp_event_match(event, &data, regs))
8122                                 perf_swevent_event(event, count, &data, regs);
8123                 }
8124 unlock:
8125                 rcu_read_unlock();
8126         }
8127
8128         perf_swevent_put_recursion_context(rctx);
8129 }
8130 EXPORT_SYMBOL_GPL(perf_tp_event);
8131
8132 static void tp_perf_event_destroy(struct perf_event *event)
8133 {
8134         perf_trace_destroy(event);
8135 }
8136
8137 static int perf_tp_event_init(struct perf_event *event)
8138 {
8139         int err;
8140
8141         if (event->attr.type != PERF_TYPE_TRACEPOINT)
8142                 return -ENOENT;
8143
8144         /*
8145          * no branch sampling for tracepoint events
8146          */
8147         if (has_branch_stack(event))
8148                 return -EOPNOTSUPP;
8149
8150         err = perf_trace_init(event);
8151         if (err)
8152                 return err;
8153
8154         event->destroy = tp_perf_event_destroy;
8155
8156         return 0;
8157 }
8158
8159 static struct pmu perf_tracepoint = {
8160         .task_ctx_nr    = perf_sw_context,
8161
8162         .event_init     = perf_tp_event_init,
8163         .add            = perf_trace_add,
8164         .del            = perf_trace_del,
8165         .start          = perf_swevent_start,
8166         .stop           = perf_swevent_stop,
8167         .read           = perf_swevent_read,
8168 };
8169
8170 static inline void perf_tp_register(void)
8171 {
8172         perf_pmu_register(&perf_tracepoint, "tracepoint", PERF_TYPE_TRACEPOINT);
8173 }
8174
8175 static void perf_event_free_filter(struct perf_event *event)
8176 {
8177         ftrace_profile_free_filter(event);
8178 }
8179
8180 #ifdef CONFIG_BPF_SYSCALL
8181 static void bpf_overflow_handler(struct perf_event *event,
8182                                  struct perf_sample_data *data,
8183                                  struct pt_regs *regs)
8184 {
8185         struct bpf_perf_event_data_kern ctx = {
8186                 .data = data,
8187                 .regs = regs,
8188         };
8189         int ret = 0;
8190
8191         preempt_disable();
8192         if (unlikely(__this_cpu_inc_return(bpf_prog_active) != 1))
8193                 goto out;
8194         rcu_read_lock();
8195         ret = BPF_PROG_RUN(event->prog, &ctx);
8196         rcu_read_unlock();
8197 out:
8198         __this_cpu_dec(bpf_prog_active);
8199         preempt_enable();
8200         if (!ret)
8201                 return;
8202
8203         event->orig_overflow_handler(event, data, regs);
8204 }
8205
8206 static int perf_event_set_bpf_handler(struct perf_event *event, u32 prog_fd)
8207 {
8208         struct bpf_prog *prog;
8209
8210         if (event->overflow_handler_context)
8211                 /* hw breakpoint or kernel counter */
8212                 return -EINVAL;
8213
8214         if (event->prog)
8215                 return -EEXIST;
8216
8217         prog = bpf_prog_get_type(prog_fd, BPF_PROG_TYPE_PERF_EVENT);
8218         if (IS_ERR(prog))
8219                 return PTR_ERR(prog);
8220
8221         event->prog = prog;
8222         event->orig_overflow_handler = READ_ONCE(event->overflow_handler);
8223         WRITE_ONCE(event->overflow_handler, bpf_overflow_handler);
8224         return 0;
8225 }
8226
8227 static void perf_event_free_bpf_handler(struct perf_event *event)
8228 {
8229         struct bpf_prog *prog = event->prog;
8230
8231         if (!prog)
8232                 return;
8233
8234         WRITE_ONCE(event->overflow_handler, event->orig_overflow_handler);
8235         event->prog = NULL;
8236         bpf_prog_put(prog);
8237 }
8238 #else
8239 static int perf_event_set_bpf_handler(struct perf_event *event, u32 prog_fd)
8240 {
8241         return -EOPNOTSUPP;
8242 }
8243 static void perf_event_free_bpf_handler(struct perf_event *event)
8244 {
8245 }
8246 #endif
8247
8248 static int perf_event_set_bpf_prog(struct perf_event *event, u32 prog_fd)
8249 {
8250         bool is_kprobe, is_tracepoint, is_syscall_tp;
8251         struct bpf_prog *prog;
8252
8253         if (event->attr.type != PERF_TYPE_TRACEPOINT)
8254                 return perf_event_set_bpf_handler(event, prog_fd);
8255
8256         if (event->tp_event->prog)
8257                 return -EEXIST;
8258
8259         is_kprobe = event->tp_event->flags & TRACE_EVENT_FL_UKPROBE;
8260         is_tracepoint = event->tp_event->flags & TRACE_EVENT_FL_TRACEPOINT;
8261         is_syscall_tp = is_syscall_trace_event(event->tp_event);
8262         if (!is_kprobe && !is_tracepoint && !is_syscall_tp)
8263                 /* bpf programs can only be attached to u/kprobe or tracepoint */
8264                 return -EINVAL;
8265
8266         prog = bpf_prog_get(prog_fd);
8267         if (IS_ERR(prog))
8268                 return PTR_ERR(prog);
8269
8270         if ((is_kprobe && prog->type != BPF_PROG_TYPE_KPROBE) ||
8271             (is_tracepoint && prog->type != BPF_PROG_TYPE_TRACEPOINT) ||
8272             (is_syscall_tp && prog->type != BPF_PROG_TYPE_TRACEPOINT)) {
8273                 /* valid fd, but invalid bpf program type */
8274                 bpf_prog_put(prog);
8275                 return -EINVAL;
8276         }
8277
8278         if (is_tracepoint || is_syscall_tp) {
8279                 int off = trace_event_get_offsets(event->tp_event);
8280
8281                 if (prog->aux->max_ctx_offset > off) {
8282                         bpf_prog_put(prog);
8283                         return -EACCES;
8284                 }
8285         }
8286         event->tp_event->prog = prog;
8287         event->tp_event->bpf_prog_owner = event;
8288
8289         return 0;
8290 }
8291
8292 static void perf_event_free_bpf_prog(struct perf_event *event)
8293 {
8294         struct bpf_prog *prog;
8295
8296         perf_event_free_bpf_handler(event);
8297
8298         if (!event->tp_event)
8299                 return;
8300
8301         prog = event->tp_event->prog;
8302         if (prog && event->tp_event->bpf_prog_owner == event) {
8303                 event->tp_event->prog = NULL;
8304                 bpf_prog_put(prog);
8305         }
8306 }
8307
8308 #else
8309
8310 static inline void perf_tp_register(void)
8311 {
8312 }
8313
8314 static void perf_event_free_filter(struct perf_event *event)
8315 {
8316 }
8317
8318 static int perf_event_set_bpf_prog(struct perf_event *event, u32 prog_fd)
8319 {
8320         return -ENOENT;
8321 }
8322
8323 static void perf_event_free_bpf_prog(struct perf_event *event)
8324 {
8325 }
8326 #endif /* CONFIG_EVENT_TRACING */
8327
8328 #ifdef CONFIG_HAVE_HW_BREAKPOINT
8329 void perf_bp_event(struct perf_event *bp, void *data)
8330 {
8331         struct perf_sample_data sample;
8332         struct pt_regs *regs = data;
8333
8334         perf_sample_data_init(&sample, bp->attr.bp_addr, 0);
8335
8336         if (!bp->hw.state && !perf_exclude_event(bp, regs))
8337                 perf_swevent_event(bp, 1, &sample, regs);
8338 }
8339 #endif
8340
8341 /*
8342  * Allocate a new address filter
8343  */
8344 static struct perf_addr_filter *
8345 perf_addr_filter_new(struct perf_event *event, struct list_head *filters)
8346 {
8347         int node = cpu_to_node(event->cpu == -1 ? 0 : event->cpu);
8348         struct perf_addr_filter *filter;
8349
8350         filter = kzalloc_node(sizeof(*filter), GFP_KERNEL, node);
8351         if (!filter)
8352                 return NULL;
8353
8354         INIT_LIST_HEAD(&filter->entry);
8355         list_add_tail(&filter->entry, filters);
8356
8357         return filter;
8358 }
8359
8360 static void free_filters_list(struct list_head *filters)
8361 {
8362         struct perf_addr_filter *filter, *iter;
8363
8364         list_for_each_entry_safe(filter, iter, filters, entry) {
8365                 path_put(&filter->path);
8366                 list_del(&filter->entry);
8367                 kfree(filter);
8368         }
8369 }
8370
8371 /*
8372  * Free existing address filters and optionally install new ones
8373  */
8374 static void perf_addr_filters_splice(struct perf_event *event,
8375                                      struct list_head *head)
8376 {
8377         unsigned long flags;
8378         LIST_HEAD(list);
8379
8380         if (!has_addr_filter(event))
8381                 return;
8382
8383         /* don't bother with children, they don't have their own filters */
8384         if (event->parent)
8385                 return;
8386
8387         raw_spin_lock_irqsave(&event->addr_filters.lock, flags);
8388
8389         list_splice_init(&event->addr_filters.list, &list);
8390         if (head)
8391                 list_splice(head, &event->addr_filters.list);
8392
8393         raw_spin_unlock_irqrestore(&event->addr_filters.lock, flags);
8394
8395         free_filters_list(&list);
8396 }
8397
8398 /*
8399  * Scan through mm's vmas and see if one of them matches the @filter;
8400  * if so, return its start address so the filter's range can be adjusted.
8401  * Called with mm::mmap_sem down for reading.
8402  */
8403 static unsigned long perf_addr_filter_apply(struct perf_addr_filter *filter,
8404                                             struct mm_struct *mm)
8405 {
8406         struct vm_area_struct *vma;
8407
8408         for (vma = mm->mmap; vma; vma = vma->vm_next) {
8409                 struct file *file = vma->vm_file;
8410                 unsigned long off = vma->vm_pgoff << PAGE_SHIFT;
8411                 unsigned long vma_size = vma->vm_end - vma->vm_start;
8412
8413                 if (!file)
8414                         continue;
8415
8416                 if (!perf_addr_filter_match(filter, file, off, vma_size))
8417                         continue;
8418
8419                 return vma->vm_start;
8420         }
8421
8422         return 0;
8423 }
8424
8425 /*
8426  * Update event's address range filters based on the
8427  * task's existing mappings, if any.
8428  */
8429 static void perf_event_addr_filters_apply(struct perf_event *event)
8430 {
8431         struct perf_addr_filters_head *ifh = perf_event_addr_filters(event);
8432         struct task_struct *task = READ_ONCE(event->ctx->task);
8433         struct perf_addr_filter *filter;
8434         struct mm_struct *mm = NULL;
8435         unsigned int count = 0;
8436         unsigned long flags;
8437
8438         /*
8439          * We may observe TASK_TOMBSTONE, which means that the event tear-down
8440          * will stop on the parent's child_mutex that our caller is also holding
8441          */
8442         if (task == TASK_TOMBSTONE)
8443                 return;
8444
8445         if (!ifh->nr_file_filters)
8446                 return;
8447
8448         mm = get_task_mm(task);
8449         if (!mm)
8450                 goto restart;
8451
8452         down_read(&mm->mmap_sem);
8453
8454         raw_spin_lock_irqsave(&ifh->lock, flags);
8455         list_for_each_entry(filter, &ifh->list, entry) {
8456                 event->addr_filters_offs[count] = 0;
8457
8458                 /*
8459                  * Adjust base offset if the filter is associated to a binary
8460                  * that needs to be mapped:
8461                  */
8462                 if (filter->path.dentry)
8463                         event->addr_filters_offs[count] =
8464                                 perf_addr_filter_apply(filter, mm);
8465
8466                 count++;
8467         }
8468
8469         event->addr_filters_gen++;
8470         raw_spin_unlock_irqrestore(&ifh->lock, flags);
8471
8472         up_read(&mm->mmap_sem);
8473
8474         mmput(mm);
8475
8476 restart:
8477         perf_event_stop(event, 1);
8478 }
8479
8480 /*
8481  * Address range filtering: limiting the data to certain
8482  * instruction address ranges. Filters are ioctl()ed to us from
8483  * userspace as ASCII strings.
8484  *
8485  * Filter string format:
8486  *
8487  * ACTION RANGE_SPEC
8488  * where ACTION is one of the following:
8489  *  * "filter": limit the trace to this region
8490  *  * "start": start tracing from this address
8491  *  * "stop": stop tracing at this address/region;
8492  * RANGE_SPEC is
8493  *  * for kernel addresses: <start address>[/<size>]
8494  *  * for object files:     <start address>[/<size>]@</path/to/object/file>
8495  *
8496  * if <size> is not specified, the range is treated as a single address.
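 *
 * For example (the addresses, sizes and object path below are purely
 * illustrative):
 *
 *     filter 0x1000/0x2000@/path/to/object/file
 *     stop 0xffffffff81000000/0x4000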
8497  */
8498 enum {
8499         IF_ACT_NONE = -1,
8500         IF_ACT_FILTER,
8501         IF_ACT_START,
8502         IF_ACT_STOP,
8503         IF_SRC_FILE,
8504         IF_SRC_KERNEL,
8505         IF_SRC_FILEADDR,
8506         IF_SRC_KERNELADDR,
8507 };
8508
8509 enum {
8510         IF_STATE_ACTION = 0,
8511         IF_STATE_SOURCE,
8512         IF_STATE_END,
8513 };
8514
8515 static const match_table_t if_tokens = {
8516         { IF_ACT_FILTER,        "filter" },
8517         { IF_ACT_START,         "start" },
8518         { IF_ACT_STOP,          "stop" },
8519         { IF_SRC_FILE,          "%u/%u@%s" },
8520         { IF_SRC_KERNEL,        "%u/%u" },
8521         { IF_SRC_FILEADDR,      "%u@%s" },
8522         { IF_SRC_KERNELADDR,    "%u" },
8523         { IF_ACT_NONE,          NULL },
8524 };
8525
8526 /*
8527  * Address filter string parser
8528  */
8529 static int
8530 perf_event_parse_addr_filter(struct perf_event *event, char *fstr,
8531                              struct list_head *filters)
8532 {
8533         struct perf_addr_filter *filter = NULL;
8534         char *start, *orig, *filename = NULL;
8535         substring_t args[MAX_OPT_ARGS];
8536         int state = IF_STATE_ACTION, token;
8537         unsigned int kernel = 0;
8538         int ret = -EINVAL;
8539
8540         orig = fstr = kstrdup(fstr, GFP_KERNEL);
8541         if (!fstr)
8542                 return -ENOMEM;
8543
8544         while ((start = strsep(&fstr, " ,\n")) != NULL) {
8545                 ret = -EINVAL;
8546
8547                 if (!*start)
8548                         continue;
8549
8550                 /* filter definition begins */
8551                 if (state == IF_STATE_ACTION) {
8552                         filter = perf_addr_filter_new(event, filters);
8553                         if (!filter)
8554                                 goto fail;
8555                 }
8556
8557                 token = match_token(start, if_tokens, args);
8558                 switch (token) {
8559                 case IF_ACT_FILTER:
8560                 case IF_ACT_START:
8561                         filter->filter = 1;
8562
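                        /* fall through */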
8563                 case IF_ACT_STOP:
8564                         if (state != IF_STATE_ACTION)
8565                                 goto fail;
8566
8567                         state = IF_STATE_SOURCE;
8568                         break;
8569
8570                 case IF_SRC_KERNELADDR:
8571                 case IF_SRC_KERNEL:
8572                         kernel = 1;
8573
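                        /* fall through */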
8574                 case IF_SRC_FILEADDR:
8575                 case IF_SRC_FILE:
8576                         if (state != IF_STATE_SOURCE)
8577                                 goto fail;
8578
8579                         if (token == IF_SRC_FILE || token == IF_SRC_KERNEL)
8580                                 filter->range = 1;
8581
8582                         *args[0].to = 0;
8583                         ret = kstrtoul(args[0].from, 0, &filter->offset);
8584                         if (ret)
8585                                 goto fail;
8586
8587                         if (filter->range) {
8588                                 *args[1].to = 0;
8589                                 ret = kstrtoul(args[1].from, 0, &filter->size);
8590                                 if (ret)
8591                                         goto fail;
8592                         }
8593
8594                         if (token == IF_SRC_FILE || token == IF_SRC_FILEADDR) {
8595                                 int fpos = filter->range ? 2 : 1;
8596
8597                                 kfree(filename);
8598                                 filename = match_strdup(&args[fpos]);
8599                                 if (!filename) {
8600                                         ret = -ENOMEM;
8601                                         goto fail;
8602                                 }
8603                         }
8604
8605                         state = IF_STATE_END;
8606                         break;
8607
8608                 default:
8609                         goto fail;
8610                 }
8611
8612                 /*
8613                  * Filter definition is fully parsed, validate and install it.
8614                  * Make sure that it doesn't contradict itself or the event's
8615                  * attribute.
8616                  */
8617                 if (state == IF_STATE_END) {
8618                         ret = -EINVAL;
8619                         if (kernel && event->attr.exclude_kernel)
8620                                 goto fail;
8621
8622                         if (!kernel) {
8623                                 if (!filename)
8624                                         goto fail;
8625
8626                                 /*
8627                                  * For now, we only support file-based filters
8628                                  * in per-task events; doing so for CPU-wide
8629                                  * events requires additional context switching
8630                                  * trickery, since the same object code will be
8631                                  * mapped at different virtual addresses in
8632                                  * different processes.
8633                                  */
8634                                 ret = -EOPNOTSUPP;
8635                                 if (!event->ctx->task)
8636                                         goto fail;
8637
8638                                 /* look up the path and grab its inode */
8639                                 ret = kern_path(filename, LOOKUP_FOLLOW,
8640                                                 &filter->path);
8641                                 if (ret)
8642                                         goto fail;
8643
8644                                 ret = -EINVAL;
8645                                 if (!filter->path.dentry ||
8646                                     !S_ISREG(d_inode(filter->path.dentry)
8647                                              ->i_mode))
8648                                         goto fail;
8649
8650                                 event->addr_filters.nr_file_filters++;
8651                         }
8652
8653                         /* ready to consume more filters */
8654                         kfree(filename);
8655                         filename = NULL;
8656                         state = IF_STATE_ACTION;
8657                         filter = NULL;
8658                         kernel = 0;
8659                 }
8660         }
8661
8662         if (state != IF_STATE_ACTION)
8663                 goto fail;
8664
8665         kfree(filename);
8666         kfree(orig);
8667
8668         return 0;
8669
8670 fail:
8671         kfree(filename);
8672         free_filters_list(filters);
8673         kfree(orig);
8674
8675         return ret;
8676 }
8677
8678 static int
8679 perf_event_set_addr_filter(struct perf_event *event, char *filter_str)
8680 {
8681         LIST_HEAD(filters);
8682         int ret;
8683
8684         /*
8685          * Since this is called in perf_ioctl() path, we're already holding
8686          * ctx::mutex.
8687          */
8688         lockdep_assert_held(&event->ctx->mutex);
8689
8690         if (WARN_ON_ONCE(event->parent))
8691                 return -EINVAL;
8692
8693         ret = perf_event_parse_addr_filter(event, filter_str, &filters);
8694         if (ret)
8695                 goto fail_clear_files;
8696
8697         ret = event->pmu->addr_filters_validate(&filters);
8698         if (ret)
8699                 goto fail_free_filters;
8700
8701         /* remove existing filters, if any */
8702         perf_addr_filters_splice(event, &filters);
8703
8704         /* install new filters */
8705         perf_event_for_each_child(event, perf_event_addr_filters_apply);
8706
8707         return ret;
8708
8709 fail_free_filters:
8710         free_filters_list(&filters);
8711
8712 fail_clear_files:
8713         event->addr_filters.nr_file_filters = 0;
8714
8715         return ret;
8716 }
8717
8718 static int perf_event_set_filter(struct perf_event *event, void __user *arg)
8719 {
8720         char *filter_str;
8721         int ret = -EINVAL;
8722
8723         if ((event->attr.type != PERF_TYPE_TRACEPOINT ||
8724             !IS_ENABLED(CONFIG_EVENT_TRACING)) &&
8725             !has_addr_filter(event))
8726                 return -EINVAL;
8727
8728         filter_str = strndup_user(arg, PAGE_SIZE);
8729         if (IS_ERR(filter_str))
8730                 return PTR_ERR(filter_str);
8731
8732         if (IS_ENABLED(CONFIG_EVENT_TRACING) &&
8733             event->attr.type == PERF_TYPE_TRACEPOINT)
8734                 ret = ftrace_profile_set_filter(event, event->attr.config,
8735                                                 filter_str);
8736         else if (has_addr_filter(event))
8737                 ret = perf_event_set_addr_filter(event, filter_str);
8738
8739         kfree(filter_str);
8740         return ret;
8741 }
8742
8743 /*
8744  * hrtimer based swevent callback
8745  */
8746
8747 static enum hrtimer_restart perf_swevent_hrtimer(struct hrtimer *hrtimer)
8748 {
8749         enum hrtimer_restart ret = HRTIMER_RESTART;
8750         struct perf_sample_data data;
8751         struct pt_regs *regs;
8752         struct perf_event *event;
8753         u64 period;
8754
8755         event = container_of(hrtimer, struct perf_event, hw.hrtimer);
8756
8757         if (event->state != PERF_EVENT_STATE_ACTIVE)
8758                 return HRTIMER_NORESTART;
8759
8760         event->pmu->read(event);
8761
8762         perf_sample_data_init(&data, 0, event->hw.last_period);
8763         regs = get_irq_regs();
8764
8765         if (regs && !perf_exclude_event(event, regs)) {
8766                 if (!(event->attr.exclude_idle && is_idle_task(current)))
8767                         if (__perf_event_overflow(event, 1, &data, regs))
8768                                 ret = HRTIMER_NORESTART;
8769         }
8770
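        /* Never re-arm the timer more often than every 10 usec (10000 ns). */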
8771         period = max_t(u64, 10000, event->hw.sample_period);
8772         hrtimer_forward_now(hrtimer, ns_to_ktime(period));
8773
8774         return ret;
8775 }
8776
8777 static void perf_swevent_start_hrtimer(struct perf_event *event)
8778 {
8779         struct hw_perf_event *hwc = &event->hw;
8780         s64 period;
8781
8782         if (!is_sampling_event(event))
8783                 return;
8784
8785         period = local64_read(&hwc->period_left);
8786         if (period) {
8787                 if (period < 0)
8788                         period = 10000;
8789
8790                 local64_set(&hwc->period_left, 0);
8791         } else {
8792                 period = max_t(u64, 10000, hwc->sample_period);
8793         }
8794         hrtimer_start(&hwc->hrtimer, ns_to_ktime(period),
8795                       HRTIMER_MODE_REL_PINNED);
8796 }
8797
8798 static void perf_swevent_cancel_hrtimer(struct perf_event *event)
8799 {
8800         struct hw_perf_event *hwc = &event->hw;
8801
8802         if (is_sampling_event(event)) {
8803                 ktime_t remaining = hrtimer_get_remaining(&hwc->hrtimer);
8804                 local64_set(&hwc->period_left, ktime_to_ns(remaining));
8805
8806                 hrtimer_cancel(&hwc->hrtimer);
8807         }
8808 }
8809
8810 static void perf_swevent_init_hrtimer(struct perf_event *event)
8811 {
8812         struct hw_perf_event *hwc = &event->hw;
8813
8814         if (!is_sampling_event(event))
8815                 return;
8816
8817         hrtimer_init(&hwc->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
8818         hwc->hrtimer.function = perf_swevent_hrtimer;
8819
8820         /*
8821          * Since hrtimers have a fixed rate, we can do a static freq->period
8822          * mapping and avoid the whole period adjust feedback stuff.
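         *
         * For example, sample_freq == 1000 (Hz) yields a fixed period of
         * NSEC_PER_SEC / 1000 == 1000000 ns (1 ms) between samples.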
8823          */
8824         if (event->attr.freq) {
8825                 long freq = event->attr.sample_freq;
8826
8827                 event->attr.sample_period = NSEC_PER_SEC / freq;
8828                 hwc->sample_period = event->attr.sample_period;
8829                 local64_set(&hwc->period_left, hwc->sample_period);
8830                 hwc->last_period = hwc->sample_period;
8831                 event->attr.freq = 0;
8832         }
8833 }
8834
8835 /*
8836  * Software event: cpu wall time clock
8837  */
8838
8839 static void cpu_clock_event_update(struct perf_event *event)
8840 {
8841         s64 prev;
8842         u64 now;
8843
8844         now = local_clock();
8845         prev = local64_xchg(&event->hw.prev_count, now);
8846         local64_add(now - prev, &event->count);
8847 }
8848
8849 static void cpu_clock_event_start(struct perf_event *event, int flags)
8850 {
8851         local64_set(&event->hw.prev_count, local_clock());
8852         perf_swevent_start_hrtimer(event);
8853 }
8854
8855 static void cpu_clock_event_stop(struct perf_event *event, int flags)
8856 {
8857         perf_swevent_cancel_hrtimer(event);
8858         cpu_clock_event_update(event);
8859 }
8860
8861 static int cpu_clock_event_add(struct perf_event *event, int flags)
8862 {
8863         if (flags & PERF_EF_START)
8864                 cpu_clock_event_start(event, flags);
8865         perf_event_update_userpage(event);
8866
8867         return 0;
8868 }
8869
8870 static void cpu_clock_event_del(struct perf_event *event, int flags)
8871 {
8872         cpu_clock_event_stop(event, flags);
8873 }
8874
8875 static void cpu_clock_event_read(struct perf_event *event)
8876 {
8877         cpu_clock_event_update(event);
8878 }
8879
8880 static int cpu_clock_event_init(struct perf_event *event)
8881 {
8882         if (event->attr.type != PERF_TYPE_SOFTWARE)
8883                 return -ENOENT;
8884
8885         if (event->attr.config != PERF_COUNT_SW_CPU_CLOCK)
8886                 return -ENOENT;
8887
8888         /*
8889          * no branch sampling for software events
8890          */
8891         if (has_branch_stack(event))
8892                 return -EOPNOTSUPP;
8893
8894         perf_swevent_init_hrtimer(event);
8895
8896         return 0;
8897 }
8898
8899 static struct pmu perf_cpu_clock = {
8900         .task_ctx_nr    = perf_sw_context,
8901
8902         .capabilities   = PERF_PMU_CAP_NO_NMI,
8903
8904         .event_init     = cpu_clock_event_init,
8905         .add            = cpu_clock_event_add,
8906         .del            = cpu_clock_event_del,
8907         .start          = cpu_clock_event_start,
8908         .stop           = cpu_clock_event_stop,
8909         .read           = cpu_clock_event_read,
8910 };
8911
8912 /*
8913  * Software event: task time clock
8914  */
8915
8916 static void task_clock_event_update(struct perf_event *event, u64 now)
8917 {
8918         u64 prev;
8919         s64 delta;
8920
8921         prev = local64_xchg(&event->hw.prev_count, now);
8922         delta = now - prev;
8923         local64_add(delta, &event->count);
8924 }
8925
8926 static void task_clock_event_start(struct perf_event *event, int flags)
8927 {
8928         local64_set(&event->hw.prev_count, event->ctx->time);
8929         perf_swevent_start_hrtimer(event);
8930 }
8931
8932 static void task_clock_event_stop(struct perf_event *event, int flags)
8933 {
8934         perf_swevent_cancel_hrtimer(event);
8935         task_clock_event_update(event, event->ctx->time);
8936 }
8937
8938 static int task_clock_event_add(struct perf_event *event, int flags)
8939 {
8940         if (flags & PERF_EF_START)
8941                 task_clock_event_start(event, flags);
8942         perf_event_update_userpage(event);
8943
8944         return 0;
8945 }
8946
8947 static void task_clock_event_del(struct perf_event *event, int flags)
8948 {
8949         task_clock_event_stop(event, PERF_EF_UPDATE);
8950 }
8951
8952 static void task_clock_event_read(struct perf_event *event)
8953 {
8954         u64 now = perf_clock();
8955         u64 delta = now - event->ctx->timestamp;
8956         u64 time = event->ctx->time + delta;
8957
8958         task_clock_event_update(event, time);
8959 }
8960
8961 static int task_clock_event_init(struct perf_event *event)
8962 {
8963         if (event->attr.type != PERF_TYPE_SOFTWARE)
8964                 return -ENOENT;
8965
8966         if (event->attr.config != PERF_COUNT_SW_TASK_CLOCK)
8967                 return -ENOENT;
8968
8969         /*
8970          * no branch sampling for software events
8971          */
8972         if (has_branch_stack(event))
8973                 return -EOPNOTSUPP;
8974
8975         perf_swevent_init_hrtimer(event);
8976
8977         return 0;
8978 }
8979
8980 static struct pmu perf_task_clock = {
8981         .task_ctx_nr    = perf_sw_context,
8982
8983         .capabilities   = PERF_PMU_CAP_NO_NMI,
8984
8985         .event_init     = task_clock_event_init,
8986         .add            = task_clock_event_add,
8987         .del            = task_clock_event_del,
8988         .start          = task_clock_event_start,
8989         .stop           = task_clock_event_stop,
8990         .read           = task_clock_event_read,
8991 };
8992
8993 static void perf_pmu_nop_void(struct pmu *pmu)
8994 {
8995 }
8996
8997 static void perf_pmu_nop_txn(struct pmu *pmu, unsigned int flags)
8998 {
8999 }
9000
9001 static int perf_pmu_nop_int(struct pmu *pmu)
9002 {
9003         return 0;
9004 }
9005
9006 static int perf_event_nop_int(struct perf_event *event, u64 value)
9007 {
9008         return 0;
9009 }
9010
9011 static DEFINE_PER_CPU(unsigned int, nop_txn_flags);
9012
9013 static void perf_pmu_start_txn(struct pmu *pmu, unsigned int flags)
9014 {
9015         __this_cpu_write(nop_txn_flags, flags);
9016
9017         if (flags & ~PERF_PMU_TXN_ADD)
9018                 return;
9019
9020         perf_pmu_disable(pmu);
9021 }
9022
9023 static int perf_pmu_commit_txn(struct pmu *pmu)
9024 {
9025         unsigned int flags = __this_cpu_read(nop_txn_flags);
9026
9027         __this_cpu_write(nop_txn_flags, 0);
9028
9029         if (flags & ~PERF_PMU_TXN_ADD)
9030                 return 0;
9031
9032         perf_pmu_enable(pmu);
9033         return 0;
9034 }
9035
9036 static void perf_pmu_cancel_txn(struct pmu *pmu)
9037 {
9038         unsigned int flags =  __this_cpu_read(nop_txn_flags);
9039
9040         __this_cpu_write(nop_txn_flags, 0);
9041
9042         if (flags & ~PERF_PMU_TXN_ADD)
9043                 return;
9044
9045         perf_pmu_enable(pmu);
9046 }
9047
9048 static int perf_event_idx_default(struct perf_event *event)
9049 {
9050         return 0;
9051 }
9052
9053 /*
9054  * Ensures all contexts with the same task_ctx_nr have the same
9055  * pmu_cpu_context too.
9056  */
9057 static struct perf_cpu_context __percpu *find_pmu_context(int ctxn)
9058 {
9059         struct pmu *pmu;
9060
9061         if (ctxn < 0)
9062                 return NULL;
9063
9064         list_for_each_entry(pmu, &pmus, entry) {
9065                 if (pmu->task_ctx_nr == ctxn)
9066                         return pmu->pmu_cpu_context;
9067         }
9068
9069         return NULL;
9070 }
9071
9072 static void free_pmu_context(struct pmu *pmu)
9073 {
9074         /*
9075          * Static contexts such as perf_sw_context have a global lifetime
9076          * and may be shared between different PMUs. Avoid freeing them
9077          * when a single PMU is going away.
9078          */
9079         if (pmu->task_ctx_nr > perf_invalid_context)
9080                 return;
9081
9082         free_percpu(pmu->pmu_cpu_context);
9083 }
9084
9085 /*
9086  * Let userspace know that this PMU supports address range filtering:
9087  */
9088 static ssize_t nr_addr_filters_show(struct device *dev,
9089                                     struct device_attribute *attr,
9090                                     char *page)
9091 {
9092         struct pmu *pmu = dev_get_drvdata(dev);
9093
9094         return snprintf(page, PAGE_SIZE - 1, "%d\n", pmu->nr_addr_filters);
9095 }
9096 DEVICE_ATTR_RO(nr_addr_filters);
9097
9098 static struct idr pmu_idr;
9099
9100 static ssize_t
9101 type_show(struct device *dev, struct device_attribute *attr, char *page)
9102 {
9103         struct pmu *pmu = dev_get_drvdata(dev);
9104
9105         return snprintf(page, PAGE_SIZE-1, "%d\n", pmu->type);
9106 }
9107 static DEVICE_ATTR_RO(type);
9108
9109 static ssize_t
9110 perf_event_mux_interval_ms_show(struct device *dev,
9111                                 struct device_attribute *attr,
9112                                 char *page)
9113 {
9114         struct pmu *pmu = dev_get_drvdata(dev);
9115
9116         return snprintf(page, PAGE_SIZE-1, "%d\n", pmu->hrtimer_interval_ms);
9117 }
9118
9119 static DEFINE_MUTEX(mux_interval_mutex);
9120
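/*
 * The interval is exposed through sysfs on the "event_source" bus and can
 * be tuned from userspace, e.g. (the PMU name "cpu" is only an example):
 *
 *   echo 2 > /sys/bus/event_source/devices/cpu/perf_event_mux_interval_ms
 */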
9121 static ssize_t
9122 perf_event_mux_interval_ms_store(struct device *dev,
9123                                  struct device_attribute *attr,
9124                                  const char *buf, size_t count)
9125 {
9126         struct pmu *pmu = dev_get_drvdata(dev);
9127         int timer, cpu, ret;
9128
9129         ret = kstrtoint(buf, 0, &timer);
9130         if (ret)
9131                 return ret;
9132
9133         if (timer < 1)
9134                 return -EINVAL;
9135
9136         /* same value, nothing to do */
9137         if (timer == pmu->hrtimer_interval_ms)
9138                 return count;
9139
9140         mutex_lock(&mux_interval_mutex);
9141         pmu->hrtimer_interval_ms = timer;
9142
9143         /* update all cpuctx for this PMU */
9144         cpus_read_lock();
9145         for_each_online_cpu(cpu) {
9146                 struct perf_cpu_context *cpuctx;
9147                 cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu);
9148                 cpuctx->hrtimer_interval = ns_to_ktime(NSEC_PER_MSEC * timer);
9149
9150                 cpu_function_call(cpu, perf_mux_hrtimer_restart_ipi, cpuctx);
9151         }
9152         cpus_read_unlock();
9153         mutex_unlock(&mux_interval_mutex);
9154
9155         return count;
9156 }
9157 static DEVICE_ATTR_RW(perf_event_mux_interval_ms);
9158
9159 static struct attribute *pmu_dev_attrs[] = {
9160         &dev_attr_type.attr,
9161         &dev_attr_perf_event_mux_interval_ms.attr,
9162         NULL,
9163 };
9164 ATTRIBUTE_GROUPS(pmu_dev);
9165
9166 static int pmu_bus_running;
9167 static struct bus_type pmu_bus = {
9168         .name           = "event_source",
9169         .dev_groups     = pmu_dev_groups,
9170 };
9171
9172 static void pmu_dev_release(struct device *dev)
9173 {
9174         kfree(dev);
9175 }
9176
9177 static int pmu_dev_alloc(struct pmu *pmu)
9178 {
9179         int ret = -ENOMEM;
9180
9181         pmu->dev = kzalloc(sizeof(struct device), GFP_KERNEL);
9182         if (!pmu->dev)
9183                 goto out;
9184
9185         pmu->dev->groups = pmu->attr_groups;
9186         device_initialize(pmu->dev);
9187
9188         dev_set_drvdata(pmu->dev, pmu);
9189         pmu->dev->bus = &pmu_bus;
9190         pmu->dev->release = pmu_dev_release;
9191
9192         ret = dev_set_name(pmu->dev, "%s", pmu->name);
9193         if (ret)
9194                 goto free_dev;
9195
9196         ret = device_add(pmu->dev);
9197         if (ret)
9198                 goto free_dev;
9199
9200         /* For PMUs with address filters, throw in an extra attribute: */
9201         if (pmu->nr_addr_filters)
9202                 ret = device_create_file(pmu->dev, &dev_attr_nr_addr_filters);
9203
9204         if (ret)
9205                 goto del_dev;
9206
9207 out:
9208         return ret;
9209
9210 del_dev:
9211         device_del(pmu->dev);
9212
9213 free_dev:
9214         put_device(pmu->dev);
9215         goto out;
9216 }
9217
9218 static struct lock_class_key cpuctx_mutex;
9219 static struct lock_class_key cpuctx_lock;
9220
9221 int perf_pmu_register(struct pmu *pmu, const char *name, int type)
9222 {
9223         int cpu, ret;
9224
9225         mutex_lock(&pmus_lock);
9226         ret = -ENOMEM;
9227         pmu->pmu_disable_count = alloc_percpu(int);
9228         if (!pmu->pmu_disable_count)
9229                 goto unlock;
9230
9231         pmu->type = -1;
9232         if (!name)
9233                 goto skip_type;
9234         pmu->name = name;
9235
9236         if (type < 0) {
9237                 type = idr_alloc(&pmu_idr, pmu, PERF_TYPE_MAX, 0, GFP_KERNEL);
9238                 if (type < 0) {
9239                         ret = type;
9240                         goto free_pdc;
9241                 }
9242         }
9243         pmu->type = type;
9244
9245         if (pmu_bus_running) {
9246                 ret = pmu_dev_alloc(pmu);
9247                 if (ret)
9248                         goto free_idr;
9249         }
9250
9251 skip_type:
9252         if (pmu->task_ctx_nr == perf_hw_context) {
9253                 static int hw_context_taken = 0;
9254
9255                 /*
9256                  * Other than systems with heterogeneous CPUs, it never makes
9257                  * sense for two PMUs to share perf_hw_context. PMUs which are
9258                  * uncore must use perf_invalid_context.
9259                  */
9260                 if (WARN_ON_ONCE(hw_context_taken &&
9261                     !(pmu->capabilities & PERF_PMU_CAP_HETEROGENEOUS_CPUS)))
9262                         pmu->task_ctx_nr = perf_invalid_context;
9263
9264                 hw_context_taken = 1;
9265         }
9266
9267         pmu->pmu_cpu_context = find_pmu_context(pmu->task_ctx_nr);
9268         if (pmu->pmu_cpu_context)
9269                 goto got_cpu_context;
9270
9271         ret = -ENOMEM;
9272         pmu->pmu_cpu_context = alloc_percpu(struct perf_cpu_context);
9273         if (!pmu->pmu_cpu_context)
9274                 goto free_dev;
9275
9276         for_each_possible_cpu(cpu) {
9277                 struct perf_cpu_context *cpuctx;
9278
9279                 cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu);
9280                 __perf_event_init_context(&cpuctx->ctx);
9281                 lockdep_set_class(&cpuctx->ctx.mutex, &cpuctx_mutex);
9282                 lockdep_set_class(&cpuctx->ctx.lock, &cpuctx_lock);
9283                 cpuctx->ctx.pmu = pmu;
9284                 cpuctx->online = cpumask_test_cpu(cpu, perf_online_mask);
9285
9286                 __perf_mux_hrtimer_init(cpuctx, cpu);
9287         }
9288
9289 got_cpu_context:
9290         if (!pmu->start_txn) {
9291                 if (pmu->pmu_enable) {
9292                         /*
9293                          * If we have pmu_enable/pmu_disable calls, install
9294                          * transaction stubs that use them to try to batch
9295                          * hardware accesses.
9296                          */
9297                         pmu->start_txn  = perf_pmu_start_txn;
9298                         pmu->commit_txn = perf_pmu_commit_txn;
9299                         pmu->cancel_txn = perf_pmu_cancel_txn;
9300                 } else {
9301                         pmu->start_txn  = perf_pmu_nop_txn;
9302                         pmu->commit_txn = perf_pmu_nop_int;
9303                         pmu->cancel_txn = perf_pmu_nop_void;
9304                 }
9305         }
9306
9307         if (!pmu->pmu_enable) {
9308                 pmu->pmu_enable  = perf_pmu_nop_void;
9309                 pmu->pmu_disable = perf_pmu_nop_void;
9310         }
9311
9312         if (!pmu->check_period)
9313                 pmu->check_period = perf_event_nop_int;
9314
9315         if (!pmu->event_idx)
9316                 pmu->event_idx = perf_event_idx_default;
9317
9318         list_add_rcu(&pmu->entry, &pmus);
9319         atomic_set(&pmu->exclusive_cnt, 0);
9320         ret = 0;
9321 unlock:
9322         mutex_unlock(&pmus_lock);
9323
9324         return ret;
9325
9326 free_dev:
9327         device_del(pmu->dev);
9328         put_device(pmu->dev);
9329
9330 free_idr:
9331         if (pmu->type >= PERF_TYPE_MAX)
9332                 idr_remove(&pmu_idr, pmu->type);
9333
9334 free_pdc:
9335         free_percpu(pmu->pmu_disable_count);
9336         goto unlock;
9337 }
9338 EXPORT_SYMBOL_GPL(perf_pmu_register);
9339
9340 void perf_pmu_unregister(struct pmu *pmu)
9341 {
9342         mutex_lock(&pmus_lock);
9343         list_del_rcu(&pmu->entry);
9344
9345         /*
9346          * We dereference the pmu list under both SRCU and regular RCU, so
9347          * synchronize against both of those.
9348          */
9349         synchronize_srcu(&pmus_srcu);
9350         synchronize_rcu();
9351
9352         free_percpu(pmu->pmu_disable_count);
9353         if (pmu->type >= PERF_TYPE_MAX)
9354                 idr_remove(&pmu_idr, pmu->type);
9355         if (pmu_bus_running) {
9356                 if (pmu->nr_addr_filters)
9357                         device_remove_file(pmu->dev, &dev_attr_nr_addr_filters);
9358                 device_del(pmu->dev);
9359                 put_device(pmu->dev);
9360         }
9361         free_pmu_context(pmu);
9362         mutex_unlock(&pmus_lock);
9363 }
9364 EXPORT_SYMBOL_GPL(perf_pmu_unregister);
9365
9366 static int perf_try_init_event(struct pmu *pmu, struct perf_event *event)
9367 {
9368         struct perf_event_context *ctx = NULL;
9369         int ret;
9370
9371         if (!try_module_get(pmu->module))
9372                 return -ENODEV;
9373
9374         if (event->group_leader != event) {
9375                 /*
9376                  * This ctx->mutex can nest when we're called through
9377                  * inheritance. See the perf_event_ctx_lock_nested() comment.
9378                  */
9379                 ctx = perf_event_ctx_lock_nested(event->group_leader,
9380                                                  SINGLE_DEPTH_NESTING);
9381                 BUG_ON(!ctx);
9382         }
9383
9384         event->pmu = pmu;
9385         ret = pmu->event_init(event);
9386
9387         if (ctx)
9388                 perf_event_ctx_unlock(event->group_leader, ctx);
9389
9390         if (ret)
9391                 module_put(pmu->module);
9392
9393         return ret;
9394 }
9395
9396 static struct pmu *perf_init_event(struct perf_event *event)
9397 {
9398         struct pmu *pmu;
9399         int idx;
9400         int ret;
9401
9402         idx = srcu_read_lock(&pmus_srcu);
9403
9404         /* Try parent's PMU first: */
9405         if (event->parent && event->parent->pmu) {
9406                 pmu = event->parent->pmu;
9407                 ret = perf_try_init_event(pmu, event);
9408                 if (!ret)
9409                         goto unlock;
9410         }
9411
9412         rcu_read_lock();
9413         pmu = idr_find(&pmu_idr, event->attr.type);
9414         rcu_read_unlock();
9415         if (pmu) {
9416                 ret = perf_try_init_event(pmu, event);
9417                 if (ret)
9418                         pmu = ERR_PTR(ret);
9419                 goto unlock;
9420         }
9421
9422         list_for_each_entry_rcu(pmu, &pmus, entry) {
9423                 ret = perf_try_init_event(pmu, event);
9424                 if (!ret)
9425                         goto unlock;
9426
9427                 if (ret != -ENOENT) {
9428                         pmu = ERR_PTR(ret);
9429                         goto unlock;
9430                 }
9431         }
9432         pmu = ERR_PTR(-ENOENT);
9433 unlock:
9434         srcu_read_unlock(&pmus_srcu, idx);
9435
9436         return pmu;
9437 }
9438
9439 static void attach_sb_event(struct perf_event *event)
9440 {
9441         struct pmu_event_list *pel = per_cpu_ptr(&pmu_sb_events, event->cpu);
9442
9443         raw_spin_lock(&pel->lock);
9444         list_add_rcu(&event->sb_list, &pel->list);
9445         raw_spin_unlock(&pel->lock);
9446 }
9447
9448 /*
9449  * We keep a list of all !task (and therefore per-cpu) events
9450  * that need to receive side-band records.
9451  *
9452  * This avoids having to scan all the various PMU per-cpu contexts
9453  * looking for them.
9454  */
9455 static void account_pmu_sb_event(struct perf_event *event)
9456 {
9457         if (is_sb_event(event))
9458                 attach_sb_event(event);
9459 }
9460
9461 static void account_event_cpu(struct perf_event *event, int cpu)
9462 {
9463         if (event->parent)
9464                 return;
9465
9466         if (is_cgroup_event(event))
9467                 atomic_inc(&per_cpu(perf_cgroup_events, cpu));
9468 }
9469
9470 /* Freq events need the tick to stay alive (see perf_event_task_tick). */
9471 static void account_freq_event_nohz(void)
9472 {
9473 #ifdef CONFIG_NO_HZ_FULL
9474         /* Lock so we don't race with concurrent unaccount */
9475         spin_lock(&nr_freq_lock);
9476         if (atomic_inc_return(&nr_freq_events) == 1)
9477                 tick_nohz_dep_set(TICK_DEP_BIT_PERF_EVENTS);
9478         spin_unlock(&nr_freq_lock);
9479 #endif
9480 }
9481
9482 static void account_freq_event(void)
9483 {
9484         if (tick_nohz_full_enabled())
9485                 account_freq_event_nohz();
9486         else
9487                 atomic_inc(&nr_freq_events);
9488 }
9489
9490
9491 static void account_event(struct perf_event *event)
9492 {
9493         bool inc = false;
9494
9495         if (event->parent)
9496                 return;
9497
9498         if (event->attach_state & PERF_ATTACH_TASK)
9499                 inc = true;
9500         if (event->attr.mmap || event->attr.mmap_data)
9501                 atomic_inc(&nr_mmap_events);
9502         if (event->attr.comm)
9503                 atomic_inc(&nr_comm_events);
9504         if (event->attr.namespaces)
9505                 atomic_inc(&nr_namespaces_events);
9506         if (event->attr.task)
9507                 atomic_inc(&nr_task_events);
9508         if (event->attr.freq)
9509                 account_freq_event();
9510         if (event->attr.context_switch) {
9511                 atomic_inc(&nr_switch_events);
9512                 inc = true;
9513         }
9514         if (has_branch_stack(event))
9515                 inc = true;
9516         if (is_cgroup_event(event))
9517                 inc = true;
9518
9519         if (inc) {
9520                 if (atomic_inc_not_zero(&perf_sched_count))
9521                         goto enabled;
9522
9523                 mutex_lock(&perf_sched_mutex);
9524                 if (!atomic_read(&perf_sched_count)) {
9525                         static_branch_enable(&perf_sched_events);
9526                         /*
9527                          * Guarantee that all CPUs observe the key change and
9528                          * call the perf scheduling hooks before proceeding to
9529                          * install events that need them.
9530                          */
9531                         synchronize_sched();
9532                 }
9533                 /*
9534                  * Now that we have waited for the sync_sched(), allow further
9535                  * increments to bypass the mutex.
9536                  */
9537                 atomic_inc(&perf_sched_count);
9538                 mutex_unlock(&perf_sched_mutex);
9539         }
9540 enabled:
9541
9542         account_event_cpu(event, event->cpu);
9543
9544         account_pmu_sb_event(event);
9545 }
9546
9547 /*
9548  * Allocate and initialize an event structure
9549  */
9550 static struct perf_event *
9551 perf_event_alloc(struct perf_event_attr *attr, int cpu,
9552                  struct task_struct *task,
9553                  struct perf_event *group_leader,
9554                  struct perf_event *parent_event,
9555                  perf_overflow_handler_t overflow_handler,
9556                  void *context, int cgroup_fd)
9557 {
9558         struct pmu *pmu;
9559         struct perf_event *event;
9560         struct hw_perf_event *hwc;
9561         long err = -EINVAL;
9562
9563         if ((unsigned)cpu >= nr_cpu_ids) {
9564                 if (!task || cpu != -1)
9565                         return ERR_PTR(-EINVAL);
9566         }
9567
9568         event = kzalloc(sizeof(*event), GFP_KERNEL);
9569         if (!event)
9570                 return ERR_PTR(-ENOMEM);
9571
9572         /*
9573          * Single events are their own group leaders, with an
9574          * empty sibling list:
9575          */
9576         if (!group_leader)
9577                 group_leader = event;
9578
9579         mutex_init(&event->child_mutex);
9580         INIT_LIST_HEAD(&event->child_list);
9581
9582         INIT_LIST_HEAD(&event->group_entry);
9583         INIT_LIST_HEAD(&event->event_entry);
9584         INIT_LIST_HEAD(&event->sibling_list);
9585         INIT_LIST_HEAD(&event->rb_entry);
9586         INIT_LIST_HEAD(&event->active_entry);
9587         INIT_LIST_HEAD(&event->addr_filters.list);
9588         INIT_HLIST_NODE(&event->hlist_entry);
9589
9590
9591         init_waitqueue_head(&event->waitq);
9592         init_irq_work(&event->pending, perf_pending_event);
9593
9594         mutex_init(&event->mmap_mutex);
9595         raw_spin_lock_init(&event->addr_filters.lock);
9596
9597         atomic_long_set(&event->refcount, 1);
9598         event->cpu              = cpu;
9599         event->attr             = *attr;
9600         event->group_leader     = group_leader;
9601         event->pmu              = NULL;
9602         event->oncpu            = -1;
9603
9604         event->parent           = parent_event;
9605
9606         event->ns               = get_pid_ns(task_active_pid_ns(current));
9607         event->id               = atomic64_inc_return(&perf_event_id);
9608
9609         event->state            = PERF_EVENT_STATE_INACTIVE;
9610
9611         if (task) {
9612                 event->attach_state = PERF_ATTACH_TASK;
9613                 /*
9614                  * XXX pmu::event_init needs to know what task to account to
9615                  * and we cannot use the ctx information because we need the
9616                  * pmu before we get a ctx.
9617                  */
9618                 get_task_struct(task);
9619                 event->hw.target = task;
9620         }
9621
9622         event->clock = &local_clock;
9623         if (parent_event)
9624                 event->clock = parent_event->clock;
9625
9626         if (!overflow_handler && parent_event) {
9627                 overflow_handler = parent_event->overflow_handler;
9628                 context = parent_event->overflow_handler_context;
9629 #if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_EVENT_TRACING)
9630                 if (overflow_handler == bpf_overflow_handler) {
9631                         struct bpf_prog *prog = bpf_prog_inc(parent_event->prog);
9632
9633                         if (IS_ERR(prog)) {
9634                                 err = PTR_ERR(prog);
9635                                 goto err_ns;
9636                         }
9637                         event->prog = prog;
9638                         event->orig_overflow_handler =
9639                                 parent_event->orig_overflow_handler;
9640                 }
9641 #endif
9642         }
9643
9644         if (overflow_handler) {
9645                 event->overflow_handler = overflow_handler;
9646                 event->overflow_handler_context = context;
9647         } else if (is_write_backward(event)) {
9648                 event->overflow_handler = perf_event_output_backward;
9649                 event->overflow_handler_context = NULL;
9650         } else {
9651                 event->overflow_handler = perf_event_output_forward;
9652                 event->overflow_handler_context = NULL;
9653         }
9654
9655         perf_event__state_init(event);
9656
9657         pmu = NULL;
9658
9659         hwc = &event->hw;
9660         hwc->sample_period = attr->sample_period;
9661         if (attr->freq && attr->sample_freq)
9662                 hwc->sample_period = 1;
9663         hwc->last_period = hwc->sample_period;
9664
9665         local64_set(&hwc->period_left, hwc->sample_period);
9666
9667         /*
9668          * We currently do not support PERF_SAMPLE_READ on inherited events.
9669          * See perf_output_read().
9670          */
9671         if (attr->inherit && (attr->sample_type & PERF_SAMPLE_READ))
9672                 goto err_ns;
9673
9674         if (!has_branch_stack(event))
9675                 event->attr.branch_sample_type = 0;
9676
9677         if (cgroup_fd != -1) {
9678                 err = perf_cgroup_connect(cgroup_fd, event, attr, group_leader);
9679                 if (err)
9680                         goto err_ns;
9681         }
9682
9683         pmu = perf_init_event(event);
9684         if (IS_ERR(pmu)) {
9685                 err = PTR_ERR(pmu);
9686                 goto err_ns;
9687         }
9688
9689         err = exclusive_event_init(event);
9690         if (err)
9691                 goto err_pmu;
9692
9693         if (has_addr_filter(event)) {
9694                 event->addr_filters_offs = kcalloc(pmu->nr_addr_filters,
9695                                                    sizeof(unsigned long),
9696                                                    GFP_KERNEL);
9697                 if (!event->addr_filters_offs) {
9698                         err = -ENOMEM;
9699                         goto err_per_task;
9700                 }
9701
9702                 /* force hw sync on the address filters */
9703                 event->addr_filters_gen = 1;
9704         }
9705
9706         if (!event->parent) {
9707                 if (event->attr.sample_type & PERF_SAMPLE_CALLCHAIN) {
9708                         err = get_callchain_buffers(attr->sample_max_stack);
9709                         if (err)
9710                                 goto err_addr_filters;
9711                 }
9712         }
9713
9714         /* symmetric to unaccount_event() in _free_event() */
9715         account_event(event);
9716
9717         return event;
9718
9719 err_addr_filters:
9720         kfree(event->addr_filters_offs);
9721
9722 err_per_task:
9723         exclusive_event_destroy(event);
9724
9725 err_pmu:
9726         if (event->destroy)
9727                 event->destroy(event);
9728         module_put(pmu->module);
9729 err_ns:
9730         if (is_cgroup_event(event))
9731                 perf_detach_cgroup(event);
9732         if (event->ns)
9733                 put_pid_ns(event->ns);
9734         if (event->hw.target)
9735                 put_task_struct(event->hw.target);
9736         kfree(event);
9737
9738         return ERR_PTR(err);
9739 }
9740
9741 static int perf_copy_attr(struct perf_event_attr __user *uattr,
9742                           struct perf_event_attr *attr)
9743 {
9744         u32 size;
9745         int ret;
9746
9747         if (!access_ok(VERIFY_WRITE, uattr, PERF_ATTR_SIZE_VER0))
9748                 return -EFAULT;
9749
9750         /*
9751          * zero the full structure, so that a short copy will be nice.
9752          */
9753         memset(attr, 0, sizeof(*attr));
9754
9755         ret = get_user(size, &uattr->size);
9756         if (ret)
9757                 return ret;
9758
9759         if (size > PAGE_SIZE)   /* silly large */
9760                 goto err_size;
9761
9762         if (!size)              /* abi compat */
9763                 size = PERF_ATTR_SIZE_VER0;
9764
9765         if (size < PERF_ATTR_SIZE_VER0)
9766                 goto err_size;
9767
9768         /*
9769          * If we're handed a bigger struct than we know of,
9770          * ensure all the unknown bits are 0 - i.e. new
9771          * user-space does not rely on any kernel feature
9772          * extensions we don't know about yet.
9773          */
9774         if (size > sizeof(*attr)) {
9775                 unsigned char __user *addr;
9776                 unsigned char __user *end;
9777                 unsigned char val;
9778
9779                 addr = (void __user *)uattr + sizeof(*attr);
9780                 end  = (void __user *)uattr + size;
9781
9782                 for (; addr < end; addr++) {
9783                         ret = get_user(val, addr);
9784                         if (ret)
9785                                 return ret;
9786                         if (val)
9787                                 goto err_size;
9788                 }
9789                 size = sizeof(*attr);
9790         }
9791
9792         ret = copy_from_user(attr, uattr, size);
9793         if (ret)
9794                 return -EFAULT;
9795
9796         attr->size = size;
9797
9798         if (attr->__reserved_1)
9799                 return -EINVAL;
9800
9801         if (attr->sample_type & ~(PERF_SAMPLE_MAX-1))
9802                 return -EINVAL;
9803
9804         if (attr->read_format & ~(PERF_FORMAT_MAX-1))
9805                 return -EINVAL;
9806
9807         if (attr->sample_type & PERF_SAMPLE_BRANCH_STACK) {
9808                 u64 mask = attr->branch_sample_type;
9809
9810                 /* only using defined bits */
9811                 if (mask & ~(PERF_SAMPLE_BRANCH_MAX-1))
9812                         return -EINVAL;
9813
9814                 /* at least one branch bit must be set */
9815                 if (!(mask & ~PERF_SAMPLE_BRANCH_PLM_ALL))
9816                         return -EINVAL;
9817
9818                 /* propagate priv level, when not set for branch */
9819                 if (!(mask & PERF_SAMPLE_BRANCH_PLM_ALL)) {
9820
9821                         /* exclude_kernel checked on syscall entry */
9822                         if (!attr->exclude_kernel)
9823                                 mask |= PERF_SAMPLE_BRANCH_KERNEL;
9824
9825                         if (!attr->exclude_user)
9826                                 mask |= PERF_SAMPLE_BRANCH_USER;
9827
9828                         if (!attr->exclude_hv)
9829                                 mask |= PERF_SAMPLE_BRANCH_HV;
9830                         /*
9831                          * adjust user setting (for HW filter setup)
9832                          */
9833                         attr->branch_sample_type = mask;
9834                 }
9835                 /* privileged levels capture (kernel, hv): check permissions */
9836                 if ((mask & PERF_SAMPLE_BRANCH_PERM_PLM)
9837                     && perf_paranoid_kernel() && !capable(CAP_SYS_ADMIN))
9838                         return -EACCES;
9839         }
9840
9841         if (attr->sample_type & PERF_SAMPLE_REGS_USER) {
9842                 ret = perf_reg_validate(attr->sample_regs_user);
9843                 if (ret)
9844                         return ret;
9845         }
9846
9847         if (attr->sample_type & PERF_SAMPLE_STACK_USER) {
9848                 if (!arch_perf_have_user_stack_dump())
9849                         return -ENOSYS;
9850
9851                 /*
9852                  * We have __u32 type for the size, but so far
9853                  * we can only use __u16 as maximum due to the
9854                  * __u16 sample size limit.
9855                  */
9856                 if (attr->sample_stack_user >= USHRT_MAX)
9857                         return -EINVAL;
9858                 else if (!IS_ALIGNED(attr->sample_stack_user, sizeof(u64)))
9859                         return -EINVAL;
9860         }
9861
9862         if (attr->sample_type & PERF_SAMPLE_REGS_INTR)
9863                 ret = perf_reg_validate(attr->sample_regs_intr);
9864 out:
9865         return ret;
9866
9867 err_size:
9868         put_user(sizeof(*attr), &uattr->size);
9869         ret = -E2BIG;
9870         goto out;
9871 }
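
/*
 * A minimal user-space sketch of the size-based ABI handshake above
 * (illustrative only; the field values are arbitrary examples): a caller
 * sets attr.size to the structure size it was built against.  A shorter
 * attr is zero-extended by the kernel; a larger one is accepted only if
 * every byte past what this kernel knows is zero, otherwise the call
 * fails with -E2BIG and uattr->size is rewritten to the supported size.
 *
 *	struct perf_event_attr attr;
 *
 *	memset(&attr, 0, sizeof(attr));
 *	attr.type   = PERF_TYPE_HARDWARE;
 *	attr.config = PERF_COUNT_HW_CPU_CYCLES;
 *	attr.size   = sizeof(attr);
 */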
9872
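/*
 * Take two mutexes in a globally consistent order (lowest address first)
 * so that two callers locking the same pair from opposite directions
 * cannot deadlock; the _nested annotation tells lockdep that the second
 * acquisition of the same lock class is intentional.
 */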
9873 static void mutex_lock_double(struct mutex *a, struct mutex *b)
9874 {
9875         if (b < a)
9876                 swap(a, b);
9877
9878         mutex_lock(a);
9879         mutex_lock_nested(b, SINGLE_DEPTH_NESTING);
9880 }
9881
9882 static int
9883 perf_event_set_output(struct perf_event *event, struct perf_event *output_event)
9884 {
9885         struct ring_buffer *rb = NULL;
9886         int ret = -EINVAL;
9887
9888         if (!output_event) {
9889                 mutex_lock(&event->mmap_mutex);
9890                 goto set;
9891         }
9892
9893         /* don't allow circular references */
9894         if (event == output_event)
9895                 goto out;
9896
9897         /*
9898          * Don't allow cross-cpu buffers
9899          */
9900         if (output_event->cpu != event->cpu)
9901                 goto out;
9902
9903         /*
9904          * If it's not a per-cpu rb, it must be the same task.
9905          */
9906         if (output_event->cpu == -1 && output_event->hw.target != event->hw.target)
9907                 goto out;
9908
9909         /*
9910          * Mixing clocks in the same buffer is trouble you don't need.
9911          */
9912         if (output_event->clock != event->clock)
9913                 goto out;
9914
9915         /*
9916          * Either writing ring buffer from beginning or from end.
9917          * Mixing is not allowed.
9918          */
9919         if (is_write_backward(output_event) != is_write_backward(event))
9920                 goto out;
9921
9922         /*
9923          * If both events generate aux data, they must be on the same PMU
9924          */
9925         if (has_aux(event) && has_aux(output_event) &&
9926             event->pmu != output_event->pmu)
9927                 goto out;
9928
9929         /*
9930          * Hold both mmap_mutex to serialize against perf_mmap_close().  Since
9931          * output_event is already on rb->event_list, and the list iteration
9932          * restarts after every removal, it is guaranteed this new event is
9933          * observed *OR* if output_event is already removed, it's guaranteed we
9934          * observe !rb->mmap_count.
9935          */
9936         mutex_lock_double(&event->mmap_mutex, &output_event->mmap_mutex);
9937 set:
9938         /* Can't redirect output if we've got an active mmap() */
9939         if (atomic_read(&event->mmap_count))
9940                 goto unlock;
9941
9942         if (output_event) {
9943                 /* get the rb we want to redirect to */
9944                 rb = ring_buffer_get(output_event);
9945                 if (!rb)
9946                         goto unlock;
9947
9948                 /* did we race against perf_mmap_close() */
9949                 if (!atomic_read(&rb->mmap_count)) {
9950                         ring_buffer_put(rb);
9951                         goto unlock;
9952                 }
9953         }
9954
9955         ring_buffer_attach(event, rb);
9956
9957         ret = 0;
9958 unlock:
9959         mutex_unlock(&event->mmap_mutex);
9960         if (output_event)
9961                 mutex_unlock(&output_event->mmap_mutex);
9962
9963 out:
9964         return ret;
9965 }
9966
9967 static int perf_event_set_clock(struct perf_event *event, clockid_t clk_id)
9968 {
9969         bool nmi_safe = false;
9970
9971         switch (clk_id) {
9972         case CLOCK_MONOTONIC:
9973                 event->clock = &ktime_get_mono_fast_ns;
9974                 nmi_safe = true;
9975                 break;
9976
9977         case CLOCK_MONOTONIC_RAW:
9978                 event->clock = &ktime_get_raw_fast_ns;
9979                 nmi_safe = true;
9980                 break;
9981
9982         case CLOCK_REALTIME:
9983                 event->clock = &ktime_get_real_ns;
9984                 break;
9985
9986         case CLOCK_BOOTTIME:
9987                 event->clock = &ktime_get_boot_ns;
9988                 break;
9989
9990         case CLOCK_TAI:
9991                 event->clock = &ktime_get_tai_ns;
9992                 break;
9993
9994         default:
9995                 return -EINVAL;
9996         }
9997
9998         if (!nmi_safe && !(event->pmu->capabilities & PERF_PMU_CAP_NO_NMI))
9999                 return -EINVAL;
10000
10001         return 0;
10002 }
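
/*
 * An illustrative user-space sketch of selecting one of the clocks above;
 * note that the NMI-unsafe clocks (REALTIME, BOOTTIME, TAI) are only
 * accepted when the PMU advertises PERF_PMU_CAP_NO_NMI:
 *
 *	attr.use_clockid = 1;
 *	attr.clockid     = CLOCK_MONOTONIC_RAW;
 */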
10003
10004 /*
10005  * Variation on perf_event_ctx_lock_nested(), except we take two context
10006  * mutexes.
10007  */
10008 static struct perf_event_context *
10009 __perf_event_ctx_lock_double(struct perf_event *group_leader,
10010                              struct perf_event_context *ctx)
10011 {
10012         struct perf_event_context *gctx;
10013
10014 again:
10015         rcu_read_lock();
10016         gctx = READ_ONCE(group_leader->ctx);
10017         if (!atomic_inc_not_zero(&gctx->refcount)) {
10018                 rcu_read_unlock();
10019                 goto again;
10020         }
10021         rcu_read_unlock();
10022
10023         mutex_lock_double(&gctx->mutex, &ctx->mutex);
10024
10025         if (group_leader->ctx != gctx) {
10026                 mutex_unlock(&ctx->mutex);
10027                 mutex_unlock(&gctx->mutex);
10028                 put_ctx(gctx);
10029                 goto again;
10030         }
10031
10032         return gctx;
10033 }
10034
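/*
 * An illustrative user-space call (not part of this file): glibc provides
 * no wrapper for this syscall, so callers typically go through syscall(2),
 * e.g. counting cycles of the calling thread on any CPU:
 *
 *	struct perf_event_attr attr = {
 *		.type   = PERF_TYPE_HARDWARE,
 *		.config = PERF_COUNT_HW_CPU_CYCLES,
 *		.size   = sizeof(attr),
 *	};
 *	int fd = syscall(__NR_perf_event_open, &attr, 0, -1, -1,
 *			 PERF_FLAG_FD_CLOEXEC);
 */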
10035 /**
10036  * sys_perf_event_open - open a performance event, associate it to a task/cpu
10037  *
10038  * @attr_uptr:  event_id type attributes for monitoring/sampling
10039  * @pid:                target pid
10040  * @cpu:                target cpu
10041  * @group_fd:           group leader event fd
10042  */
10043 SYSCALL_DEFINE5(perf_event_open,
10044                 struct perf_event_attr __user *, attr_uptr,
10045                 pid_t, pid, int, cpu, int, group_fd, unsigned long, flags)
10046 {
10047         struct perf_event *group_leader = NULL, *output_event = NULL;
10048         struct perf_event *event, *sibling;
10049         struct perf_event_attr attr;
10050         struct perf_event_context *ctx, *gctx;
10051         struct file *event_file = NULL;
10052         struct fd group = {NULL, 0};
10053         struct task_struct *task = NULL;
10054         struct pmu *pmu;
10055         int event_fd;
10056         int move_group = 0;
10057         int err;
10058         int f_flags = O_RDWR;
10059         int cgroup_fd = -1;
10060
10061         /* for future expandability... */
10062         if (flags & ~PERF_FLAG_ALL)
10063                 return -EINVAL;
10064
10065         err = perf_copy_attr(attr_uptr, &attr);
10066         if (err)
10067                 return err;
10068
10069         if (!attr.exclude_kernel) {
10070                 if (perf_paranoid_kernel() && !capable(CAP_SYS_ADMIN))
10071                         return -EACCES;
10072         }
10073
10074         if (attr.namespaces) {
10075                 if (!capable(CAP_SYS_ADMIN))
10076                         return -EACCES;
10077         }
10078
10079         if (attr.freq) {
10080                 if (attr.sample_freq > sysctl_perf_event_sample_rate)
10081                         return -EINVAL;
10082         } else {
10083                 if (attr.sample_period & (1ULL << 63))
10084                         return -EINVAL;
10085         }
10086
10087         /* Only privileged users can get physical addresses */
10088         if ((attr.sample_type & PERF_SAMPLE_PHYS_ADDR) &&
10089             perf_paranoid_kernel() && !capable(CAP_SYS_ADMIN))
10090                 return -EACCES;
10091
10092         if (!attr.sample_max_stack)
10093                 attr.sample_max_stack = sysctl_perf_event_max_stack;
10094
10095         /*
10096          * In cgroup mode, the pid argument is used to pass the fd
10097          * opened to the cgroup directory in cgroupfs. The cpu argument
10098          * designates the cpu on which to monitor threads from that
10099          * cgroup.
10100          */
10101         if ((flags & PERF_FLAG_PID_CGROUP) && (pid == -1 || cpu == -1))
10102                 return -EINVAL;
10103
10104         if (flags & PERF_FLAG_FD_CLOEXEC)
10105                 f_flags |= O_CLOEXEC;
10106
10107         event_fd = get_unused_fd_flags(f_flags);
10108         if (event_fd < 0)
10109                 return event_fd;
10110
10111         if (group_fd != -1) {
10112                 err = perf_fget_light(group_fd, &group);
10113                 if (err)
10114                         goto err_fd;
10115                 group_leader = group.file->private_data;
10116                 if (flags & PERF_FLAG_FD_OUTPUT)
10117                         output_event = group_leader;
10118                 if (flags & PERF_FLAG_FD_NO_GROUP)
10119                         group_leader = NULL;
10120         }
10121
10122         if (pid != -1 && !(flags & PERF_FLAG_PID_CGROUP)) {
10123                 task = find_lively_task_by_vpid(pid);
10124                 if (IS_ERR(task)) {
10125                         err = PTR_ERR(task);
10126                         goto err_group_fd;
10127                 }
10128         }
10129
10130         if (task && group_leader &&
10131             group_leader->attr.inherit != attr.inherit) {
10132                 err = -EINVAL;
10133                 goto err_task;
10134         }
10135
10136         if (task) {
10137                 err = mutex_lock_interruptible(&task->signal->cred_guard_mutex);
10138                 if (err)
10139                         goto err_task;
10140
10141                 /*
10142                  * Reuse ptrace permission checks for now.
10143                  *
10144                  * We must hold cred_guard_mutex across this and any potential
10145                  * perf_install_in_context() call for this new event to
10146                  * serialize against exec() altering our credentials (and the
10147                  * perf_event_exit_task() that could imply).
10148                  */
10149                 err = -EACCES;
10150                 if (!ptrace_may_access(task, PTRACE_MODE_READ_REALCREDS))
10151                         goto err_cred;
10152         }
10153
10154         if (flags & PERF_FLAG_PID_CGROUP)
10155                 cgroup_fd = pid;
10156
10157         event = perf_event_alloc(&attr, cpu, task, group_leader, NULL,
10158                                  NULL, NULL, cgroup_fd);
10159         if (IS_ERR(event)) {
10160                 err = PTR_ERR(event);
10161                 goto err_cred;
10162         }
10163
10164         if (is_sampling_event(event)) {
10165                 if (event->pmu->capabilities & PERF_PMU_CAP_NO_INTERRUPT) {
10166                         err = -EOPNOTSUPP;
10167                         goto err_alloc;
10168                 }
10169         }
10170
10171         /*
10172          * Special case software events and allow them to be part of
10173          * any hardware group.
10174          */
10175         pmu = event->pmu;
10176
10177         if (attr.use_clockid) {
10178                 err = perf_event_set_clock(event, attr.clockid);
10179                 if (err)
10180                         goto err_alloc;
10181         }
10182
10183         if (pmu->task_ctx_nr == perf_sw_context)
10184                 event->event_caps |= PERF_EV_CAP_SOFTWARE;
10185
10186         if (group_leader &&
10187             (is_software_event(event) != is_software_event(group_leader))) {
10188                 if (is_software_event(event)) {
10189                         /*
10190                          * If event and group_leader are not both a software
10191                          * event, and event is, then group leader is not.
10192                          *
10193                          * Allow the addition of software events to !software
10194                          * groups; this is safe because software events never
10195                          * fail to schedule.
10196                          */
10197                         pmu = group_leader->pmu;
10198                 } else if (is_software_event(group_leader) &&
10199                            (group_leader->group_caps & PERF_EV_CAP_SOFTWARE)) {
10200                         /*
10201                          * In case the group is a pure software group, and we
10202                          * try to add a hardware event, move the whole group to
10203                          * the hardware context.
10204                          */
10205                         move_group = 1;
10206                 }
10207         }
10208
10209         /*
10210          * Get the target context (task or percpu):
10211          */
10212         ctx = find_get_context(pmu, task, event);
10213         if (IS_ERR(ctx)) {
10214                 err = PTR_ERR(ctx);
10215                 goto err_alloc;
10216         }
10217
10218         if ((pmu->capabilities & PERF_PMU_CAP_EXCLUSIVE) && group_leader) {
10219                 err = -EBUSY;
10220                 goto err_context;
10221         }
10222
10223         /*
10224          * Look up the group leader (we will attach this event to it):
10225          */
10226         if (group_leader) {
10227                 err = -EINVAL;
10228
10229                 /*
10230                  * Do not allow a recursive hierarchy (this new sibling
10231                  * becoming part of another group-sibling):
10232                  */
10233                 if (group_leader->group_leader != group_leader)
10234                         goto err_context;
10235
10236                 /* All events in a group should have the same clock */
10237                 if (group_leader->clock != event->clock)
10238                         goto err_context;
10239
10240                 /*
10241                  * Make sure both events are for the same CPU;
10242                  * grouping events for different CPUs is broken, since
10243                  * you can never concurrently schedule them anyhow.
10244                  */
10245                 if (group_leader->cpu != event->cpu)
10246                         goto err_context;
10247
10248                 /*
10249                  * Make sure both events are on the same task, or
10250                  * both are per-CPU events.
10251                  */
10252                 if (group_leader->ctx->task != ctx->task)
10253                         goto err_context;
10254
10255                 /*
10256                  * Do not allow to attach to a group in a different task
10257                  * or CPU context. If we're moving SW events, we'll fix
10258                  * this up later, so allow that.
10259                  *
10260                  * Racy, not holding group_leader->ctx->mutex, see comment with
10261                  * perf_event_ctx_lock().
10262                  */
10263                 if (!move_group && group_leader->ctx != ctx)
10264                         goto err_context;
10265
10266                 /*
10267                  * Only a group leader can be exclusive or pinned
10268                  */
10269                 if (attr.exclusive || attr.pinned)
10270                         goto err_context;
10271         }
10272
10273         if (output_event) {
10274                 err = perf_event_set_output(event, output_event);
10275                 if (err)
10276                         goto err_context;
10277         }
10278
10279         event_file = anon_inode_getfile("[perf_event]", &perf_fops, event,
10280                                         f_flags);
10281         if (IS_ERR(event_file)) {
10282                 err = PTR_ERR(event_file);
10283                 event_file = NULL;
10284                 goto err_context;
10285         }
10286
10287         if (move_group) {
10288                 gctx = __perf_event_ctx_lock_double(group_leader, ctx);
10289
10290                 if (gctx->task == TASK_TOMBSTONE) {
10291                         err = -ESRCH;
10292                         goto err_locked;
10293                 }
10294
10295                 /*
10296                  * Check if we raced against another sys_perf_event_open() call
10297                  * moving the software group underneath us.
10298                  */
10299                 if (!(group_leader->group_caps & PERF_EV_CAP_SOFTWARE)) {
10300                         /*
10301                          * If someone moved the group out from under us, check
10302                          * if this new event wound up on the same ctx; if so,
10303                          * it's the regular !move_group case, otherwise fail.
10304                          */
10305                         if (gctx != ctx) {
10306                                 err = -EINVAL;
10307                                 goto err_locked;
10308                         } else {
10309                                 perf_event_ctx_unlock(group_leader, gctx);
10310                                 move_group = 0;
10311                                 goto not_move_group;
10312                         }
10313                 }
10314         } else {
10315                 mutex_lock(&ctx->mutex);
10316
10317                 /*
10318                  * Now that we hold ctx->lock, (re)validate group_leader->ctx == ctx,
10319                  * see the group_leader && !move_group test earlier.
10320                  */
10321                 if (group_leader && group_leader->ctx != ctx) {
10322                         err = -EINVAL;
10323                         goto err_locked;
10324                 }
10325         }
10326 not_move_group:
10327
10328         if (ctx->task == TASK_TOMBSTONE) {
10329                 err = -ESRCH;
10330                 goto err_locked;
10331         }
10332
10333         if (!perf_event_validate_size(event)) {
10334                 err = -E2BIG;
10335                 goto err_locked;
10336         }
10337
10338         if (!task) {
10339                 /*
10340                  * Check if the @cpu we're creating an event for is online.
10341                  *
10342                  * We use the perf_cpu_context::ctx::mutex to serialize against
10343                  * the hotplug notifiers. See perf_event_{init,exit}_cpu().
10344                  */
10345                 struct perf_cpu_context *cpuctx =
10346                         container_of(ctx, struct perf_cpu_context, ctx);
10347
10348                 if (!cpuctx->online) {
10349                         err = -ENODEV;
10350                         goto err_locked;
10351                 }
10352         }
10353
10354
10355         /*
10356          * Must be under the same ctx::mutex as perf_install_in_context(),
10357          * because we need to serialize with concurrent event creation.
10358          */
10359         if (!exclusive_event_installable(event, ctx)) {
10360                 /* exclusive and group stuff are assumed mutually exclusive */
10361                 WARN_ON_ONCE(move_group);
10362
10363                 err = -EBUSY;
10364                 goto err_locked;
10365         }
10366
10367         WARN_ON_ONCE(ctx->parent_ctx);
10368
10369         /*
10370          * This is the point on no return; we cannot fail hereafter. This is
10371          * where we start modifying current state.
10372          */
10373
10374         if (move_group) {
10375                 /*
10376                  * See perf_event_ctx_lock() for comments on the details
10377                  * of swizzling perf_event::ctx.
10378                  */
10379                 perf_remove_from_context(group_leader, 0);
10380                 put_ctx(gctx);
10381
10382                 list_for_each_entry(sibling, &group_leader->sibling_list,
10383                                     group_entry) {
10384                         perf_remove_from_context(sibling, 0);
10385                         put_ctx(gctx);
10386                 }
10387
10388                 /*
10389                  * Wait for everybody to stop referencing the events through
10390                  * the old lists, before installing it on new lists.
10391                  */
10392                 synchronize_rcu();
10393
10394                 /*
10395                  * Install the group siblings before the group leader.
10396                  *
10397                  * Because a group leader will try and install the entire group
10398                  * (through the sibling list, which is still intact), we can
10399                  * end up with siblings installed in the wrong context.
10400                  *
10401                  * By installing siblings first we NO-OP because they're not
10402                  * reachable through the group lists.
10403                  */
10404                 list_for_each_entry(sibling, &group_leader->sibling_list,
10405                                     group_entry) {
10406                         perf_event__state_init(sibling);
10407                         perf_install_in_context(ctx, sibling, sibling->cpu);
10408                         get_ctx(ctx);
10409                 }
10410
10411                 /*
10412                  * Removing from the context ends up with a disabled
10413                  * event. What we want here is an event in the initial
10414                  * startup state, ready to be added into the new context.
10415                  */
10416                 perf_event__state_init(group_leader);
10417                 perf_install_in_context(ctx, group_leader, group_leader->cpu);
10418                 get_ctx(ctx);
10419         }
10420
10421         /*
10422          * Precalculate sample_data sizes; do while holding ctx::mutex such
10423          * that we're serialized against further additions and before
10424          * perf_install_in_context() which is the point the event is active and
10425          * can use these values.
10426          */
10427         perf_event__header_size(event);
10428         perf_event__id_header_size(event);
10429
10430         event->owner = current;
10431
10432         perf_install_in_context(ctx, event, event->cpu);
10433         perf_unpin_context(ctx);
10434
10435         if (move_group)
10436                 perf_event_ctx_unlock(group_leader, gctx);
10437         mutex_unlock(&ctx->mutex);
10438
10439         if (task) {
10440                 mutex_unlock(&task->signal->cred_guard_mutex);
10441                 put_task_struct(task);
10442         }
10443
10444         mutex_lock(&current->perf_event_mutex);
10445         list_add_tail(&event->owner_entry, &current->perf_event_list);
10446         mutex_unlock(&current->perf_event_mutex);
10447
10448         /*
10449          * Drop the reference on the group_event after placing the
10450          * new event on the sibling_list. This ensures destruction
10451          * of the group leader will find the pointer to itself in
10452          * perf_group_detach().
10453          */
10454         fdput(group);
10455         fd_install(event_fd, event_file);
10456         return event_fd;
10457
10458 err_locked:
10459         if (move_group)
10460                 perf_event_ctx_unlock(group_leader, gctx);
10461         mutex_unlock(&ctx->mutex);
10462 /* err_file: */
10463         fput(event_file);
10464 err_context:
10465         perf_unpin_context(ctx);
10466         put_ctx(ctx);
10467 err_alloc:
10468         /*
10469          * If event_file is set, the fput() above will have called ->release()
10470          * and that will take care of freeing the event.
10471          */
10472         if (!event_file)
10473                 free_event(event);
10474 err_cred:
10475         if (task)
10476                 mutex_unlock(&task->signal->cred_guard_mutex);
10477 err_task:
10478         if (task)
10479                 put_task_struct(task);
10480 err_group_fd:
10481         fdput(group);
10482 err_fd:
10483         put_unused_fd(event_fd);
10484         return err;
10485 }
10486
10487 /**
10488  * perf_event_create_kernel_counter
10489  *
10490  * @attr: attributes of the counter to create
10491  * @cpu: cpu on which the counter is bound
10492  * @task: task to profile (NULL for percpu)
10493  */
10494 struct perf_event *
10495 perf_event_create_kernel_counter(struct perf_event_attr *attr, int cpu,
10496                                  struct task_struct *task,
10497                                  perf_overflow_handler_t overflow_handler,
10498                                  void *context)
10499 {
10500         struct perf_event_context *ctx;
10501         struct perf_event *event;
10502         int err;
10503
10504         /*
10505          * Get the target context (task or percpu):
10506          */
10507
10508         event = perf_event_alloc(attr, cpu, task, NULL, NULL,
10509                                  overflow_handler, context, -1);
10510         if (IS_ERR(event)) {
10511                 err = PTR_ERR(event);
10512                 goto err;
10513         }
10514
10515         /* Mark owner so we could distinguish it from user events. */
10516         event->owner = TASK_TOMBSTONE;
10517
10518         ctx = find_get_context(event->pmu, task, event);
10519         if (IS_ERR(ctx)) {
10520                 err = PTR_ERR(ctx);
10521                 goto err_free;
10522         }
10523
10524         WARN_ON_ONCE(ctx->parent_ctx);
10525         mutex_lock(&ctx->mutex);
10526         if (ctx->task == TASK_TOMBSTONE) {
10527                 err = -ESRCH;
10528                 goto err_unlock;
10529         }
10530
10531         if (!task) {
10532                 /*
10533                  * Check if the @cpu we're creating an event for is online.
10534                  *
10535                  * We use the perf_cpu_context::ctx::mutex to serialize against
10536                  * the hotplug notifiers. See perf_event_{init,exit}_cpu().
10537                  */
10538                 struct perf_cpu_context *cpuctx =
10539                         container_of(ctx, struct perf_cpu_context, ctx);
10540                 if (!cpuctx->online) {
10541                         err = -ENODEV;
10542                         goto err_unlock;
10543                 }
10544         }
10545
10546         if (!exclusive_event_installable(event, ctx)) {
10547                 err = -EBUSY;
10548                 goto err_unlock;
10549         }
10550
10551         perf_install_in_context(ctx, event, event->cpu);
10552         perf_unpin_context(ctx);
10553         mutex_unlock(&ctx->mutex);
10554
10555         return event;
10556
10557 err_unlock:
10558         mutex_unlock(&ctx->mutex);
10559         perf_unpin_context(ctx);
10560         put_ctx(ctx);
10561 err_free:
10562         free_event(event);
10563 err:
10564         return ERR_PTR(err);
10565 }
10566 EXPORT_SYMBOL_GPL(perf_event_create_kernel_counter);
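
/*
 * A minimal in-kernel usage sketch (illustrative; the callback name, the
 * attr values and the cpu variable are examples, not taken from an actual
 * caller):
 *
 *	static void my_overflow(struct perf_event *event,
 *				struct perf_sample_data *data,
 *				struct pt_regs *regs)
 *	{
 *		...
 *	}
 *
 *	struct perf_event_attr attr = {
 *		.type          = PERF_TYPE_HARDWARE,
 *		.config        = PERF_COUNT_HW_CPU_CYCLES,
 *		.size          = sizeof(attr),
 *		.sample_period = 100000,
 *	};
 *	struct perf_event *event;
 *
 *	event = perf_event_create_kernel_counter(&attr, cpu, NULL,
 *						 my_overflow, NULL);
 *	if (IS_ERR(event))
 *		return PTR_ERR(event);
 */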
10567
10568 void perf_pmu_migrate_context(struct pmu *pmu, int src_cpu, int dst_cpu)
10569 {
10570         struct perf_event_context *src_ctx;
10571         struct perf_event_context *dst_ctx;
10572         struct perf_event *event, *tmp;
10573         LIST_HEAD(events);
10574
10575         src_ctx = &per_cpu_ptr(pmu->pmu_cpu_context, src_cpu)->ctx;
10576         dst_ctx = &per_cpu_ptr(pmu->pmu_cpu_context, dst_cpu)->ctx;
10577
10578         /*
10579          * See perf_event_ctx_lock() for comments on the details
10580          * of swizzling perf_event::ctx.
10581          */
10582         mutex_lock_double(&src_ctx->mutex, &dst_ctx->mutex);
10583         list_for_each_entry_safe(event, tmp, &src_ctx->event_list,
10584                                  event_entry) {
10585                 perf_remove_from_context(event, 0);
10586                 unaccount_event_cpu(event, src_cpu);
10587                 put_ctx(src_ctx);
10588                 list_add(&event->migrate_entry, &events);
10589         }
10590
10591         /*
10592          * Wait for the events to quiesce before re-instating them.
10593          */
10594         synchronize_rcu();
10595
10596         /*
10597          * Re-instate events in 2 passes.
10598          *
10599          * Skip over group leaders and only install siblings on this first
10600          * pass; siblings will not get enabled without a leader, however a
10601          * leader will enable its siblings, even if those are still on the old
10602          * context.
10603          */
10604         list_for_each_entry_safe(event, tmp, &events, migrate_entry) {
10605                 if (event->group_leader == event)
10606                         continue;
10607
10608                 list_del(&event->migrate_entry);
10609                 if (event->state >= PERF_EVENT_STATE_OFF)
10610                         event->state = PERF_EVENT_STATE_INACTIVE;
10611                 account_event_cpu(event, dst_cpu);
10612                 perf_install_in_context(dst_ctx, event, dst_cpu);
10613                 get_ctx(dst_ctx);
10614         }
10615
10616         /*
10617          * Once all the siblings are setup properly, install the group leaders
10618          * to make it go.
10619          */
10620         list_for_each_entry_safe(event, tmp, &events, migrate_entry) {
10621                 list_del(&event->migrate_entry);
10622                 if (event->state >= PERF_EVENT_STATE_OFF)
10623                         event->state = PERF_EVENT_STATE_INACTIVE;
10624                 account_event_cpu(event, dst_cpu);
10625                 perf_install_in_context(dst_ctx, event, dst_cpu);
10626                 get_ctx(dst_ctx);
10627         }
10628         mutex_unlock(&dst_ctx->mutex);
10629         mutex_unlock(&src_ctx->mutex);
10630 }
10631 EXPORT_SYMBOL_GPL(perf_pmu_migrate_context);
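
/*
 * Illustrative use from a PMU driver's CPU-hotplug path (my_driver_pmu is
 * a placeholder): when the CPU that carries a shared/uncore context goes
 * down, its events are migrated to another online CPU:
 *
 *	target = cpumask_any_but(cpu_online_mask, cpu);
 *	if (target < nr_cpu_ids)
 *		perf_pmu_migrate_context(&my_driver_pmu, cpu, target);
 */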
10632
10633 static void sync_child_event(struct perf_event *child_event,
10634                                struct task_struct *child)
10635 {
10636         struct perf_event *parent_event = child_event->parent;
10637         u64 child_val;
10638
10639         if (child_event->attr.inherit_stat)
10640                 perf_event_read_event(child_event, child);
10641
10642         child_val = perf_event_count(child_event);
10643
10644         /*
10645          * Add back the child's count to the parent's count:
10646          */
10647         atomic64_add(child_val, &parent_event->child_count);
10648         atomic64_add(child_event->total_time_enabled,
10649                      &parent_event->child_total_time_enabled);
10650         atomic64_add(child_event->total_time_running,
10651                      &parent_event->child_total_time_running);
10652 }
10653
10654 static void
10655 perf_event_exit_event(struct perf_event *child_event,
10656                       struct perf_event_context *child_ctx,
10657                       struct task_struct *child)
10658 {
10659         struct perf_event *parent_event = child_event->parent;
10660
10661         /*
10662          * Do not destroy the 'original' grouping; because of the context
10663          * switch optimization the original events could've ended up in a
10664          * random child task.
10665          *
10666          * If we were to destroy the original group, all group related
10667          * operations would cease to function properly after this random
10668          * child dies.
10669          *
10670          * Do destroy all inherited groups; we don't care about those
10671          * and being thorough is better.
10672          */
10673         raw_spin_lock_irq(&child_ctx->lock);
10674         WARN_ON_ONCE(child_ctx->is_active);
10675
10676         if (parent_event)
10677                 perf_group_detach(child_event);
10678         list_del_event(child_event, child_ctx);
10679         child_event->state = PERF_EVENT_STATE_EXIT; /* is_event_hup() */
10680         raw_spin_unlock_irq(&child_ctx->lock);
10681
10682         /*
10683          * Parent events are governed by their filedesc, retain them.
10684          */
10685         if (!parent_event) {
10686                 perf_event_wakeup(child_event);
10687                 return;
10688         }
10689         /*
10690          * Child events can be cleaned up.
10691          */
10692
10693         sync_child_event(child_event, child);
10694
10695         /*
10696          * Remove this event from the parent's list
10697          */
10698         WARN_ON_ONCE(parent_event->ctx->parent_ctx);
10699         mutex_lock(&parent_event->child_mutex);
10700         list_del_init(&child_event->child_list);
10701         mutex_unlock(&parent_event->child_mutex);
10702
10703         /*
10704          * Kick perf_poll() for is_event_hup().
10705          */
10706         perf_event_wakeup(parent_event);
10707         free_event(child_event);
10708         put_event(parent_event);
10709 }
10710
10711 static void perf_event_exit_task_context(struct task_struct *child, int ctxn)
10712 {
10713         struct perf_event_context *child_ctx, *clone_ctx = NULL;
10714         struct perf_event *child_event, *next;
10715
10716         WARN_ON_ONCE(child != current);
10717
10718         child_ctx = perf_pin_task_context(child, ctxn);
10719         if (!child_ctx)
10720                 return;
10721
10722         /*
10723          * In order to reduce the amount of trickiness in ctx tear-down, we hold
10724          * ctx::mutex over the entire thing. This serializes against almost
10725          * everything that wants to access the ctx.
10726          *
10727          * The exception is sys_perf_event_open() /
10728          * perf_event_create_kernel_counter() which does find_get_context()
10729          * without ctx::mutex (it cannot because of the move_group double mutex
10730          * lock thing). See the comments in perf_install_in_context().
10731          */
10732         mutex_lock(&child_ctx->mutex);
10733
10734         /*
10735          * In a single ctx::lock section, de-schedule the events and detach the
10736          * context from the task such that we cannot ever get it scheduled back
10737          * in.
10738          */
10739         raw_spin_lock_irq(&child_ctx->lock);
10740         task_ctx_sched_out(__get_cpu_context(child_ctx), child_ctx, EVENT_ALL);
10741
10742         /*
10743          * Now that the context is inactive, destroy the task <-> ctx relation
10744          * and mark the context dead.
10745          */
10746         RCU_INIT_POINTER(child->perf_event_ctxp[ctxn], NULL);
10747         put_ctx(child_ctx); /* cannot be last */
10748         WRITE_ONCE(child_ctx->task, TASK_TOMBSTONE);
10749         put_task_struct(current); /* cannot be last */
10750
10751         clone_ctx = unclone_ctx(child_ctx);
10752         raw_spin_unlock_irq(&child_ctx->lock);
10753
10754         if (clone_ctx)
10755                 put_ctx(clone_ctx);
10756
10757         /*
10758          * Report the task dead after unscheduling the events so that we
10759          * won't get any samples after PERF_RECORD_EXIT. We can however still
10760          * get a few PERF_RECORD_READ events.
10761          */
10762         perf_event_task(child, child_ctx, 0);
10763
10764         list_for_each_entry_safe(child_event, next, &child_ctx->event_list, event_entry)
10765                 perf_event_exit_event(child_event, child_ctx, child);
10766
10767         mutex_unlock(&child_ctx->mutex);
10768
10769         put_ctx(child_ctx);
10770 }
10771
10772 /*
10773  * When a child task exits, feed back event values to parent events.
10774  *
10775  * Can be called with cred_guard_mutex held when called from
10776  * install_exec_creds().
10777  */
10778 void perf_event_exit_task(struct task_struct *child)
10779 {
10780         struct perf_event *event, *tmp;
10781         int ctxn;
10782
10783         mutex_lock(&child->perf_event_mutex);
10784         list_for_each_entry_safe(event, tmp, &child->perf_event_list,
10785                                  owner_entry) {
10786                 list_del_init(&event->owner_entry);
10787
10788                 /*
10789                  * Ensure the list deletion is visible before we clear
10790                  * the owner, closes a race against perf_release() where
10791                  * we need to serialize on the owner->perf_event_mutex.
10792                  */
10793                 smp_store_release(&event->owner, NULL);
10794         }
10795         mutex_unlock(&child->perf_event_mutex);
10796
10797         for_each_task_context_nr(ctxn)
10798                 perf_event_exit_task_context(child, ctxn);
10799
10800         /*
10801          * The perf_event_exit_task_context calls perf_event_task
10802          * with child's task_ctx, which generates EXIT events for
10803          * child contexts and sets child->perf_event_ctxp[] to NULL.
10804          * At this point we need to send EXIT events to cpu contexts.
10805          */
10806         perf_event_task(child, NULL, 0);
10807 }
10808
10809 static void perf_free_event(struct perf_event *event,
10810                             struct perf_event_context *ctx)
10811 {
10812         struct perf_event *parent = event->parent;
10813
10814         if (WARN_ON_ONCE(!parent))
10815                 return;
10816
10817         mutex_lock(&parent->child_mutex);
10818         list_del_init(&event->child_list);
10819         mutex_unlock(&parent->child_mutex);
10820
10821         put_event(parent);
10822
10823         raw_spin_lock_irq(&ctx->lock);
10824         perf_group_detach(event);
10825         list_del_event(event, ctx);
10826         raw_spin_unlock_irq(&ctx->lock);
10827         free_event(event);
10828 }
10829
10830 /*
10831  * Free an unexposed, unused context as created by inheritance by
10832  * perf_event_init_task below, used by fork() in case of fail.
10833  *
10834  * Not all locks are strictly required, but take them anyway to be nice and
10835  * help out with the lockdep assertions.
10836  */
10837 void perf_event_free_task(struct task_struct *task)
10838 {
10839         struct perf_event_context *ctx;
10840         struct perf_event *event, *tmp;
10841         int ctxn;
10842
10843         for_each_task_context_nr(ctxn) {
10844                 ctx = task->perf_event_ctxp[ctxn];
10845                 if (!ctx)
10846                         continue;
10847
10848                 mutex_lock(&ctx->mutex);
10849                 raw_spin_lock_irq(&ctx->lock);
10850                 /*
10851                  * Destroy the task <-> ctx relation and mark the context dead.
10852                  *
10853                  * This is important because even though the task hasn't been
10854                  * exposed yet the context has been (through child_list).
10855                  */
10856                 RCU_INIT_POINTER(task->perf_event_ctxp[ctxn], NULL);
10857                 WRITE_ONCE(ctx->task, TASK_TOMBSTONE);
10858                 put_task_struct(task); /* cannot be last */
10859                 raw_spin_unlock_irq(&ctx->lock);
10860
10861                 list_for_each_entry_safe(event, tmp, &ctx->event_list, event_entry)
10862                         perf_free_event(event, ctx);
10863
10864                 mutex_unlock(&ctx->mutex);
10865                 put_ctx(ctx);
10866         }
10867 }
10868
10869 void perf_event_delayed_put(struct task_struct *task)
10870 {
10871         int ctxn;
10872
10873         for_each_task_context_nr(ctxn)
10874                 WARN_ON_ONCE(task->perf_event_ctxp[ctxn]);
10875 }
10876
10877 struct file *perf_event_get(unsigned int fd)
10878 {
10879         struct file *file;
10880
10881         file = fget_raw(fd);
10882         if (!file)
10883                 return ERR_PTR(-EBADF);
10884
10885         if (file->f_op != &perf_fops) {
10886                 fput(file);
10887                 return ERR_PTR(-EBADF);
10888         }
10889
10890         return file;
10891 }
10892
10893 const struct perf_event_attr *perf_event_attrs(struct perf_event *event)
10894 {
10895         if (!event)
10896                 return ERR_PTR(-EINVAL);
10897
10898         return &event->attr;
10899 }
10900
10901 /*
10902  * Inherit an event from the parent task to the child task.
10903  *
10904  * Returns:
10905  *  - valid pointer on success
10906  *  - NULL for orphaned events
10907  *  - IS_ERR() on error
10908  */
10909 static struct perf_event *
10910 inherit_event(struct perf_event *parent_event,
10911               struct task_struct *parent,
10912               struct perf_event_context *parent_ctx,
10913               struct task_struct *child,
10914               struct perf_event *group_leader,
10915               struct perf_event_context *child_ctx)
10916 {
10917         enum perf_event_active_state parent_state = parent_event->state;
10918         struct perf_event *child_event;
10919         unsigned long flags;
10920
10921         /*
10922          * Instead of creating recursive hierarchies of events,
10923          * we link inherited events back to the original parent,
10924          * which has a filp for sure, which we use as the reference
10925          * count:
10926          */
10927         if (parent_event->parent)
10928                 parent_event = parent_event->parent;
10929
10930         child_event = perf_event_alloc(&parent_event->attr,
10931                                            parent_event->cpu,
10932                                            child,
10933                                            group_leader, parent_event,
10934                                            NULL, NULL, -1);
10935         if (IS_ERR(child_event))
10936                 return child_event;
10937
10938         /*
10939          * is_orphaned_event() and list_add_tail(&parent_event->child_list)
10940          * must be under the same lock in order to serialize against
10941          * perf_event_release_kernel(), such that either we must observe
10942          * is_orphaned_event() or they will observe us on the child_list.
10943          */
10944         mutex_lock(&parent_event->child_mutex);
10945         if (is_orphaned_event(parent_event) ||
10946             !atomic_long_inc_not_zero(&parent_event->refcount)) {
10947                 mutex_unlock(&parent_event->child_mutex);
10948                 free_event(child_event);
10949                 return NULL;
10950         }
10951
10952         get_ctx(child_ctx);
10953
10954         /*
10955          * Make the child state follow the state of the parent event,
10956          * not its attr.disabled bit.  We hold the parent's mutex,
10957          * so we won't race with perf_event_{en, dis}able_family.
10958          */
10959         if (parent_state >= PERF_EVENT_STATE_INACTIVE)
10960                 child_event->state = PERF_EVENT_STATE_INACTIVE;
10961         else
10962                 child_event->state = PERF_EVENT_STATE_OFF;
10963
10964         if (parent_event->attr.freq) {
10965                 u64 sample_period = parent_event->hw.sample_period;
10966                 struct hw_perf_event *hwc = &child_event->hw;
10967
10968                 hwc->sample_period = sample_period;
10969                 hwc->last_period   = sample_period;
10970
10971                 local64_set(&hwc->period_left, sample_period);
10972         }
10973
10974         child_event->ctx = child_ctx;
10975         child_event->overflow_handler = parent_event->overflow_handler;
10976         child_event->overflow_handler_context
10977                 = parent_event->overflow_handler_context;
10978
10979         /*
10980          * Precalculate sample_data sizes
10981          */
10982         perf_event__header_size(child_event);
10983         perf_event__id_header_size(child_event);
10984
10985         /*
10986          * Link it up in the child's context:
10987          */
10988         raw_spin_lock_irqsave(&child_ctx->lock, flags);
10989         add_event_to_ctx(child_event, child_ctx);
10990         raw_spin_unlock_irqrestore(&child_ctx->lock, flags);
10991
10992         /*
10993          * Link this into the parent event's child list
10994          */
10995         list_add_tail(&child_event->child_list, &parent_event->child_list);
10996         mutex_unlock(&parent_event->child_mutex);
10997
10998         return child_event;
10999 }
11000
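/*
 * Illustrative sketch only (not part of this file, intentionally not
 * compiled): the three-way return contract of inherit_event() as seen
 * from a hypothetical caller.  inherit_group() below is the real
 * consumer of this contract.
 */
#if 0
static int example_consume_inherit_event(struct perf_event *parent_event,
					 struct task_struct *parent,
					 struct perf_event_context *parent_ctx,
					 struct task_struct *child,
					 struct perf_event_context *child_ctx)
{
	struct perf_event *child_event;

	child_event = inherit_event(parent_event, parent, parent_ctx,
				    child, NULL, child_ctx);
	if (IS_ERR(child_event))
		return PTR_ERR(child_event);	/* hard failure: abort */
	if (!child_event)
		return 0;			/* orphaned parent: quietly skip */

	/* valid pointer: the child event is already linked into child_ctx */
	return 0;
}
#endif
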
11001 /*
11002  * Inherits an event group.
11003  *
11004  * This will quietly suppress orphaned events; NULL from inherit_event() is not an error.
11005  * This matches with perf_event_release_kernel() removing all child events.
11006  *
11007  * Returns:
11008  *  - 0 on success
11009  *  - <0 on error
11010  */
11011 static int inherit_group(struct perf_event *parent_event,
11012               struct task_struct *parent,
11013               struct perf_event_context *parent_ctx,
11014               struct task_struct *child,
11015               struct perf_event_context *child_ctx)
11016 {
11017         struct perf_event *leader;
11018         struct perf_event *sub;
11019         struct perf_event *child_ctr;
11020
11021         leader = inherit_event(parent_event, parent, parent_ctx,
11022                                  child, NULL, child_ctx);
11023         if (IS_ERR(leader))
11024                 return PTR_ERR(leader);
11025         /*
11026          * @leader can be NULL here because of is_orphaned_event(). In this
11027          * case inherit_event() will create individual events, similar to what
11028          * perf_group_detach() would do anyway.
11029          */
11030         list_for_each_entry(sub, &parent_event->sibling_list, group_entry) {
11031                 child_ctr = inherit_event(sub, parent, parent_ctx,
11032                                             child, leader, child_ctx);
11033                 if (IS_ERR(child_ctr))
11034                         return PTR_ERR(child_ctr);
11035         }
11036         return 0;
11037 }
11038
11039 /*
11040  * Creates the child task context and tries to inherit the event-group.
11041  *
11042  * Clears @inherited_all on !attr.inherit or error. Note that we'll leave
11043  * inherited_all set when we 'fail' to inherit an orphaned event; this is
11044  * consistent with perf_event_release_kernel() removing all child events.
11045  *
11046  * Returns:
11047  *  - 0 on success
11048  *  - <0 on error
11049  */
11050 static int
11051 inherit_task_group(struct perf_event *event, struct task_struct *parent,
11052                    struct perf_event_context *parent_ctx,
11053                    struct task_struct *child, int ctxn,
11054                    int *inherited_all)
11055 {
11056         int ret;
11057         struct perf_event_context *child_ctx;
11058
11059         if (!event->attr.inherit) {
11060                 *inherited_all = 0;
11061                 return 0;
11062         }
11063
11064         child_ctx = child->perf_event_ctxp[ctxn];
11065         if (!child_ctx) {
11066                 /*
11067                  * This is executed from the parent task context, so
11068                  * inherit events that have been marked for cloning.
11069                  * First allocate and initialize a context for the
11070                  * child.
11071                  */
11072                 child_ctx = alloc_perf_context(parent_ctx->pmu, child);
11073                 if (!child_ctx)
11074                         return -ENOMEM;
11075
11076                 child->perf_event_ctxp[ctxn] = child_ctx;
11077         }
11078
11079         ret = inherit_group(event, parent, parent_ctx,
11080                             child, child_ctx);
11081
11082         if (ret)
11083                 *inherited_all = 0;
11084
11085         return ret;
11086 }
11087
11088 /*
11089  * Initialize a single perf_event context (@ctxn) in the child task_struct
11090  */
11091 static int perf_event_init_context(struct task_struct *child, int ctxn)
11092 {
11093         struct perf_event_context *child_ctx, *parent_ctx;
11094         struct perf_event_context *cloned_ctx;
11095         struct perf_event *event;
11096         struct task_struct *parent = current;
11097         int inherited_all = 1;
11098         unsigned long flags;
11099         int ret = 0;
11100
11101         if (likely(!parent->perf_event_ctxp[ctxn]))
11102                 return 0;
11103
11104         /*
11105          * If the parent's context is a clone, pin it so it won't get
11106          * swapped under us.
11107          */
11108         parent_ctx = perf_pin_task_context(parent, ctxn);
11109         if (!parent_ctx)
11110                 return 0;
11111
11112         /*
11113          * No need to check if parent_ctx != NULL here; since we saw
11114          * it non-NULL earlier, the only reason for it to become NULL
11115          * is if we exit, and since we're currently in the middle of
11116          * a fork we can't be exiting at the same time.
11117          */
11118
11119         /*
11120          * Lock the parent list. No need to lock the child - not PID
11121          * hashed yet and not running, so nobody can access it.
11122          */
11123         mutex_lock(&parent_ctx->mutex);
11124
11125         /*
11126          * We don't have to disable NMIs - we are only looking at
11127          * the list, not manipulating it:
11128          */
11129         list_for_each_entry(event, &parent_ctx->pinned_groups, group_entry) {
11130                 ret = inherit_task_group(event, parent, parent_ctx,
11131                                          child, ctxn, &inherited_all);
11132                 if (ret)
11133                         goto out_unlock;
11134         }
11135
11136         /*
11137          * We can't hold ctx->lock when iterating the ->flexible_groups list due
11138          * to allocations, but we need to prevent rotation because
11139          * rotate_ctx() will change the list from interrupt context.
11140          */
11141         raw_spin_lock_irqsave(&parent_ctx->lock, flags);
11142         parent_ctx->rotate_disable = 1;
11143         raw_spin_unlock_irqrestore(&parent_ctx->lock, flags);
11144
11145         list_for_each_entry(event, &parent_ctx->flexible_groups, group_entry) {
11146                 ret = inherit_task_group(event, parent, parent_ctx,
11147                                          child, ctxn, &inherited_all);
11148                 if (ret)
11149                         goto out_unlock;
11150         }
11151
11152         raw_spin_lock_irqsave(&parent_ctx->lock, flags);
11153         parent_ctx->rotate_disable = 0;
11154
11155         child_ctx = child->perf_event_ctxp[ctxn];
11156
11157         if (child_ctx && inherited_all) {
11158                 /*
11159                  * Mark the child context as a clone of the parent
11160                  * context, or of whatever the parent is a clone of.
11161                  *
11162                  * Note that if the parent is a clone, holding
11163                  * parent_ctx->lock prevents it from being uncloned.
11164                  */
11165                 cloned_ctx = parent_ctx->parent_ctx;
11166                 if (cloned_ctx) {
11167                         child_ctx->parent_ctx = cloned_ctx;
11168                         child_ctx->parent_gen = parent_ctx->parent_gen;
11169                 } else {
11170                         child_ctx->parent_ctx = parent_ctx;
11171                         child_ctx->parent_gen = parent_ctx->generation;
11172                 }
11173                 get_ctx(child_ctx->parent_ctx);
11174         }
11175
11176         raw_spin_unlock_irqrestore(&parent_ctx->lock, flags);
11177 out_unlock:
11178         mutex_unlock(&parent_ctx->mutex);
11179
11180         perf_unpin_context(parent_ctx);
11181         put_ctx(parent_ctx);
11182
11183         return ret;
11184 }
11185
11186 /*
11187  * Initialize the perf_event contexts in task_struct (one per context number)
11188  */
11189 int perf_event_init_task(struct task_struct *child)
11190 {
11191         int ctxn, ret;
11192
11193         memset(child->perf_event_ctxp, 0, sizeof(child->perf_event_ctxp));
11194         mutex_init(&child->perf_event_mutex);
11195         INIT_LIST_HEAD(&child->perf_event_list);
11196
11197         for_each_task_context_nr(ctxn) {
11198                 ret = perf_event_init_context(child, ctxn);
11199                 if (ret) {
11200                         perf_event_free_task(child);
11201                         return ret;
11202                 }
11203         }
11204
11205         return 0;
11206 }
11207
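/*
 * Illustrative sketch only (not part of this file, intentionally not
 * compiled): roughly how the fork path consumes perf_event_init_task().
 * The real caller is copy_process() in kernel/fork.c; the function and
 * cleanup label below are hypothetical.
 */
#if 0
static int example_fork_path(struct task_struct *child)
{
	int retval;

	/*
	 * Inherit the parent's counters.  On error perf_event_init_task()
	 * has already freed any partially inherited state via
	 * perf_event_free_task().
	 */
	retval = perf_event_init_task(child);
	if (retval)
		goto bad_fork_cleanup;		/* hypothetical cleanup label */

	/* ... rest of copy_process() ... */
	return 0;

bad_fork_cleanup:
	return retval;
}
#endif
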
11208 static void __init perf_event_init_all_cpus(void)
11209 {
11210         struct swevent_htable *swhash;
11211         int cpu;
11212
11213         zalloc_cpumask_var(&perf_online_mask, GFP_KERNEL);
11214
11215         for_each_possible_cpu(cpu) {
11216                 swhash = &per_cpu(swevent_htable, cpu);
11217                 mutex_init(&swhash->hlist_mutex);
11218                 INIT_LIST_HEAD(&per_cpu(active_ctx_list, cpu));
11219
11220                 INIT_LIST_HEAD(&per_cpu(pmu_sb_events.list, cpu));
11221                 raw_spin_lock_init(&per_cpu(pmu_sb_events.lock, cpu));
11222
11223 #ifdef CONFIG_CGROUP_PERF
11224                 INIT_LIST_HEAD(&per_cpu(cgrp_cpuctx_list, cpu));
11225 #endif
11226                 INIT_LIST_HEAD(&per_cpu(sched_cb_list, cpu));
11227         }
11228 }
11229
11230 void perf_swevent_init_cpu(unsigned int cpu)
11231 {
11232         struct swevent_htable *swhash = &per_cpu(swevent_htable, cpu);
11233
11234         mutex_lock(&swhash->hlist_mutex);
11235         if (swhash->hlist_refcount > 0 && !swevent_hlist_deref(swhash)) {
11236                 struct swevent_hlist *hlist;
11237
11238                 hlist = kzalloc_node(sizeof(*hlist), GFP_KERNEL, cpu_to_node(cpu));
11239                 WARN_ON(!hlist);
11240                 rcu_assign_pointer(swhash->swevent_hlist, hlist);
11241         }
11242         mutex_unlock(&swhash->hlist_mutex);
11243 }
11244
11245 #if defined CONFIG_HOTPLUG_CPU || defined CONFIG_KEXEC_CORE
11246 static void __perf_event_exit_context(void *__info)
11247 {
11248         struct perf_event_context *ctx = __info;
11249         struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
11250         struct perf_event *event;
11251
11252         raw_spin_lock(&ctx->lock);
11253         list_for_each_entry(event, &ctx->event_list, event_entry)
11254                 __perf_remove_from_context(event, cpuctx, ctx, (void *)DETACH_GROUP);
11255         raw_spin_unlock(&ctx->lock);
11256 }
11257
11258 static void perf_event_exit_cpu_context(int cpu)
11259 {
11260         struct perf_cpu_context *cpuctx;
11261         struct perf_event_context *ctx;
11262         struct pmu *pmu;
11263
11264         mutex_lock(&pmus_lock);
11265         list_for_each_entry(pmu, &pmus, entry) {
11266                 cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu);
11267                 ctx = &cpuctx->ctx;
11268
11269                 mutex_lock(&ctx->mutex);
11270                 smp_call_function_single(cpu, __perf_event_exit_context, ctx, 1);
11271                 cpuctx->online = 0;
11272                 mutex_unlock(&ctx->mutex);
11273         }
11274         cpumask_clear_cpu(cpu, perf_online_mask);
11275         mutex_unlock(&pmus_lock);
11276 }
11277 #else
11278
11279 static void perf_event_exit_cpu_context(int cpu) { }
11280
11281 #endif
11282
11283 int perf_event_init_cpu(unsigned int cpu)
11284 {
11285         struct perf_cpu_context *cpuctx;
11286         struct perf_event_context *ctx;
11287         struct pmu *pmu;
11288
11289         perf_swevent_init_cpu(cpu);
11290
11291         mutex_lock(&pmus_lock);
11292         cpumask_set_cpu(cpu, perf_online_mask);
11293         list_for_each_entry(pmu, &pmus, entry) {
11294                 cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu);
11295                 ctx = &cpuctx->ctx;
11296
11297                 mutex_lock(&ctx->mutex);
11298                 cpuctx->online = 1;
11299                 mutex_unlock(&ctx->mutex);
11300         }
11301         mutex_unlock(&pmus_lock);
11302
11303         return 0;
11304 }
11305
11306 int perf_event_exit_cpu(unsigned int cpu)
11307 {
11308         perf_event_exit_cpu_context(cpu);
11309         return 0;
11310 }
11311
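/*
 * Illustrative sketch only (not part of this file, intentionally not
 * compiled): perf_event_init_cpu()/perf_event_exit_cpu() have the
 * (unsigned int cpu) signature expected by the CPU hotplug state
 * machine.  Perf itself is wired up through the static hotplug state
 * table; the dynamic cpuhp_setup_state() call and name string below
 * only illustrate the callback pattern.
 */
#if 0
static int __init example_register_hotplug_callbacks(void)
{
	int ret;

	ret = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "example/perf:online",
				perf_event_init_cpu, perf_event_exit_cpu);
	/* CPUHP_AP_ONLINE_DYN returns the allocated state (> 0) on success */
	return ret < 0 ? ret : 0;
}
#endif
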
11312 static int
11313 perf_reboot(struct notifier_block *notifier, unsigned long val, void *v)
11314 {
11315         int cpu;
11316
11317         for_each_online_cpu(cpu)
11318                 perf_event_exit_cpu(cpu);
11319
11320         return NOTIFY_OK;
11321 }
11322
11323 /*
11324  * Run the perf reboot notifier at the very last possible moment so that
11325  * the generic watchdog code runs as long as possible.
11326  */
11327 static struct notifier_block perf_reboot_notifier = {
11328         .notifier_call = perf_reboot,
11329         .priority = INT_MIN,
11330 };
11331
11332 void __init perf_event_init(void)
11333 {
11334         int ret;
11335
11336         idr_init(&pmu_idr);
11337
11338         perf_event_init_all_cpus();
11339         init_srcu_struct(&pmus_srcu);
11340         perf_pmu_register(&perf_swevent, "software", PERF_TYPE_SOFTWARE);
11341         perf_pmu_register(&perf_cpu_clock, NULL, -1);
11342         perf_pmu_register(&perf_task_clock, NULL, -1);
11343         perf_tp_register();
11344         perf_event_init_cpu(smp_processor_id());
11345         register_reboot_notifier(&perf_reboot_notifier);
11346
11347         ret = init_hw_breakpoint();
11348         WARN(ret, "hw_breakpoint initialization failed with: %d", ret);
11349
11350         /*
11351          * Build time assertion that we keep the data_head at the intended
11352          * location.  IOW, validating that we got the __reserved[] size right.
11353          */
11354         BUILD_BUG_ON((offsetof(struct perf_event_mmap_page, data_head))
11355                      != 1024);
11356 }
11357
11358 ssize_t perf_event_sysfs_show(struct device *dev, struct device_attribute *attr,
11359                               char *page)
11360 {
11361         struct perf_pmu_events_attr *pmu_attr =
11362                 container_of(attr, struct perf_pmu_events_attr, attr);
11363
11364         if (pmu_attr->event_str)
11365                 return sprintf(page, "%s\n", pmu_attr->event_str);
11366
11367         return 0;
11368 }
11369 EXPORT_SYMBOL_GPL(perf_event_sysfs_show);
11370
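/*
 * Illustrative sketch only (not part of this file, intentionally not
 * compiled): how a PMU driver typically exposes a named event string
 * through perf_event_sysfs_show().  The "cycles" name, the 0x3c value
 * and the example_* identifiers are hypothetical; PMU_EVENT_ATTR_STRING()
 * is the <linux/perf_event.h> helper that installs perf_event_sysfs_show
 * as the ->show() callback and fills in ->event_str.
 */
#if 0
PMU_EVENT_ATTR_STRING(cycles, example_attr_cycles, "event=0x3c");

static struct attribute *example_events_attrs[] = {
	&example_attr_cycles.attr.attr,
	NULL,
};

static const struct attribute_group example_events_group = {
	.name  = "events",
	.attrs = example_events_attrs,
};

/*
 * Hooked into pmu->attr_groups, this makes
 * /sys/bus/event_source/devices/<pmu>/events/cycles read back
 * "event=0x3c\n", courtesy of perf_event_sysfs_show() above.
 */
#endif
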
11371 static int __init perf_event_sysfs_init(void)
11372 {
11373         struct pmu *pmu;
11374         int ret;
11375
11376         mutex_lock(&pmus_lock);
11377
11378         ret = bus_register(&pmu_bus);
11379         if (ret)
11380                 goto unlock;
11381
11382         list_for_each_entry(pmu, &pmus, entry) {
11383                 if (!pmu->name || pmu->type < 0)
11384                         continue;
11385
11386                 ret = pmu_dev_alloc(pmu);
11387                 WARN(ret, "Failed to register pmu: %s, reason %d\n", pmu->name, ret);
11388         }
11389         pmu_bus_running = 1;
11390         ret = 0;
11391
11392 unlock:
11393         mutex_unlock(&pmus_lock);
11394
11395         return ret;
11396 }
11397 device_initcall(perf_event_sysfs_init);
11398
11399 #ifdef CONFIG_CGROUP_PERF
11400 static struct cgroup_subsys_state *
11401 perf_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)
11402 {
11403         struct perf_cgroup *jc;
11404
11405         jc = kzalloc(sizeof(*jc), GFP_KERNEL);
11406         if (!jc)
11407                 return ERR_PTR(-ENOMEM);
11408
11409         jc->info = alloc_percpu(struct perf_cgroup_info);
11410         if (!jc->info) {
11411                 kfree(jc);
11412                 return ERR_PTR(-ENOMEM);
11413         }
11414
11415         return &jc->css;
11416 }
11417
11418 static void perf_cgroup_css_free(struct cgroup_subsys_state *css)
11419 {
11420         struct perf_cgroup *jc = container_of(css, struct perf_cgroup, css);
11421
11422         free_percpu(jc->info);
11423         kfree(jc);
11424 }
11425
11426 static int __perf_cgroup_move(void *info)
11427 {
11428         struct task_struct *task = info;
11429         rcu_read_lock();
11430         perf_cgroup_switch(task, PERF_CGROUP_SWOUT | PERF_CGROUP_SWIN);
11431         rcu_read_unlock();
11432         return 0;
11433 }
11434
11435 static void perf_cgroup_attach(struct cgroup_taskset *tset)
11436 {
11437         struct task_struct *task;
11438         struct cgroup_subsys_state *css;
11439
11440         cgroup_taskset_for_each(task, css, tset)
11441                 task_function_call(task, __perf_cgroup_move, task);
11442 }
11443
11444 struct cgroup_subsys perf_event_cgrp_subsys = {
11445         .css_alloc      = perf_cgroup_css_alloc,
11446         .css_free       = perf_cgroup_css_free,
11447         .attach         = perf_cgroup_attach,
11448         /*
11449          * Implicitly enable on dfl hierarchy so that perf events can
11450          * always be filtered by cgroup2 path as long as perf_event
11451          * controller is not mounted on a legacy hierarchy.
11452          */
11453         .implicit_on_dfl = true,
11454         .threaded       = true,
11455 };
11456 #endif /* CONFIG_CGROUP_PERF */