1 // SPDX-License-Identifier: GPL-2.0
2 /*
3  * builtin-record.c
4  *
5  * Builtin record command: Record the profile of a workload
6  * (or a CPU, or a PID) into the perf.data output file - for
7  * later analysis via perf report.
8  */
9 #include "builtin.h"
10
11 #include "util/build-id.h"
12 #include <subcmd/parse-options.h>
13 #include <internal/xyarray.h>
14 #include "util/parse-events.h"
15 #include "util/config.h"
16
17 #include "util/callchain.h"
18 #include "util/cgroup.h"
19 #include "util/header.h"
20 #include "util/event.h"
21 #include "util/evlist.h"
22 #include "util/evsel.h"
23 #include "util/debug.h"
24 #include "util/mmap.h"
25 #include "util/mutex.h"
26 #include "util/target.h"
27 #include "util/session.h"
28 #include "util/tool.h"
29 #include "util/symbol.h"
30 #include "util/record.h"
31 #include "util/cpumap.h"
32 #include "util/thread_map.h"
33 #include "util/data.h"
34 #include "util/perf_regs.h"
35 #include "util/auxtrace.h"
36 #include "util/tsc.h"
37 #include "util/parse-branch-options.h"
38 #include "util/parse-regs-options.h"
39 #include "util/perf_api_probe.h"
40 #include "util/trigger.h"
41 #include "util/perf-hooks.h"
42 #include "util/cpu-set-sched.h"
43 #include "util/synthetic-events.h"
44 #include "util/time-utils.h"
45 #include "util/units.h"
46 #include "util/bpf-event.h"
47 #include "util/util.h"
48 #include "util/pfm.h"
49 #include "util/pmu.h"
50 #include "util/pmus.h"
51 #include "util/clockid.h"
52 #include "util/off_cpu.h"
53 #include "util/bpf-filter.h"
54 #include "asm/bug.h"
55 #include "perf.h"
56 #include "cputopo.h"
57
58 #include <errno.h>
59 #include <inttypes.h>
60 #include <locale.h>
61 #include <poll.h>
62 #include <pthread.h>
63 #include <unistd.h>
64 #ifndef HAVE_GETTID
65 #include <syscall.h>
66 #endif
67 #include <sched.h>
68 #include <signal.h>
69 #ifdef HAVE_EVENTFD_SUPPORT
70 #include <sys/eventfd.h>
71 #endif
72 #include <sys/mman.h>
73 #include <sys/wait.h>
74 #include <sys/types.h>
75 #include <sys/stat.h>
76 #include <fcntl.h>
77 #include <linux/err.h>
78 #include <linux/string.h>
79 #include <linux/time64.h>
80 #include <linux/zalloc.h>
81 #include <linux/bitmap.h>
82 #include <sys/time.h>
83
84 struct switch_output {
85         bool             enabled;
86         bool             signal;
87         unsigned long    size;
88         unsigned long    time;
89         const char      *str;
90         bool             set;
91         char             **filenames;
92         int              num_files;
93         int              cur_file;
94 };
95
96 struct thread_mask {
97         struct mmap_cpu_mask    maps;
98         struct mmap_cpu_mask    affinity;
99 };
100
101 struct record_thread {
102         pid_t                   tid;
103         struct thread_mask      *mask;
104         struct {
105                 int             msg[2];
106                 int             ack[2];
107         } pipes;
108         struct fdarray          pollfd;
109         int                     ctlfd_pos;
110         int                     nr_mmaps;
111         struct mmap             **maps;
112         struct mmap             **overwrite_maps;
113         struct record           *rec;
114         unsigned long long      samples;
115         unsigned long           waking;
116         u64                     bytes_written;
117         u64                     bytes_transferred;
118         u64                     bytes_compressed;
119 };
120
121 static __thread struct record_thread *thread;
122
123 enum thread_msg {
124         THREAD_MSG__UNDEFINED = 0,
125         THREAD_MSG__READY,
126         THREAD_MSG__MAX,
127 };
128
129 static const char *thread_msg_tags[THREAD_MSG__MAX] = {
130         "UNDEFINED", "READY"
131 };
132
133 enum thread_spec {
134         THREAD_SPEC__UNDEFINED = 0,
135         THREAD_SPEC__CPU,
136         THREAD_SPEC__CORE,
137         THREAD_SPEC__PACKAGE,
138         THREAD_SPEC__NUMA,
139         THREAD_SPEC__USER,
140         THREAD_SPEC__MAX,
141 };
142
143 static const char *thread_spec_tags[THREAD_SPEC__MAX] = {
144         "undefined", "cpu", "core", "package", "numa", "user"
145 };
146
147 struct pollfd_index_map {
148         int evlist_pollfd_index;
149         int thread_pollfd_index;
150 };
151
152 struct record {
153         struct perf_tool        tool;
154         struct record_opts      opts;
155         u64                     bytes_written;
156         u64                     thread_bytes_written;
157         struct perf_data        data;
158         struct auxtrace_record  *itr;
159         struct evlist   *evlist;
160         struct perf_session     *session;
161         struct evlist           *sb_evlist;
162         pthread_t               thread_id;
163         int                     realtime_prio;
164         bool                    switch_output_event_set;
165         bool                    no_buildid;
166         bool                    no_buildid_set;
167         bool                    no_buildid_cache;
168         bool                    no_buildid_cache_set;
169         bool                    buildid_all;
170         bool                    buildid_mmap;
171         bool                    timestamp_filename;
172         bool                    timestamp_boundary;
173         bool                    off_cpu;
174         struct switch_output    switch_output;
175         unsigned long long      samples;
176         unsigned long           output_max_size;        /* = 0: unlimited */
177         struct perf_debuginfod  debuginfod;
178         int                     nr_threads;
179         struct thread_mask      *thread_masks;
180         struct record_thread    *thread_data;
181         struct pollfd_index_map *index_map;
182         size_t                  index_map_sz;
183         size_t                  index_map_cnt;
184 };
185
186 static volatile int done;
187
188 static volatile int auxtrace_record__snapshot_started;
189 static DEFINE_TRIGGER(auxtrace_snapshot_trigger);
190 static DEFINE_TRIGGER(switch_output_trigger);
191
192 static const char *affinity_tags[PERF_AFFINITY_MAX] = {
193         "SYS", "NODE", "CPU"
194 };
195
196 #ifndef HAVE_GETTID
197 static inline pid_t gettid(void)
198 {
199         return (pid_t)syscall(__NR_gettid);
200 }
201 #endif
202
203 static int record__threads_enabled(struct record *rec)
204 {
205         return rec->opts.threads_spec;
206 }
207
208 static bool switch_output_signal(struct record *rec)
209 {
210         return rec->switch_output.signal &&
211                trigger_is_ready(&switch_output_trigger);
212 }
213
214 static bool switch_output_size(struct record *rec)
215 {
216         return rec->switch_output.size &&
217                trigger_is_ready(&switch_output_trigger) &&
218                (rec->bytes_written >= rec->switch_output.size);
219 }
220
221 static bool switch_output_time(struct record *rec)
222 {
223         return rec->switch_output.time &&
224                trigger_is_ready(&switch_output_trigger);
225 }
226
227 static u64 record__bytes_written(struct record *rec)
228 {
229         return rec->bytes_written + rec->thread_bytes_written;
230 }
231
232 static bool record__output_max_size_exceeded(struct record *rec)
233 {
234         return rec->output_max_size &&
235                (record__bytes_written(rec) >= rec->output_max_size);
236 }
237
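/*
 * Write a block of data either to the per-mmap file (when parallel threads
 * write their own files) or to the main perf.data file.  Byte counters are
 * updated so that the output size limit and the --switch-output size
 * threshold can be checked.
 */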
238 static int record__write(struct record *rec, struct mmap *map __maybe_unused,
239                          void *bf, size_t size)
240 {
241         struct perf_data_file *file = &rec->session->data->file;
242
243         if (map && map->file)
244                 file = map->file;
245
246         if (perf_data_file__write(file, bf, size) < 0) {
247                 pr_err("failed to write perf data, error: %m\n");
248                 return -1;
249         }
250
251         if (map && map->file) {
252                 thread->bytes_written += size;
253                 rec->thread_bytes_written += size;
254         } else {
255                 rec->bytes_written += size;
256         }
257
258         if (record__output_max_size_exceeded(rec) && !done) {
259                 fprintf(stderr, "[ perf record: perf size limit reached (%" PRIu64 " KB),"
260                                 " stopping session ]\n",
261                                 record__bytes_written(rec) >> 10);
262                 done = 1;
263         }
264
265         if (switch_output_size(rec))
266                 trigger_hit(&switch_output_trigger);
267
268         return 0;
269 }
270
271 static int record__aio_enabled(struct record *rec);
272 static int record__comp_enabled(struct record *rec);
273 static ssize_t zstd_compress(struct perf_session *session, struct mmap *map,
274                             void *dst, size_t dst_size, void *src, size_t src_size);
275
276 #ifdef HAVE_AIO_SUPPORT
277 static int record__aio_write(struct aiocb *cblock, int trace_fd,
278                 void *buf, size_t size, off_t off)
279 {
280         int rc;
281
282         cblock->aio_fildes = trace_fd;
283         cblock->aio_buf    = buf;
284         cblock->aio_nbytes = size;
285         cblock->aio_offset = off;
286         cblock->aio_sigevent.sigev_notify = SIGEV_NONE;
287
288         do {
289                 rc = aio_write(cblock);
290                 if (rc == 0) {
291                         break;
292                 } else if (errno != EAGAIN) {
293                         cblock->aio_fildes = -1;
294                         pr_err("failed to queue perf data, error: %m\n");
295                         break;
296                 }
297         } while (1);
298
299         return rc;
300 }
301
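/*
 * Check whether a previously started aio write has completed.  If only part
 * of the chunk was written, requeue the remainder.  Returns 1 when the
 * control block is free for reuse, 0 if the write is still in flight.
 */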
302 static int record__aio_complete(struct mmap *md, struct aiocb *cblock)
303 {
304         void *rem_buf;
305         off_t rem_off;
306         size_t rem_size;
307         int rc, aio_errno;
308         ssize_t aio_ret, written;
309
310         aio_errno = aio_error(cblock);
311         if (aio_errno == EINPROGRESS)
312                 return 0;
313
314         written = aio_ret = aio_return(cblock);
315         if (aio_ret < 0) {
316                 if (aio_errno != EINTR)
317                         pr_err("failed to write perf data, error: %m\n");
318                 written = 0;
319         }
320
321         rem_size = cblock->aio_nbytes - written;
322
323         if (rem_size == 0) {
324                 cblock->aio_fildes = -1;
325                 /*
326                  * md->refcount is incremented in record__aio_pushfn() for
327                  * every aio write request started in record__aio_push(), so
328                  * decrement it because the request is now complete.
329                  */
330                 perf_mmap__put(&md->core);
331                 rc = 1;
332         } else {
333                 /*
334                  * The aio write request may require a restart with the
335                  * remainder if the kernel didn't write the whole
336                  * chunk at once.
337                  */
338                 rem_off = cblock->aio_offset + written;
339                 rem_buf = (void *)(cblock->aio_buf + written);
340                 record__aio_write(cblock, cblock->aio_fildes,
341                                 rem_buf, rem_size, rem_off);
342                 rc = 0;
343         }
344
345         return rc;
346 }
347
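/*
 * Reap completed aio control blocks.  With sync_all == false, return the
 * index of a free control block so it can be reused for the next write;
 * with sync_all == true, wait until all outstanding writes have completed
 * and then return -1.
 */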
348 static int record__aio_sync(struct mmap *md, bool sync_all)
349 {
350         struct aiocb **aiocb = md->aio.aiocb;
351         struct aiocb *cblocks = md->aio.cblocks;
352         struct timespec timeout = { 0, 1000 * 1000  * 1 }; /* 1ms */
353         int i, do_suspend;
354
355         do {
356                 do_suspend = 0;
357                 for (i = 0; i < md->aio.nr_cblocks; ++i) {
358                         if (cblocks[i].aio_fildes == -1 || record__aio_complete(md, &cblocks[i])) {
359                                 if (sync_all)
360                                         aiocb[i] = NULL;
361                                 else
362                                         return i;
363                         } else {
364                                  * The started aio write is not complete yet,
365                                  * so it has to be waited for before the
366                                  * next allocation.
367                                  * next allocation.
368                                  */
369                                 aiocb[i] = &cblocks[i];
370                                 do_suspend = 1;
371                         }
372                 }
373                 if (!do_suspend)
374                         return -1;
375
376                 while (aio_suspend((const struct aiocb **)aiocb, md->aio.nr_cblocks, &timeout)) {
377                         if (!(errno == EAGAIN || errno == EINTR))
378                                 pr_err("failed to sync perf data, error: %m\n");
379                 }
380         } while (1);
381 }
382
383 struct record_aio {
384         struct record   *rec;
385         void            *data;
386         size_t          size;
387 };
388
389 static int record__aio_pushfn(struct mmap *map, void *to, void *buf, size_t size)
390 {
391         struct record_aio *aio = to;
392
393         /*
394          * The map->core.base data pointed to by buf is copied into a free
395          * map->aio.data[] buffer to release space in the kernel buffer as fast
396          * as possible, calling perf_mmap__consume() from the perf_mmap__push() function.
397          *
398          * That lets the kernel proceed with storing more profiling data into
399          * the kernel buffer earlier than other per-cpu kernel buffers are handled.
400          *
401          * Copying may be done in two steps in case the chunk of profiling data
402          * crosses the upper bound of the kernel buffer. In this case we first move
403          * part of the data from map->start till the upper bound and then the
404          * remainder from the beginning of the kernel buffer till the end of the data chunk.
405          */
406
407         if (record__comp_enabled(aio->rec)) {
408                 ssize_t compressed = zstd_compress(aio->rec->session, NULL, aio->data + aio->size,
409                                                    mmap__mmap_len(map) - aio->size,
410                                                    buf, size);
411                 if (compressed < 0)
412                         return (int)compressed;
413
414                 size = compressed;
415         } else {
416                 memcpy(aio->data + aio->size, buf, size);
417         }
418
419         if (!aio->size) {
420                 /*
421                  * Increment map->refcount to guard the map->aio.data[] buffer
422                  * from premature deallocation, because the map object can be
423                  * released before the aio write request started on the
424                  * map->aio.data[] buffer completes.
425                  *
426                  * perf_mmap__put() is done in record__aio_complete() after
427                  * the started aio request completes, or in record__aio_push()
428                  * if the request failed to start.
429                  */
430                 perf_mmap__get(&map->core);
431         }
432
433         aio->size += size;
434
435         return size;
436 }
437
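/*
 * Push the data from a kernel mmap buffer into a free aio buffer (optionally
 * compressing it on the way) and queue an asynchronous write of that buffer
 * at offset *off in the output file.
 */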
438 static int record__aio_push(struct record *rec, struct mmap *map, off_t *off)
439 {
440         int ret, idx;
441         int trace_fd = rec->session->data->file.fd;
442         struct record_aio aio = { .rec = rec, .size = 0 };
443
444         /*
445          * Call record__aio_sync() to wait till map->aio.data[] buffer
446          * becomes available after the previous aio write operation.
447          */
448
449         idx = record__aio_sync(map, false);
450         aio.data = map->aio.data[idx];
451         ret = perf_mmap__push(map, &aio, record__aio_pushfn);
452         if (ret != 0) /* ret > 0 - no data, ret < 0 - error */
453                 return ret;
454
455         rec->samples++;
456         ret = record__aio_write(&(map->aio.cblocks[idx]), trace_fd, aio.data, aio.size, *off);
457         if (!ret) {
458                 *off += aio.size;
459                 rec->bytes_written += aio.size;
460                 if (switch_output_size(rec))
461                         trigger_hit(&switch_output_trigger);
462         } else {
463                 /*
464                  * Decrement the map->refcount incremented in record__aio_pushfn()
465                  * if the record__aio_write() operation failed to start; otherwise
466                  * map->refcount is decremented in record__aio_complete() after
467                  * the aio write operation finishes successfully.
468                  */
469                 perf_mmap__put(&map->core);
470         }
471
472         return ret;
473 }
474
475 static off_t record__aio_get_pos(int trace_fd)
476 {
477         return lseek(trace_fd, 0, SEEK_CUR);
478 }
479
480 static void record__aio_set_pos(int trace_fd, off_t pos)
481 {
482         lseek(trace_fd, pos, SEEK_SET);
483 }
484
485 static void record__aio_mmap_read_sync(struct record *rec)
486 {
487         int i;
488         struct evlist *evlist = rec->evlist;
489         struct mmap *maps = evlist->mmap;
490
491         if (!record__aio_enabled(rec))
492                 return;
493
494         for (i = 0; i < evlist->core.nr_mmaps; i++) {
495                 struct mmap *map = &maps[i];
496
497                 if (map->core.base)
498                         record__aio_sync(map, true);
499         }
500 }
501
502 static int nr_cblocks_default = 1;
503 static int nr_cblocks_max = 4;
504
505 static int record__aio_parse(const struct option *opt,
506                              const char *str,
507                              int unset)
508 {
509         struct record_opts *opts = (struct record_opts *)opt->value;
510
511         if (unset) {
512                 opts->nr_cblocks = 0;
513         } else {
514                 if (str)
515                         opts->nr_cblocks = strtol(str, NULL, 0);
516                 if (!opts->nr_cblocks)
517                         opts->nr_cblocks = nr_cblocks_default;
518         }
519
520         return 0;
521 }
522 #else /* HAVE_AIO_SUPPORT */
523 static int nr_cblocks_max = 0;
524
525 static int record__aio_push(struct record *rec __maybe_unused, struct mmap *map __maybe_unused,
526                             off_t *off __maybe_unused)
527 {
528         return -1;
529 }
530
531 static off_t record__aio_get_pos(int trace_fd __maybe_unused)
532 {
533         return -1;
534 }
535
536 static void record__aio_set_pos(int trace_fd __maybe_unused, off_t pos __maybe_unused)
537 {
538 }
539
540 static void record__aio_mmap_read_sync(struct record *rec __maybe_unused)
541 {
542 }
543 #endif
544
545 static int record__aio_enabled(struct record *rec)
546 {
547         return rec->opts.nr_cblocks > 0;
548 }
549
550 #define MMAP_FLUSH_DEFAULT 1
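/*
 * Parse the --mmap-flush option: accept either a plain number of bytes or a
 * value with a B/K/M/G suffix, and clamp the result to a quarter of the mmap
 * buffer size.
 */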
551 static int record__mmap_flush_parse(const struct option *opt,
552                                     const char *str,
553                                     int unset)
554 {
555         int flush_max;
556         struct record_opts *opts = (struct record_opts *)opt->value;
557         static struct parse_tag tags[] = {
558                         { .tag  = 'B', .mult = 1       },
559                         { .tag  = 'K', .mult = 1 << 10 },
560                         { .tag  = 'M', .mult = 1 << 20 },
561                         { .tag  = 'G', .mult = 1 << 30 },
562                         { .tag  = 0 },
563         };
564
565         if (unset)
566                 return 0;
567
568         if (str) {
569                 opts->mmap_flush = parse_tag_value(str, tags);
570                 if (opts->mmap_flush == (int)-1)
571                         opts->mmap_flush = strtol(str, NULL, 0);
572         }
573
574         if (!opts->mmap_flush)
575                 opts->mmap_flush = MMAP_FLUSH_DEFAULT;
576
577         flush_max = evlist__mmap_size(opts->mmap_pages);
578         flush_max /= 4;
579         if (opts->mmap_flush > flush_max)
580                 opts->mmap_flush = flush_max;
581
582         return 0;
583 }
584
585 #ifdef HAVE_ZSTD_SUPPORT
586 static unsigned int comp_level_default = 1;
587
588 static int record__parse_comp_level(const struct option *opt, const char *str, int unset)
589 {
590         struct record_opts *opts = opt->value;
591
592         if (unset) {
593                 opts->comp_level = 0;
594         } else {
595                 if (str)
596                         opts->comp_level = strtol(str, NULL, 0);
597                 if (!opts->comp_level)
598                         opts->comp_level = comp_level_default;
599         }
600
601         return 0;
602 }
603 #endif
604 static unsigned int comp_level_max = 22;
605
606 static int record__comp_enabled(struct record *rec)
607 {
608         return rec->opts.comp_level > 0;
609 }
610
611 static int process_synthesized_event(struct perf_tool *tool,
612                                      union perf_event *event,
613                                      struct perf_sample *sample __maybe_unused,
614                                      struct machine *machine __maybe_unused)
615 {
616         struct record *rec = container_of(tool, struct record, tool);
617         return record__write(rec, NULL, event, event->header.size);
618 }
619
620 static struct mutex synth_lock;
621
622 static int process_locked_synthesized_event(struct perf_tool *tool,
623                                      union perf_event *event,
624                                      struct perf_sample *sample __maybe_unused,
625                                      struct machine *machine __maybe_unused)
626 {
627         int ret;
628
629         mutex_lock(&synth_lock);
630         ret = process_synthesized_event(tool, event, sample, machine);
631         mutex_unlock(&synth_lock);
632         return ret;
633 }
634
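/*
 * perf_mmap__push() callback for the serial (non-aio) path: optionally
 * compress the chunk into the map's scratch buffer and then write it out.
 */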
635 static int record__pushfn(struct mmap *map, void *to, void *bf, size_t size)
636 {
637         struct record *rec = to;
638
639         if (record__comp_enabled(rec)) {
640                 ssize_t compressed = zstd_compress(rec->session, map, map->data,
641                                                    mmap__mmap_len(map), bf, size);
642
643                 if (compressed < 0)
644                         return (int)compressed;
645
646                 size = compressed;
647                 bf   = map->data;
648         }
649
650         thread->samples++;
651         return record__write(rec, map, bf, size);
652 }
653
654 static volatile sig_atomic_t signr = -1;
655 static volatile sig_atomic_t child_finished;
656 #ifdef HAVE_EVENTFD_SUPPORT
657 static volatile sig_atomic_t done_fd = -1;
658 #endif
659
660 static void sig_handler(int sig)
661 {
662         if (sig == SIGCHLD)
663                 child_finished = 1;
664         else
665                 signr = sig;
666
667         done = 1;
668 #ifdef HAVE_EVENTFD_SUPPORT
669         if (done_fd >= 0) {
670                 u64 tmp = 1;
671                 int orig_errno = errno;
672
673                 /*
674                  * It is possible for this signal handler to run after done is
675                  * checked in the main loop, but before the perf counter fds are
676                  * polled. If this happens, the poll() will continue to wait
677                  * even though done is set, and will only break out if either
678                  * another signal is received, or the counters are ready for
679                  * read. To ensure the poll() doesn't sleep when done is set,
680                  * use an eventfd (done_fd) to wake up the poll().
681                  */
682                 if (write(done_fd, &tmp, sizeof(tmp)) < 0)
683                         pr_err("failed to signal wakeup fd, error: %m\n");
684
685                 errno = orig_errno;
686         }
687 #endif // HAVE_EVENTFD_SUPPORT
688 }
689
690 static void sigsegv_handler(int sig)
691 {
692         perf_hooks__recover();
693         sighandler_dump_stack(sig);
694 }
695
696 static void record__sig_exit(void)
697 {
698         if (signr == -1)
699                 return;
700
701         signal(signr, SIG_DFL);
702         raise(signr);
703 }
704
705 #ifdef HAVE_AUXTRACE_SUPPORT
706
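/*
 * Write an AUX area tracing event followed by its data, which may come in two
 * pieces when it wraps around the ring buffer, padding the total to an 8-byte
 * boundary.  For single-file output the event's file offset is also recorded
 * in the auxtrace index.
 */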
707 static int record__process_auxtrace(struct perf_tool *tool,
708                                     struct mmap *map,
709                                     union perf_event *event, void *data1,
710                                     size_t len1, void *data2, size_t len2)
711 {
712         struct record *rec = container_of(tool, struct record, tool);
713         struct perf_data *data = &rec->data;
714         size_t padding;
715         u8 pad[8] = {0};
716
717         if (!perf_data__is_pipe(data) && perf_data__is_single_file(data)) {
718                 off_t file_offset;
719                 int fd = perf_data__fd(data);
720                 int err;
721
722                 file_offset = lseek(fd, 0, SEEK_CUR);
723                 if (file_offset == -1)
724                         return -1;
725                 err = auxtrace_index__auxtrace_event(&rec->session->auxtrace_index,
726                                                      event, file_offset);
727                 if (err)
728                         return err;
729         }
730
731         /* event.auxtrace.size includes padding, see __auxtrace_mmap__read() */
732         padding = (len1 + len2) & 7;
733         if (padding)
734                 padding = 8 - padding;
735
736         record__write(rec, map, event, event->header.size);
737         record__write(rec, map, data1, len1);
738         if (len2)
739                 record__write(rec, map, data2, len2);
740         record__write(rec, map, &pad, padding);
741
742         return 0;
743 }
744
745 static int record__auxtrace_mmap_read(struct record *rec,
746                                       struct mmap *map)
747 {
748         int ret;
749
750         ret = auxtrace_mmap__read(map, rec->itr, &rec->tool,
751                                   record__process_auxtrace);
752         if (ret < 0)
753                 return ret;
754
755         if (ret)
756                 rec->samples++;
757
758         return 0;
759 }
760
761 static int record__auxtrace_mmap_read_snapshot(struct record *rec,
762                                                struct mmap *map)
763 {
764         int ret;
765
766         ret = auxtrace_mmap__read_snapshot(map, rec->itr, &rec->tool,
767                                            record__process_auxtrace,
768                                            rec->opts.auxtrace_snapshot_size);
769         if (ret < 0)
770                 return ret;
771
772         if (ret)
773                 rec->samples++;
774
775         return 0;
776 }
777
778 static int record__auxtrace_read_snapshot_all(struct record *rec)
779 {
780         int i;
781         int rc = 0;
782
783         for (i = 0; i < rec->evlist->core.nr_mmaps; i++) {
784                 struct mmap *map = &rec->evlist->mmap[i];
785
786                 if (!map->auxtrace_mmap.base)
787                         continue;
788
789                 if (record__auxtrace_mmap_read_snapshot(rec, map) != 0) {
790                         rc = -1;
791                         goto out;
792                 }
793         }
794 out:
795         return rc;
796 }
797
798 static void record__read_auxtrace_snapshot(struct record *rec, bool on_exit)
799 {
800         pr_debug("Recording AUX area tracing snapshot\n");
801         if (record__auxtrace_read_snapshot_all(rec) < 0) {
802                 trigger_error(&auxtrace_snapshot_trigger);
803         } else {
804                 if (auxtrace_record__snapshot_finish(rec->itr, on_exit))
805                         trigger_error(&auxtrace_snapshot_trigger);
806                 else
807                         trigger_ready(&auxtrace_snapshot_trigger);
808         }
809 }
810
811 static int record__auxtrace_snapshot_exit(struct record *rec)
812 {
813         if (trigger_is_error(&auxtrace_snapshot_trigger))
814                 return 0;
815
816         if (!auxtrace_record__snapshot_started &&
817             auxtrace_record__snapshot_start(rec->itr))
818                 return -1;
819
820         record__read_auxtrace_snapshot(rec, true);
821         if (trigger_is_error(&auxtrace_snapshot_trigger))
822                 return -1;
823
824         return 0;
825 }
826
827 static int record__auxtrace_init(struct record *rec)
828 {
829         int err;
830
831         if ((rec->opts.auxtrace_snapshot_opts || rec->opts.auxtrace_sample_opts)
832             && record__threads_enabled(rec)) {
833                 pr_err("AUX area tracing options are not available in parallel streaming mode.\n");
834                 return -EINVAL;
835         }
836
837         if (!rec->itr) {
838                 rec->itr = auxtrace_record__init(rec->evlist, &err);
839                 if (err)
840                         return err;
841         }
842
843         err = auxtrace_parse_snapshot_options(rec->itr, &rec->opts,
844                                               rec->opts.auxtrace_snapshot_opts);
845         if (err)
846                 return err;
847
848         err = auxtrace_parse_sample_options(rec->itr, rec->evlist, &rec->opts,
849                                             rec->opts.auxtrace_sample_opts);
850         if (err)
851                 return err;
852
853         auxtrace_regroup_aux_output(rec->evlist);
854
855         return auxtrace_parse_filters(rec->evlist);
856 }
857
858 #else
859
860 static inline
861 int record__auxtrace_mmap_read(struct record *rec __maybe_unused,
862                                struct mmap *map __maybe_unused)
863 {
864         return 0;
865 }
866
867 static inline
868 void record__read_auxtrace_snapshot(struct record *rec __maybe_unused,
869                                     bool on_exit __maybe_unused)
870 {
871 }
872
873 static inline
874 int auxtrace_record__snapshot_start(struct auxtrace_record *itr __maybe_unused)
875 {
876         return 0;
877 }
878
879 static inline
880 int record__auxtrace_snapshot_exit(struct record *rec __maybe_unused)
881 {
882         return 0;
883 }
884
885 static int record__auxtrace_init(struct record *rec __maybe_unused)
886 {
887         return 0;
888 }
889
890 #endif
891
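/*
 * Ensure there is an event that can capture PERF_RECORD_TEXT_POKE and
 * PERF_RECORD_KSYMBOL by adding a dummy event on all CPUs if no event has
 * text_poke set already.
 */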
892 static int record__config_text_poke(struct evlist *evlist)
893 {
894         struct evsel *evsel;
895
896         /* Nothing to do if text poke is already configured */
897         evlist__for_each_entry(evlist, evsel) {
898                 if (evsel->core.attr.text_poke)
899                         return 0;
900         }
901
902         evsel = evlist__add_dummy_on_all_cpus(evlist);
903         if (!evsel)
904                 return -ENOMEM;
905
906         evsel->core.attr.text_poke = 1;
907         evsel->core.attr.ksymbol = 1;
908         evsel->immediate = true;
909         evsel__set_sample_bit(evsel, TIME);
910
911         return 0;
912 }
913
914 static int record__config_off_cpu(struct record *rec)
915 {
916         return off_cpu_prepare(rec->evlist, &rec->opts.target, &rec->opts);
917 }
918
919 static bool record__tracking_system_wide(struct record *rec)
920 {
921         struct evlist *evlist = rec->evlist;
922         struct evsel *evsel;
923
924         /*
925          * If a non-dummy evsel exists, system_wide sideband is needed to
926          * help parse sample information.
927          * For example, PERF_RECORD_MMAP events help resolve symbols,
928          * and PERF_RECORD_COMM events provide the task executable name.
929          */
930         evlist__for_each_entry(evlist, evsel) {
931                 if (!evsel__is_dummy_event(evsel))
932                         return true;
933         }
934
935         return false;
936 }
937
938 static int record__config_tracking_events(struct record *rec)
939 {
940         struct record_opts *opts = &rec->opts;
941         struct evlist *evlist = rec->evlist;
942         bool system_wide = false;
943         struct evsel *evsel;
944
945         /*
946          * For initial_delay, system-wide tracing or a hybrid system, we need to
947          * add a tracking event so that we can track PERF_RECORD_MMAP to cover the
948          * delay of waiting or of event synthesis.
949          */
950         if (opts->target.initial_delay || target__has_cpu(&opts->target) ||
951             perf_pmus__num_core_pmus() > 1) {
952
953                 /*
954                  * User space tasks can migrate between CPUs, so when tracing
955                  * selected CPUs, sideband for all CPUs is still needed.
956                  */
957                 if (!!opts->target.cpu_list && record__tracking_system_wide(rec))
958                         system_wide = true;
959
960                 evsel = evlist__findnew_tracking_event(evlist, system_wide);
961                 if (!evsel)
962                         return -ENOMEM;
963
964                 /*
965                  * Enable the tracking event when the process is forked for
966                  * initial_delay; enable it immediately for system-wide tracing.
967                  */
968                 if (opts->target.initial_delay && !evsel->immediate &&
969                     !target__has_cpu(&opts->target))
970                         evsel->core.attr.enable_on_exec = 1;
971                 else
972                         evsel->immediate = 1;
973         }
974
975         return 0;
976 }
977
978 static bool record__kcore_readable(struct machine *machine)
979 {
980         char kcore[PATH_MAX];
981         int fd;
982
983         scnprintf(kcore, sizeof(kcore), "%s/proc/kcore", machine->root_dir);
984
985         fd = open(kcore, O_RDONLY);
986         if (fd < 0)
987                 return false;
988
989         close(fd);
990
991         return true;
992 }
993
994 static int record__kcore_copy(struct machine *machine, struct perf_data *data)
995 {
996         char from_dir[PATH_MAX];
997         char kcore_dir[PATH_MAX];
998         int ret;
999
1000         snprintf(from_dir, sizeof(from_dir), "%s/proc", machine->root_dir);
1001
1002         ret = perf_data__make_kcore_dir(data, kcore_dir, sizeof(kcore_dir));
1003         if (ret)
1004                 return ret;
1005
1006         return kcore_copy(from_dir, kcore_dir);
1007 }
1008
1009 static void record__thread_data_init_pipes(struct record_thread *thread_data)
1010 {
1011         thread_data->pipes.msg[0] = -1;
1012         thread_data->pipes.msg[1] = -1;
1013         thread_data->pipes.ack[0] = -1;
1014         thread_data->pipes.ack[1] = -1;
1015 }
1016
1017 static int record__thread_data_open_pipes(struct record_thread *thread_data)
1018 {
1019         if (pipe(thread_data->pipes.msg))
1020                 return -EINVAL;
1021
1022         if (pipe(thread_data->pipes.ack)) {
1023                 close(thread_data->pipes.msg[0]);
1024                 thread_data->pipes.msg[0] = -1;
1025                 close(thread_data->pipes.msg[1]);
1026                 thread_data->pipes.msg[1] = -1;
1027                 return -EINVAL;
1028         }
1029
1030         pr_debug2("thread_data[%p]: msg=[%d,%d], ack=[%d,%d]\n", thread_data,
1031                  thread_data->pipes.msg[0], thread_data->pipes.msg[1],
1032                  thread_data->pipes.ack[0], thread_data->pipes.ack[1]);
1033
1034         return 0;
1035 }
1036
1037 static void record__thread_data_close_pipes(struct record_thread *thread_data)
1038 {
1039         if (thread_data->pipes.msg[0] != -1) {
1040                 close(thread_data->pipes.msg[0]);
1041                 thread_data->pipes.msg[0] = -1;
1042         }
1043         if (thread_data->pipes.msg[1] != -1) {
1044                 close(thread_data->pipes.msg[1]);
1045                 thread_data->pipes.msg[1] = -1;
1046         }
1047         if (thread_data->pipes.ack[0] != -1) {
1048                 close(thread_data->pipes.ack[0]);
1049                 thread_data->pipes.ack[0] = -1;
1050         }
1051         if (thread_data->pipes.ack[1] != -1) {
1052                 close(thread_data->pipes.ack[1]);
1053                 thread_data->pipes.ack[1] = -1;
1054         }
1055 }
1056
1057 static bool evlist__per_thread(struct evlist *evlist)
1058 {
1059         return cpu_map__is_dummy(evlist->core.user_requested_cpus);
1060 }
1061
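/*
 * Distribute the evlist's mmaps to this thread: in per-thread mode the thread
 * gets all of them, otherwise it gets the ones whose CPU is set in the
 * thread's maps mask.
 */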
1062 static int record__thread_data_init_maps(struct record_thread *thread_data, struct evlist *evlist)
1063 {
1064         int m, tm, nr_mmaps = evlist->core.nr_mmaps;
1065         struct mmap *mmap = evlist->mmap;
1066         struct mmap *overwrite_mmap = evlist->overwrite_mmap;
1067         struct perf_cpu_map *cpus = evlist->core.all_cpus;
1068         bool per_thread = evlist__per_thread(evlist);
1069
1070         if (per_thread)
1071                 thread_data->nr_mmaps = nr_mmaps;
1072         else
1073                 thread_data->nr_mmaps = bitmap_weight(thread_data->mask->maps.bits,
1074                                                       thread_data->mask->maps.nbits);
1075         if (mmap) {
1076                 thread_data->maps = zalloc(thread_data->nr_mmaps * sizeof(struct mmap *));
1077                 if (!thread_data->maps)
1078                         return -ENOMEM;
1079         }
1080         if (overwrite_mmap) {
1081                 thread_data->overwrite_maps = zalloc(thread_data->nr_mmaps * sizeof(struct mmap *));
1082                 if (!thread_data->overwrite_maps) {
1083                         zfree(&thread_data->maps);
1084                         return -ENOMEM;
1085                 }
1086         }
1087         pr_debug2("thread_data[%p]: nr_mmaps=%d, maps=%p, ow_maps=%p\n", thread_data,
1088                  thread_data->nr_mmaps, thread_data->maps, thread_data->overwrite_maps);
1089
1090         for (m = 0, tm = 0; m < nr_mmaps && tm < thread_data->nr_mmaps; m++) {
1091                 if (per_thread ||
1092                     test_bit(perf_cpu_map__cpu(cpus, m).cpu, thread_data->mask->maps.bits)) {
1093                         if (thread_data->maps) {
1094                                 thread_data->maps[tm] = &mmap[m];
1095                                 pr_debug2("thread_data[%p]: cpu%d: maps[%d] -> mmap[%d]\n",
1096                                           thread_data, perf_cpu_map__cpu(cpus, m).cpu, tm, m);
1097                         }
1098                         if (thread_data->overwrite_maps) {
1099                                 thread_data->overwrite_maps[tm] = &overwrite_mmap[m];
1100                                 pr_debug2("thread_data[%p]: cpu%d: ow_maps[%d] -> ow_mmap[%d]\n",
1101                                           thread_data, perf_cpu_map__cpu(cpus, m).cpu, tm, m);
1102                         }
1103                         tm++;
1104                 }
1105         }
1106
1107         return 0;
1108 }
1109
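/*
 * Build the thread's own pollfd array by duplicating the evlist pollfd
 * entries that belong to the mmaps assigned to this thread.
 */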
1110 static int record__thread_data_init_pollfd(struct record_thread *thread_data, struct evlist *evlist)
1111 {
1112         int f, tm, pos;
1113         struct mmap *map, *overwrite_map;
1114
1115         fdarray__init(&thread_data->pollfd, 64);
1116
1117         for (tm = 0; tm < thread_data->nr_mmaps; tm++) {
1118                 map = thread_data->maps ? thread_data->maps[tm] : NULL;
1119                 overwrite_map = thread_data->overwrite_maps ?
1120                                 thread_data->overwrite_maps[tm] : NULL;
1121
1122                 for (f = 0; f < evlist->core.pollfd.nr; f++) {
1123                         void *ptr = evlist->core.pollfd.priv[f].ptr;
1124
1125                         if ((map && ptr == map) || (overwrite_map && ptr == overwrite_map)) {
1126                                 pos = fdarray__dup_entry_from(&thread_data->pollfd, f,
1127                                                               &evlist->core.pollfd);
1128                                 if (pos < 0)
1129                                         return pos;
1130                                 pr_debug2("thread_data[%p]: pollfd[%d] <- event_fd=%d\n",
1131                                          thread_data, pos, evlist->core.pollfd.entries[f].fd);
1132                         }
1133                 }
1134         }
1135
1136         return 0;
1137 }
1138
1139 static void record__free_thread_data(struct record *rec)
1140 {
1141         int t;
1142         struct record_thread *thread_data = rec->thread_data;
1143
1144         if (thread_data == NULL)
1145                 return;
1146
1147         for (t = 0; t < rec->nr_threads; t++) {
1148                 record__thread_data_close_pipes(&thread_data[t]);
1149                 zfree(&thread_data[t].maps);
1150                 zfree(&thread_data[t].overwrite_maps);
1151                 fdarray__exit(&thread_data[t].pollfd);
1152         }
1153
1154         zfree(&rec->thread_data);
1155 }
1156
1157 static int record__map_thread_evlist_pollfd_indexes(struct record *rec,
1158                                                     int evlist_pollfd_index,
1159                                                     int thread_pollfd_index)
1160 {
1161         size_t x = rec->index_map_cnt;
1162
1163         if (realloc_array_as_needed(rec->index_map, rec->index_map_sz, x, NULL))
1164                 return -ENOMEM;
1165         rec->index_map[x].evlist_pollfd_index = evlist_pollfd_index;
1166         rec->index_map[x].thread_pollfd_index = thread_pollfd_index;
1167         rec->index_map_cnt += 1;
1168         return 0;
1169 }
1170
1171 static int record__update_evlist_pollfd_from_thread(struct record *rec,
1172                                                     struct evlist *evlist,
1173                                                     struct record_thread *thread_data)
1174 {
1175         struct pollfd *e_entries = evlist->core.pollfd.entries;
1176         struct pollfd *t_entries = thread_data->pollfd.entries;
1177         int err = 0;
1178         size_t i;
1179
1180         for (i = 0; i < rec->index_map_cnt; i++) {
1181                 int e_pos = rec->index_map[i].evlist_pollfd_index;
1182                 int t_pos = rec->index_map[i].thread_pollfd_index;
1183
1184                 if (e_entries[e_pos].fd != t_entries[t_pos].fd ||
1185                     e_entries[e_pos].events != t_entries[t_pos].events) {
1186                         pr_err("Thread and evlist pollfd index mismatch\n");
1187                         err = -EINVAL;
1188                         continue;
1189                 }
1190                 e_entries[e_pos].revents = t_entries[t_pos].revents;
1191         }
1192         return err;
1193 }
1194
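/*
 * Duplicate the evlist's non-perf-event descriptors into the main thread's
 * pollfd array and remember the index mapping so that revents can be copied
 * back later by record__update_evlist_pollfd_from_thread().
 */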
1195 static int record__dup_non_perf_events(struct record *rec,
1196                                        struct evlist *evlist,
1197                                        struct record_thread *thread_data)
1198 {
1199         struct fdarray *fda = &evlist->core.pollfd;
1200         int i, ret;
1201
1202         for (i = 0; i < fda->nr; i++) {
1203                 if (!(fda->priv[i].flags & fdarray_flag__non_perf_event))
1204                         continue;
1205                 ret = fdarray__dup_entry_from(&thread_data->pollfd, i, fda);
1206                 if (ret < 0) {
1207                         pr_err("Failed to duplicate descriptor in main thread pollfd\n");
1208                         return ret;
1209                 }
1210                 pr_debug2("thread_data[%p]: pollfd[%d] <- non_perf_event fd=%d\n",
1211                           thread_data, ret, fda->entries[i].fd);
1212                 ret = record__map_thread_evlist_pollfd_indexes(rec, i, ret);
1213                 if (ret < 0) {
1214                         pr_err("Failed to map thread and evlist pollfd indexes\n");
1215                         return ret;
1216                 }
1217         }
1218         return 0;
1219 }
1220
1221 static int record__alloc_thread_data(struct record *rec, struct evlist *evlist)
1222 {
1223         int t, ret;
1224         struct record_thread *thread_data;
1225
1226         rec->thread_data = zalloc(rec->nr_threads * sizeof(*(rec->thread_data)));
1227         if (!rec->thread_data) {
1228                 pr_err("Failed to allocate thread data\n");
1229                 return -ENOMEM;
1230         }
1231         thread_data = rec->thread_data;
1232
1233         for (t = 0; t < rec->nr_threads; t++)
1234                 record__thread_data_init_pipes(&thread_data[t]);
1235
1236         for (t = 0; t < rec->nr_threads; t++) {
1237                 thread_data[t].rec = rec;
1238                 thread_data[t].mask = &rec->thread_masks[t];
1239                 ret = record__thread_data_init_maps(&thread_data[t], evlist);
1240                 if (ret) {
1241                         pr_err("Failed to initialize thread[%d] maps\n", t);
1242                         goto out_free;
1243                 }
1244                 ret = record__thread_data_init_pollfd(&thread_data[t], evlist);
1245                 if (ret) {
1246                         pr_err("Failed to initialize thread[%d] pollfd\n", t);
1247                         goto out_free;
1248                 }
1249                 if (t) {
1250                         thread_data[t].tid = -1;
1251                         ret = record__thread_data_open_pipes(&thread_data[t]);
1252                         if (ret) {
1253                                 pr_err("Failed to open thread[%d] communication pipes\n", t);
1254                                 goto out_free;
1255                         }
1256                         ret = fdarray__add(&thread_data[t].pollfd, thread_data[t].pipes.msg[0],
1257                                            POLLIN | POLLERR | POLLHUP, fdarray_flag__nonfilterable);
1258                         if (ret < 0) {
1259                                 pr_err("Failed to add descriptor to thread[%d] pollfd\n", t);
1260                                 goto out_free;
1261                         }
1262                         thread_data[t].ctlfd_pos = ret;
1263                         pr_debug2("thread_data[%p]: pollfd[%d] <- ctl_fd=%d\n",
1264                                  thread_data, thread_data[t].ctlfd_pos,
1265                                  thread_data[t].pipes.msg[0]);
1266                 } else {
1267                         thread_data[t].tid = gettid();
1268
1269                         ret = record__dup_non_perf_events(rec, evlist, &thread_data[t]);
1270                         if (ret < 0)
1271                                 goto out_free;
1272
1273                         thread_data[t].ctlfd_pos = -1; /* Not used */
1274                 }
1275         }
1276
1277         return 0;
1278
1279 out_free:
1280         record__free_thread_data(rec);
1281
1282         return ret;
1283 }
1284
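/*
 * mmap the event list with the configured options, set up the control fd,
 * allocate the per-thread recording data and, in parallel streaming mode,
 * create one output file per mmap in the data directory.
 */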
1285 static int record__mmap_evlist(struct record *rec,
1286                                struct evlist *evlist)
1287 {
1288         int i, ret;
1289         struct record_opts *opts = &rec->opts;
1290         bool auxtrace_overwrite = opts->auxtrace_snapshot_mode ||
1291                                   opts->auxtrace_sample_mode;
1292         char msg[512];
1293
1294         if (opts->affinity != PERF_AFFINITY_SYS)
1295                 cpu__setup_cpunode_map();
1296
1297         if (evlist__mmap_ex(evlist, opts->mmap_pages,
1298                                  opts->auxtrace_mmap_pages,
1299                                  auxtrace_overwrite,
1300                                  opts->nr_cblocks, opts->affinity,
1301                                  opts->mmap_flush, opts->comp_level) < 0) {
1302                 if (errno == EPERM) {
1303                         pr_err("Permission error mapping pages.\n"
1304                                "Consider increasing "
1305                                "/proc/sys/kernel/perf_event_mlock_kb,\n"
1306                                "or try again with a smaller value of -m/--mmap_pages.\n"
1307                                "(current value: %u,%u)\n",
1308                                opts->mmap_pages, opts->auxtrace_mmap_pages);
1309                         return -errno;
1310                 } else {
1311                         pr_err("failed to mmap with %d (%s)\n", errno,
1312                                 str_error_r(errno, msg, sizeof(msg)));
1313                         if (errno)
1314                                 return -errno;
1315                         else
1316                                 return -EINVAL;
1317                 }
1318         }
1319
1320         if (evlist__initialize_ctlfd(evlist, opts->ctl_fd, opts->ctl_fd_ack))
1321                 return -1;
1322
1323         ret = record__alloc_thread_data(rec, evlist);
1324         if (ret)
1325                 return ret;
1326
1327         if (record__threads_enabled(rec)) {
1328                 ret = perf_data__create_dir(&rec->data, evlist->core.nr_mmaps);
1329                 if (ret) {
1330                         pr_err("Failed to create data directory: %s\n", strerror(-ret));
1331                         return ret;
1332                 }
1333                 for (i = 0; i < evlist->core.nr_mmaps; i++) {
1334                         if (evlist->mmap)
1335                                 evlist->mmap[i].file = &rec->data.dir.files[i];
1336                         if (evlist->overwrite_mmap)
1337                                 evlist->overwrite_mmap[i].file = &rec->data.dir.files[i];
1338                 }
1339         }
1340
1341         return 0;
1342 }
1343
1344 static int record__mmap(struct record *rec)
1345 {
1346         return record__mmap_evlist(rec, rec->evlist);
1347 }
1348
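/*
 * Configure and open all events, falling back to weaker configurations or
 * breaking up weak groups when an open fails, then apply event filters and
 * mmap the ring buffers.
 */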
1349 static int record__open(struct record *rec)
1350 {
1351         char msg[BUFSIZ];
1352         struct evsel *pos;
1353         struct evlist *evlist = rec->evlist;
1354         struct perf_session *session = rec->session;
1355         struct record_opts *opts = &rec->opts;
1356         int rc = 0;
1357
1358         evlist__config(evlist, opts, &callchain_param);
1359
1360         evlist__for_each_entry(evlist, pos) {
1361 try_again:
1362                 if (evsel__open(pos, pos->core.cpus, pos->core.threads) < 0) {
1363                         if (evsel__fallback(pos, &opts->target, errno, msg, sizeof(msg))) {
1364                                 if (verbose > 0)
1365                                         ui__warning("%s\n", msg);
1366                                 goto try_again;
1367                         }
1368                         if ((errno == EINVAL || errno == EBADF) &&
1369                             pos->core.leader != &pos->core &&
1370                             pos->weak_group) {
1371                                 pos = evlist__reset_weak_group(evlist, pos, true);
1372                                 goto try_again;
1373                         }
1374                         rc = -errno;
1375                         evsel__open_strerror(pos, &opts->target, errno, msg, sizeof(msg));
1376                         ui__error("%s\n", msg);
1377                         goto out;
1378                 }
1379
1380                 pos->supported = true;
1381         }
1382
1383         if (symbol_conf.kptr_restrict && !evlist__exclude_kernel(evlist)) {
1384                 pr_warning(
1385 "WARNING: Kernel address maps (/proc/{kallsyms,modules}) are restricted,\n"
1386 "check /proc/sys/kernel/kptr_restrict and /proc/sys/kernel/perf_event_paranoid.\n\n"
1387 "Samples in kernel functions may not be resolved if a suitable vmlinux\n"
1388 "file is not found in the buildid cache or in the vmlinux path.\n\n"
1389 "Samples in kernel modules won't be resolved at all.\n\n"
1390 "If some relocation was applied (e.g. kexec) symbols may be misresolved\n"
1391 "even with a suitable vmlinux or kallsyms file.\n\n");
1392         }
1393
1394         if (evlist__apply_filters(evlist, &pos)) {
1395                 pr_err("failed to set filter \"%s\" on event %s with %d (%s)\n",
1396                         pos->filter ?: "BPF", evsel__name(pos), errno,
1397                         str_error_r(errno, msg, sizeof(msg)));
1398                 rc = -1;
1399                 goto out;
1400         }
1401
1402         rc = record__mmap(rec);
1403         if (rc)
1404                 goto out;
1405
1406         session->evlist = evlist;
1407         perf_session__set_id_hdr_size(session);
1408 out:
1409         return rc;
1410 }
1411
1412 static void set_timestamp_boundary(struct record *rec, u64 sample_time)
1413 {
1414         if (rec->evlist->first_sample_time == 0)
1415                 rec->evlist->first_sample_time = sample_time;
1416
1417         if (sample_time)
1418                 rec->evlist->last_sample_time = sample_time;
1419 }
1420
1421 static int process_sample_event(struct perf_tool *tool,
1422                                 union perf_event *event,
1423                                 struct perf_sample *sample,
1424                                 struct evsel *evsel,
1425                                 struct machine *machine)
1426 {
1427         struct record *rec = container_of(tool, struct record, tool);
1428
1429         set_timestamp_boundary(rec, sample->time);
1430
1431         if (rec->buildid_all)
1432                 return 0;
1433
1434         rec->samples++;
1435         return build_id__mark_dso_hit(tool, event, sample, evsel, machine);
1436 }
1437
1438 static int process_buildids(struct record *rec)
1439 {
1440         struct perf_session *session = rec->session;
1441
1442         if (perf_data__size(&rec->data) == 0)
1443                 return 0;
1444
1445         /*
1446          * During this process, it'll load the kernel map and replace
1447          * dso->long_name with a real pathname it found.  In this case
1448          * we prefer the vmlinux path like
1449          *   /lib/modules/3.16.4/build/vmlinux
1450          *
1451          * rather than the build-id path (in debug directory).
1452          *   $HOME/.debug/.build-id/f0/6e17aa50adf4d00b88925e03775de107611551
1453          */
1454         symbol_conf.ignore_vmlinux_buildid = true;
1455
1456         /*
1457          * If --buildid-all is given, it marks all DSOs regardless of hits,
1458          * so no need to process samples. But if timestamp_boundary is enabled,
1459          * it still needs to walk on all samples to get the timestamps of
1460          * first/last samples.
1461          */
1462         if (rec->buildid_all && !rec->timestamp_boundary)
1463                 rec->tool.sample = NULL;
1464
1465         return perf_session__process_events(session);
1466 }
1467
1468 static void perf_event__synthesize_guest_os(struct machine *machine, void *data)
1469 {
1470         int err;
1471         struct perf_tool *tool = data;
1472         /*
1473          * As for the guest kernel, when processing the record & report
1474          * subcommands, we arrange the module mmap prior to the guest kernel
1475          * mmap and trigger a dso preload, because default guest module symbols
1476          * are loaded from guest kallsyms instead of /lib/modules/XXX/XXX.
1477          * This avoids missing symbols when the first address is in a module
1478          * instead of in the guest kernel.
1479          */
1480         err = perf_event__synthesize_modules(tool, process_synthesized_event,
1481                                              machine);
1482         if (err < 0)
1483                 pr_err("Couldn't record guest kernel [%d]'s reference"
1484                        " relocation symbol.\n", machine->pid);
1485
1486         /*
1487          * We use _stext for guest kernel because guest kernel's /proc/kallsyms
1488          * have no _text sometimes.
1489          */
1490         err = perf_event__synthesize_kernel_mmap(tool, process_synthesized_event,
1491                                                  machine);
1492         if (err < 0)
1493                 pr_err("Couldn't record guest kernel [%d]'s reference"
1494                        " relocation symbol.\n", machine->pid);
1495 }
1496
1497 static struct perf_event_header finished_round_event = {
1498         .size = sizeof(struct perf_event_header),
1499         .type = PERF_RECORD_FINISHED_ROUND,
1500 };
1501
1502 static struct perf_event_header finished_init_event = {
1503         .size = sizeof(struct perf_event_header),
1504         .type = PERF_RECORD_FINISHED_INIT,
1505 };
1506
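/*
 * When an affinity mode other than SYS is used, migrate the recording thread
 * to the CPUs backing the mmap that is about to be read.
 */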
1507 static void record__adjust_affinity(struct record *rec, struct mmap *map)
1508 {
1509         if (rec->opts.affinity != PERF_AFFINITY_SYS &&
1510             !bitmap_equal(thread->mask->affinity.bits, map->affinity_mask.bits,
1511                           thread->mask->affinity.nbits)) {
1512                 bitmap_zero(thread->mask->affinity.bits, thread->mask->affinity.nbits);
1513                 bitmap_or(thread->mask->affinity.bits, thread->mask->affinity.bits,
1514                           map->affinity_mask.bits, thread->mask->affinity.nbits);
1515                 sched_setaffinity(0, MMAP_CPU_MASK_BYTES(&thread->mask->affinity),
1516                                         (cpu_set_t *)thread->mask->affinity.bits);
1517                 if (verbose == 2) {
1518                         pr_debug("threads[%d]: running on cpu%d: ", thread->tid, sched_getcpu());
1519                         mmap_cpu_mask__scnprintf(&thread->mask->affinity, "affinity");
1520                 }
1521         }
1522 }
1523
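/*
 * Callback handed to zstd_compress_stream_to_records() below.  A sketch of
 * the resulting framing: the first call (increment == 0) turns the start of
 * 'record' into a PERF_RECORD_COMPRESSED header covering just the struct,
 * and each following call grows header.size by the number of compressed
 * bytes appended, so a single event ends up wrapping the whole compressed
 * chunk:
 *
 *	| header (type, misc, size) | compressed payload ...              |
 */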
1524 static size_t process_comp_header(void *record, size_t increment)
1525 {
1526         struct perf_record_compressed *event = record;
1527         size_t size = sizeof(*event);
1528
1529         if (increment) {
1530                 event->header.size += increment;
1531                 return increment;
1532         }
1533
1534         event->header.type = PERF_RECORD_COMPRESSED;
1535         event->header.size = size;
1536
1537         return size;
1538 }
1539
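/*
 * Compress one chunk of ring-buffer data.  In threaded mode, where a map
 * writes to its own file, the per-map zstd state and the per-thread byte
 * counters are used; otherwise the session-wide state and counters are.
 * Either way the transferred/compressed pair later feeds the compression
 * ratio printed at the end of the run.
 */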
1540 static ssize_t zstd_compress(struct perf_session *session, struct mmap *map,
1541                             void *dst, size_t dst_size, void *src, size_t src_size)
1542 {
1543         ssize_t compressed;
1544         size_t max_record_size = PERF_SAMPLE_MAX_SIZE - sizeof(struct perf_record_compressed) - 1;
1545         struct zstd_data *zstd_data = &session->zstd_data;
1546
1547         if (map && map->file)
1548                 zstd_data = &map->zstd_data;
1549
1550         compressed = zstd_compress_stream_to_records(zstd_data, dst, dst_size, src, src_size,
1551                                                      max_record_size, process_comp_header);
1552         if (compressed < 0)
1553                 return compressed;
1554
1555         if (map && map->file) {
1556                 thread->bytes_transferred += src_size;
1557                 thread->bytes_compressed  += compressed;
1558         } else {
1559                 session->bytes_transferred += src_size;
1560                 session->bytes_compressed  += compressed;
1561         }
1562
1563         return compressed;
1564 }
1565
1566 static int record__mmap_read_evlist(struct record *rec, struct evlist *evlist,
1567                                     bool overwrite, bool synch)
1568 {
1569         u64 bytes_written = rec->bytes_written;
1570         int i;
1571         int rc = 0;
1572         int nr_mmaps;
1573         struct mmap **maps;
1574         int trace_fd = rec->data.file.fd;
1575         off_t off = 0;
1576
1577         if (!evlist)
1578                 return 0;
1579
1580         nr_mmaps = thread->nr_mmaps;
1581         maps = overwrite ? thread->overwrite_maps : thread->maps;
1582
1583         if (!maps)
1584                 return 0;
1585
1586         if (overwrite && evlist->bkw_mmap_state != BKW_MMAP_DATA_PENDING)
1587                 return 0;
1588
1589         if (record__aio_enabled(rec))
1590                 off = record__aio_get_pos(trace_fd);
1591
1592         for (i = 0; i < nr_mmaps; i++) {
1593                 u64 flush = 0;
1594                 struct mmap *map = maps[i];
1595
1596                 if (map->core.base) {
1597                         record__adjust_affinity(rec, map);
1598                         if (synch) {
1599                                 flush = map->core.flush;
1600                                 map->core.flush = 1;
1601                         }
1602                         if (!record__aio_enabled(rec)) {
1603                                 if (perf_mmap__push(map, rec, record__pushfn) < 0) {
1604                                         if (synch)
1605                                                 map->core.flush = flush;
1606                                         rc = -1;
1607                                         goto out;
1608                                 }
1609                         } else {
1610                                 if (record__aio_push(rec, map, &off) < 0) {
1611                                         record__aio_set_pos(trace_fd, off);
1612                                         if (synch)
1613                                                 map->core.flush = flush;
1614                                         rc = -1;
1615                                         goto out;
1616                                 }
1617                         }
1618                         if (synch)
1619                                 map->core.flush = flush;
1620                 }
1621
1622                 if (map->auxtrace_mmap.base && !rec->opts.auxtrace_snapshot_mode &&
1623                     !rec->opts.auxtrace_sample_mode &&
1624                     record__auxtrace_mmap_read(rec, map) != 0) {
1625                         rc = -1;
1626                         goto out;
1627                 }
1628         }
1629
1630         if (record__aio_enabled(rec))
1631                 record__aio_set_pos(trace_fd, off);
1632
1633         /*
1634          * Mark the round finished in case we wrote
1635          * at least one event.
1636          *
1637          * No need for round events in directory mode,
1638          * because per-cpu maps and files have their data
1639          * sorted by the kernel.
1640          */
1641         if (!record__threads_enabled(rec) && bytes_written != rec->bytes_written)
1642                 rc = record__write(rec, NULL, &finished_round_event, sizeof(finished_round_event));
1643
1644         if (overwrite)
1645                 evlist__toggle_bkw_mmap(evlist, BKW_MMAP_EMPTY);
1646 out:
1647         return rc;
1648 }
1649
1650 static int record__mmap_read_all(struct record *rec, bool synch)
1651 {
1652         int err;
1653
1654         err = record__mmap_read_evlist(rec, rec->evlist, false, synch);
1655         if (err)
1656                 return err;
1657
1658         return record__mmap_read_evlist(rec, rec->evlist, true, synch);
1659 }
1660
1661 static void record__thread_munmap_filtered(struct fdarray *fda, int fd,
1662                                            void *arg __maybe_unused)
1663 {
1664         struct perf_mmap *map = fda->priv[fd].ptr;
1665
1666         if (map)
1667                 perf_mmap__put(map);
1668 }
1669
1670 static void *record__thread(void *arg)
1671 {
1672         enum thread_msg msg = THREAD_MSG__READY;
1673         bool terminate = false;
1674         struct fdarray *pollfd;
1675         int err, ctlfd_pos;
1676
1677         thread = arg;
1678         thread->tid = gettid();
1679
1680         err = write(thread->pipes.ack[1], &msg, sizeof(msg));
1681         if (err == -1)
1682                 pr_warning("threads[%d]: failed to notify on start: %s\n",
1683                            thread->tid, strerror(errno));
1684
1685         pr_debug("threads[%d]: started on cpu%d\n", thread->tid, sched_getcpu());
1686
1687         pollfd = &thread->pollfd;
1688         ctlfd_pos = thread->ctlfd_pos;
1689
1690         for (;;) {
1691                 unsigned long long hits = thread->samples;
1692
1693                 if (record__mmap_read_all(thread->rec, false) < 0 || terminate)
1694                         break;
1695
1696                 if (hits == thread->samples) {
1697
1698                         err = fdarray__poll(pollfd, -1);
1699                         /*
1700                          * Propagate the error only if there is one. Ignore a
1701                          * positive number of returned events and EINTR.
1702                          */
1703                         if (err > 0 || (err < 0 && errno == EINTR))
1704                                 err = 0;
1705                         thread->waking++;
1706
1707                         if (fdarray__filter(pollfd, POLLERR | POLLHUP,
1708                                             record__thread_munmap_filtered, NULL) == 0)
1709                                 break;
1710                 }
1711
1712                 if (pollfd->entries[ctlfd_pos].revents & POLLHUP) {
1713                         terminate = true;
1714                         close(thread->pipes.msg[0]);
1715                         thread->pipes.msg[0] = -1;
1716                         pollfd->entries[ctlfd_pos].fd = -1;
1717                         pollfd->entries[ctlfd_pos].events = 0;
1718                 }
1719
1720                 pollfd->entries[ctlfd_pos].revents = 0;
1721         }
1722         record__mmap_read_all(thread->rec, true);
1723
1724         err = write(thread->pipes.ack[1], &msg, sizeof(msg));
1725         if (err == -1)
1726                 pr_warning("threads[%d]: failed to notify on termination: %s\n",
1727                            thread->tid, strerror(errno));
1728
1729         return NULL;
1730 }
1731
1732 static void record__init_features(struct record *rec)
1733 {
1734         struct perf_session *session = rec->session;
1735         int feat;
1736
1737         for (feat = HEADER_FIRST_FEATURE; feat < HEADER_LAST_FEATURE; feat++)
1738                 perf_header__set_feat(&session->header, feat);
1739
1740         if (rec->no_buildid)
1741                 perf_header__clear_feat(&session->header, HEADER_BUILD_ID);
1742
1743 #ifdef HAVE_LIBTRACEEVENT
1744         if (!have_tracepoints(&rec->evlist->core.entries))
1745                 perf_header__clear_feat(&session->header, HEADER_TRACING_DATA);
1746 #endif
1747
1748         if (!rec->opts.branch_stack)
1749                 perf_header__clear_feat(&session->header, HEADER_BRANCH_STACK);
1750
1751         if (!rec->opts.full_auxtrace)
1752                 perf_header__clear_feat(&session->header, HEADER_AUXTRACE);
1753
1754         if (!(rec->opts.use_clockid && rec->opts.clockid_res_ns))
1755                 perf_header__clear_feat(&session->header, HEADER_CLOCKID);
1756
1757         if (!rec->opts.use_clockid)
1758                 perf_header__clear_feat(&session->header, HEADER_CLOCK_DATA);
1759
1760         if (!record__threads_enabled(rec))
1761                 perf_header__clear_feat(&session->header, HEADER_DIR_FORMAT);
1762
1763         if (!record__comp_enabled(rec))
1764                 perf_header__clear_feat(&session->header, HEADER_COMPRESSED);
1765
1766         perf_header__clear_feat(&session->header, HEADER_STAT);
1767 }
1768
1769 static void
1770 record__finish_output(struct record *rec)
1771 {
1772         int i;
1773         struct perf_data *data = &rec->data;
1774         int fd = perf_data__fd(data);
1775
1776         if (data->is_pipe)
1777                 return;
1778
1779         rec->session->header.data_size += rec->bytes_written;
1780         data->file.size = lseek(perf_data__fd(data), 0, SEEK_CUR);
1781         if (record__threads_enabled(rec)) {
1782                 for (i = 0; i < data->dir.nr; i++)
1783                         data->dir.files[i].size = lseek(data->dir.files[i].fd, 0, SEEK_CUR);
1784         }
1785
1786         if (!rec->no_buildid) {
1787                 process_buildids(rec);
1788
1789                 if (rec->buildid_all)
1790                         dsos__hit_all(rec->session);
1791         }
1792         perf_session__write_header(rec->session, rec->evlist, fd, true);
1793
1794         return;
1795 }
1796
1797 static int record__synthesize_workload(struct record *rec, bool tail)
1798 {
1799         int err;
1800         struct perf_thread_map *thread_map;
1801         bool needs_mmap = rec->opts.synth & PERF_SYNTH_MMAP;
1802
1803         if (rec->opts.tail_synthesize != tail)
1804                 return 0;
1805
1806         thread_map = thread_map__new_by_tid(rec->evlist->workload.pid);
1807         if (thread_map == NULL)
1808                 return -1;
1809
1810         err = perf_event__synthesize_thread_map(&rec->tool, thread_map,
1811                                                  process_synthesized_event,
1812                                                  &rec->session->machines.host,
1813                                                  needs_mmap,
1814                                                  rec->opts.sample_address);
1815         perf_thread_map__put(thread_map);
1816         return err;
1817 }
1818
1819 static int write_finished_init(struct record *rec, bool tail)
1820 {
1821         if (rec->opts.tail_synthesize != tail)
1822                 return 0;
1823
1824         return record__write(rec, NULL, &finished_init_event, sizeof(finished_init_event));
1825 }
1826
1827 static int record__synthesize(struct record *rec, bool tail);
1828
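/*
 * A sketch of the file rotation done below when a file limit is set
 * (switch_output.num_files != 0): filenames[] is used as a ring, so with
 * e.g. num_files == 3 only the three newest <path>.<timestamp> dumps are
 * kept and the file occupying the reused slot is removed each time a new
 * dump is started.
 */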
1829 static int
1830 record__switch_output(struct record *rec, bool at_exit)
1831 {
1832         struct perf_data *data = &rec->data;
1833         char *new_filename = NULL;
1834         int fd, err;
1835
1836         /* Same size as a real timestamp, e.g. "2015122520103046" */
1837         char timestamp[] = "InvalidTimestamp";
1838
1839         record__aio_mmap_read_sync(rec);
1840
1841         write_finished_init(rec, true);
1842
1843         record__synthesize(rec, true);
1844         if (target__none(&rec->opts.target))
1845                 record__synthesize_workload(rec, true);
1846
1847         rec->samples = 0;
1848         record__finish_output(rec);
1849         err = fetch_current_timestamp(timestamp, sizeof(timestamp));
1850         if (err) {
1851                 pr_err("Failed to get current timestamp\n");
1852                 return -EINVAL;
1853         }
1854
1855         fd = perf_data__switch(data, timestamp,
1856                                     rec->session->header.data_offset,
1857                                     at_exit, &new_filename);
1858         if (fd >= 0 && !at_exit) {
1859                 rec->bytes_written = 0;
1860                 rec->session->header.data_size = 0;
1861         }
1862
1863         if (!quiet)
1864                 fprintf(stderr, "[ perf record: Dump %s.%s ]\n",
1865                         data->path, timestamp);
1866
1867         if (rec->switch_output.num_files) {
1868                 int n = rec->switch_output.cur_file + 1;
1869
1870                 if (n >= rec->switch_output.num_files)
1871                         n = 0;
1872                 rec->switch_output.cur_file = n;
1873                 if (rec->switch_output.filenames[n]) {
1874                         remove(rec->switch_output.filenames[n]);
1875                         zfree(&rec->switch_output.filenames[n]);
1876                 }
1877                 rec->switch_output.filenames[n] = new_filename;
1878         } else {
1879                 free(new_filename);
1880         }
1881
1882         /* Output tracking events */
1883         if (!at_exit) {
1884                 record__synthesize(rec, false);
1885
1886                 /*
1887                  * In 'perf record --switch-output' without -a,
1888                  * record__synthesize() in record__switch_output() won't
1889                  * generate tracking events because there's no thread_map
1890                  * in the evlist, so the newly created perf.data would not
1891                  * contain map and comm information.
1892                  * Create a fake thread_map and call
1893                  * perf_event__synthesize_thread_map() directly for those events.
1894                  */
1895                 if (target__none(&rec->opts.target))
1896                         record__synthesize_workload(rec, false);
1897                 write_finished_init(rec, false);
1898         }
1899         return fd;
1900 }
1901
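/*
 * Synthesize one PERF_RECORD_LOST_SAMPLES event.  Roughly, the record
 * written below is laid out as:
 *
 *	struct perf_record_lost_samples     (header + lost count)
 *	id sample                           (sized by attr.sample_type)
 *
 * with header.size covering both parts, so that the loss can be attributed
 * to the right event, CPU and thread on the report side.
 */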
1902 static void __record__save_lost_samples(struct record *rec, struct evsel *evsel,
1903                                         struct perf_record_lost_samples *lost,
1904                                         int cpu_idx, int thread_idx, u64 lost_count,
1905                                         u16 misc_flag)
1906 {
1907         struct perf_sample_id *sid;
1908         struct perf_sample sample = {};
1909         int id_hdr_size;
1910
1911         lost->lost = lost_count;
1912         if (evsel->core.ids) {
1913                 sid = xyarray__entry(evsel->core.sample_id, cpu_idx, thread_idx);
1914                 sample.id = sid->id;
1915         }
1916
1917         id_hdr_size = perf_event__synthesize_id_sample((void *)(lost + 1),
1918                                                        evsel->core.attr.sample_type, &sample);
1919         lost->header.size = sizeof(*lost) + id_hdr_size;
1920         lost->header.misc = misc_flag;
1921         record__write(rec, NULL, lost, lost->header.size);
1922 }
1923
1924 static void record__read_lost_samples(struct record *rec)
1925 {
1926         struct perf_session *session = rec->session;
1927         struct perf_record_lost_samples *lost = NULL;
1928         struct evsel *evsel;
1929
1930         /* there was an error during record__open */
1931         if (session->evlist == NULL)
1932                 return;
1933
1934         evlist__for_each_entry(session->evlist, evsel) {
1935                 struct xyarray *xy = evsel->core.sample_id;
1936                 u64 lost_count;
1937
1938                 if (xy == NULL || evsel->core.fd == NULL)
1939                         continue;
1940                 if (xyarray__max_x(evsel->core.fd) != xyarray__max_x(xy) ||
1941                     xyarray__max_y(evsel->core.fd) != xyarray__max_y(xy)) {
1942                         pr_debug("Unmatched FD vs. sample ID: skip reading LOST count\n");
1943                         continue;
1944                 }
1945
1946                 for (int x = 0; x < xyarray__max_x(xy); x++) {
1947                         for (int y = 0; y < xyarray__max_y(xy); y++) {
1948                                 struct perf_counts_values count;
1949
1950                                 if (perf_evsel__read(&evsel->core, x, y, &count) < 0) {
1951                                         pr_debug("read LOST count failed\n");
1952                                         goto out;
1953                                 }
1954
1955                                 if (count.lost) {
1956                                         if (!lost) {
1957                                                 lost = zalloc(sizeof(*lost) +
1958                                                               session->machines.host.id_hdr_size);
1959                                                 if (!lost) {
1960                                                         pr_debug("Memory allocation failed\n");
1961                                                         return;
1962                                                 }
1963                                                 lost->header.type = PERF_RECORD_LOST_SAMPLES;
1964                                         }
1965                                         __record__save_lost_samples(rec, evsel, lost,
1966                                                                     x, y, count.lost, 0);
1967                                 }
1968                         }
1969                 }
1970
1971                 lost_count = perf_bpf_filter__lost_count(evsel);
1972                 if (lost_count) {
1973                         if (!lost) {
1974                                 lost = zalloc(sizeof(*lost) +
1975                                               session->machines.host.id_hdr_size);
1976                                 if (!lost) {
1977                                         pr_debug("Memory allocation failed\n");
1978                                         return;
1979                                 }
1980                                 lost->header.type = PERF_RECORD_LOST_SAMPLES;
1981                         }
1982                         __record__save_lost_samples(rec, evsel, lost, 0, 0, lost_count,
1983                                                     PERF_RECORD_MISC_LOST_SAMPLES_BPF);
1984                 }
1985         }
1986 out:
1987         free(lost);
1988 }
1989
1990 static volatile sig_atomic_t workload_exec_errno;
1991
1992 /*
1993  * evlist__prepare_workload will send a SIGUSR1
1994  * if the fork fails, since we asked for it by setting its
1995  * want_signal to true.
1996  */
1997 static void workload_exec_failed_signal(int signo __maybe_unused,
1998                                         siginfo_t *info,
1999                                         void *ucontext __maybe_unused)
2000 {
2001         workload_exec_errno = info->si_value.sival_int;
2002         done = 1;
2003         child_finished = 1;
2004 }
2005
2006 static void snapshot_sig_handler(int sig);
2007 static void alarm_sig_handler(int sig);
2008
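/*
 * Pick any mmapped ring-buffer control page.  perf_event__synth_time_conv()
 * presumably only needs it for the time conversion fields (time_shift,
 * time_mult, time_zero) that the kernel publishes in
 * struct perf_event_mmap_page.
 */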
2009 static const struct perf_event_mmap_page *evlist__pick_pc(struct evlist *evlist)
2010 {
2011         if (evlist) {
2012                 if (evlist->mmap && evlist->mmap[0].core.base)
2013                         return evlist->mmap[0].core.base;
2014                 if (evlist->overwrite_mmap && evlist->overwrite_mmap[0].core.base)
2015                         return evlist->overwrite_mmap[0].core.base;
2016         }
2017         return NULL;
2018 }
2019
2020 static const struct perf_event_mmap_page *record__pick_pc(struct record *rec)
2021 {
2022         const struct perf_event_mmap_page *pc = evlist__pick_pc(rec->evlist);
2023         if (pc)
2024                 return pc;
2025         return NULL;
2026 }
2027
2028 static int record__synthesize(struct record *rec, bool tail)
2029 {
2030         struct perf_session *session = rec->session;
2031         struct machine *machine = &session->machines.host;
2032         struct perf_data *data = &rec->data;
2033         struct record_opts *opts = &rec->opts;
2034         struct perf_tool *tool = &rec->tool;
2035         int err = 0;
2036         event_op f = process_synthesized_event;
2037
2038         if (rec->opts.tail_synthesize != tail)
2039                 return 0;
2040
2041         if (data->is_pipe) {
2042                 err = perf_event__synthesize_for_pipe(tool, session, data,
2043                                                       process_synthesized_event);
2044                 if (err < 0)
2045                         goto out;
2046
2047                 rec->bytes_written += err;
2048         }
2049
2050         err = perf_event__synth_time_conv(record__pick_pc(rec), tool,
2051                                           process_synthesized_event, machine);
2052         if (err)
2053                 goto out;
2054
2055         /* Synthesize id_index before auxtrace_info */
2056         err = perf_event__synthesize_id_index(tool,
2057                                               process_synthesized_event,
2058                                               session->evlist, machine);
2059         if (err)
2060                 goto out;
2061
2062         if (rec->opts.full_auxtrace) {
2063                 err = perf_event__synthesize_auxtrace_info(rec->itr, tool,
2064                                         session, process_synthesized_event);
2065                 if (err)
2066                         goto out;
2067         }
2068
2069         if (!evlist__exclude_kernel(rec->evlist)) {
2070                 err = perf_event__synthesize_kernel_mmap(tool, process_synthesized_event,
2071                                                          machine);
2072                 WARN_ONCE(err < 0, "Couldn't record kernel reference relocation symbol\n"
2073                                    "Symbol resolution may be skewed if relocation was used (e.g. kexec).\n"
2074                                    "Check /proc/kallsyms permission or run as root.\n");
2075
2076                 err = perf_event__synthesize_modules(tool, process_synthesized_event,
2077                                                      machine);
2078                 WARN_ONCE(err < 0, "Couldn't record kernel module information.\n"
2079                                    "Symbol resolution may be skewed if relocation was used (e.g. kexec).\n"
2080                                    "Check /proc/modules permission or run as root.\n");
2081         }
2082
2083         if (perf_guest) {
2084                 machines__process_guests(&session->machines,
2085                                          perf_event__synthesize_guest_os, tool);
2086         }
2087
2088         err = perf_event__synthesize_extra_attr(&rec->tool,
2089                                                 rec->evlist,
2090                                                 process_synthesized_event,
2091                                                 data->is_pipe);
2092         if (err)
2093                 goto out;
2094
2095         err = perf_event__synthesize_thread_map2(&rec->tool, rec->evlist->core.threads,
2096                                                  process_synthesized_event,
2097                                                 NULL);
2098         if (err < 0) {
2099                 pr_err("Couldn't synthesize thread map.\n");
2100                 return err;
2101         }
2102
2103         err = perf_event__synthesize_cpu_map(&rec->tool, rec->evlist->core.all_cpus,
2104                                              process_synthesized_event, NULL);
2105         if (err < 0) {
2106                 pr_err("Couldn't synthesize cpu map.\n");
2107                 return err;
2108         }
2109
2110         err = perf_event__synthesize_bpf_events(session, process_synthesized_event,
2111                                                 machine, opts);
2112         if (err < 0) {
2113                 pr_warning("Couldn't synthesize bpf events.\n");
2114                 err = 0;
2115         }
2116
2117         if (rec->opts.synth & PERF_SYNTH_CGROUP) {
2118                 err = perf_event__synthesize_cgroups(tool, process_synthesized_event,
2119                                                      machine);
2120                 if (err < 0) {
2121                         pr_warning("Couldn't synthesize cgroup events.\n");
2122                         err = 0;
2123                 }
2124         }
2125
2126         if (rec->opts.nr_threads_synthesize > 1) {
2127                 mutex_init(&synth_lock);
2128                 perf_set_multithreaded();
2129                 f = process_locked_synthesized_event;
2130         }
2131
2132         if (rec->opts.synth & PERF_SYNTH_TASK) {
2133                 bool needs_mmap = rec->opts.synth & PERF_SYNTH_MMAP;
2134
2135                 err = __machine__synthesize_threads(machine, tool, &opts->target,
2136                                                     rec->evlist->core.threads,
2137                                                     f, needs_mmap, opts->sample_address,
2138                                                     rec->opts.nr_threads_synthesize);
2139         }
2140
2141         if (rec->opts.nr_threads_synthesize > 1) {
2142                 perf_set_singlethreaded();
2143                 mutex_destroy(&synth_lock);
2144         }
2145
2146 out:
2147         return err;
2148 }
2149
2150 static int record__process_signal_event(union perf_event *event __maybe_unused, void *data)
2151 {
2152         struct record *rec = data;
2153         pthread_kill(rec->thread_id, SIGUSR2);
2154         return 0;
2155 }
2156
2157 static int record__setup_sb_evlist(struct record *rec)
2158 {
2159         struct record_opts *opts = &rec->opts;
2160
2161         if (rec->sb_evlist != NULL) {
2162                 /*
2163                  * We get here if --switch-output-event populated the
2164                  * sb_evlist, so associate a callback that will send a SIGUSR2
2165                  * to the main thread.
2166                  */
2167                 evlist__set_cb(rec->sb_evlist, record__process_signal_event, rec);
2168                 rec->thread_id = pthread_self();
2169         }
2170 #ifdef HAVE_LIBBPF_SUPPORT
2171         if (!opts->no_bpf_event) {
2172                 if (rec->sb_evlist == NULL) {
2173                         rec->sb_evlist = evlist__new();
2174
2175                         if (rec->sb_evlist == NULL) {
2176                                 pr_err("Couldn't create side band evlist.\n");
2177                                 return -1;
2178                         }
2179                 }
2180
2181                 if (evlist__add_bpf_sb_event(rec->sb_evlist, &rec->session->header.env)) {
2182                         pr_err("Couldn't ask for PERF_RECORD_BPF_EVENT side band events.\n");
2183                         return -1;
2184                 }
2185         }
2186 #endif
2187         if (evlist__start_sb_thread(rec->sb_evlist, &rec->opts.target)) {
2188                 pr_debug("Couldn't start the BPF side band thread:\nBPF programs starting from now on won't be annotatable\n");
2189                 opts->no_bpf_event = true;
2190         }
2191
2192         return 0;
2193 }
2194
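/*
 * Store a pair of reference timestamps taken back to back: one on the
 * sampling clock (clockid_ns) and one on the wall clock (tod_ns).  A later
 * consumer can then, roughly, convert a sample timestamp T to wall-clock
 * time as tod_ns + (T - clockid_ns), assuming the two reads happened close
 * enough together.
 */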
2195 static int record__init_clock(struct record *rec)
2196 {
2197         struct perf_session *session = rec->session;
2198         struct timespec ref_clockid;
2199         struct timeval ref_tod;
2200         u64 ref;
2201
2202         if (!rec->opts.use_clockid)
2203                 return 0;
2204
2205         if (rec->opts.use_clockid && rec->opts.clockid_res_ns)
2206                 session->header.env.clock.clockid_res_ns = rec->opts.clockid_res_ns;
2207
2208         session->header.env.clock.clockid = rec->opts.clockid;
2209
2210         if (gettimeofday(&ref_tod, NULL) != 0) {
2211                 pr_err("gettimeofday failed, cannot set reference time.\n");
2212                 return -1;
2213         }
2214
2215         if (clock_gettime(rec->opts.clockid, &ref_clockid)) {
2216                 pr_err("clock_gettime failed, cannot set reference time.\n");
2217                 return -1;
2218         }
2219
2220         ref = (u64) ref_tod.tv_sec * NSEC_PER_SEC +
2221               (u64) ref_tod.tv_usec * NSEC_PER_USEC;
2222
2223         session->header.env.clock.tod_ns = ref;
2224
2225         ref = (u64) ref_clockid.tv_sec * NSEC_PER_SEC +
2226               (u64) ref_clockid.tv_nsec;
2227
2228         session->header.env.clock.clockid_ns = ref;
2229         return 0;
2230 }
2231
2232 static void hit_auxtrace_snapshot_trigger(struct record *rec)
2233 {
2234         if (trigger_is_ready(&auxtrace_snapshot_trigger)) {
2235                 trigger_hit(&auxtrace_snapshot_trigger);
2236                 auxtrace_record__snapshot_started = 1;
2237                 if (auxtrace_record__snapshot_start(rec->itr))
2238                         trigger_error(&auxtrace_snapshot_trigger);
2239         }
2240 }
2241
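/*
 * Ask a worker (see record__thread()) to stop.  A sketch of the pipe
 * handshake, assuming both pipes were set up on the start path:
 *
 *	main thread                     worker
 *	close(msg[1])          --->     poll() sees POLLHUP on msg[0]
 *	                                drains its mmaps one last time
 *	read(ack[0])           <---     write(ack[1], THREAD_MSG__READY)
 */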
2242 static int record__terminate_thread(struct record_thread *thread_data)
2243 {
2244         int err;
2245         enum thread_msg ack = THREAD_MSG__UNDEFINED;
2246         pid_t tid = thread_data->tid;
2247
2248         close(thread_data->pipes.msg[1]);
2249         thread_data->pipes.msg[1] = -1;
2250         err = read(thread_data->pipes.ack[0], &ack, sizeof(ack));
2251         if (err > 0)
2252                 pr_debug2("threads[%d]: sent %s\n", tid, thread_msg_tags[ack]);
2253         else
2254                 pr_warning("threads[%d]: failed to receive termination notification from %d\n",
2255                            thread->tid, tid);
2256
2257         return 0;
2258 }
2259
2260 static int record__start_threads(struct record *rec)
2261 {
2262         int t, tt, err, ret = 0, nr_threads = rec->nr_threads;
2263         struct record_thread *thread_data = rec->thread_data;
2264         sigset_t full, mask;
2265         pthread_t handle;
2266         pthread_attr_t attrs;
2267
2268         thread = &thread_data[0];
2269
2270         if (!record__threads_enabled(rec))
2271                 return 0;
2272
2273         sigfillset(&full);
2274         if (sigprocmask(SIG_SETMASK, &full, &mask)) {
2275                 pr_err("Failed to block signals on threads start: %s\n", strerror(errno));
2276                 return -1;
2277         }
2278
2279         pthread_attr_init(&attrs);
2280         pthread_attr_setdetachstate(&attrs, PTHREAD_CREATE_DETACHED);
2281
2282         for (t = 1; t < nr_threads; t++) {
2283                 enum thread_msg msg = THREAD_MSG__UNDEFINED;
2284
2285 #ifdef HAVE_PTHREAD_ATTR_SETAFFINITY_NP
2286                 pthread_attr_setaffinity_np(&attrs,
2287                                             MMAP_CPU_MASK_BYTES(&(thread_data[t].mask->affinity)),
2288                                             (cpu_set_t *)(thread_data[t].mask->affinity.bits));
2289 #endif
2290                 if (pthread_create(&handle, &attrs, record__thread, &thread_data[t])) {
2291                         for (tt = 1; tt < t; tt++)
2292                                 record__terminate_thread(&thread_data[tt]);
2293                         pr_err("Failed to start threads: %s\n", strerror(errno));
2294                         ret = -1;
2295                         goto out_err;
2296                 }
2297
2298                 err = read(thread_data[t].pipes.ack[0], &msg, sizeof(msg));
2299                 if (err > 0)
2300                         pr_debug2("threads[%d]: sent %s\n", rec->thread_data[t].tid,
2301                                   thread_msg_tags[msg]);
2302                 else
2303                         pr_warning("threads[%d]: failed to receive start notification from %d\n",
2304                                    thread->tid, rec->thread_data[t].tid);
2305         }
2306
2307         sched_setaffinity(0, MMAP_CPU_MASK_BYTES(&thread->mask->affinity),
2308                         (cpu_set_t *)thread->mask->affinity.bits);
2309
2310         pr_debug("threads[%d]: started on cpu%d\n", thread->tid, sched_getcpu());
2311
2312 out_err:
2313         pthread_attr_destroy(&attrs);
2314
2315         if (sigprocmask(SIG_SETMASK, &mask, NULL)) {
2316                 pr_err("Failed to unblock signals on threads start: %s\n", strerror(errno));
2317                 ret = -1;
2318         }
2319
2320         return ret;
2321 }
2322
2323 static int record__stop_threads(struct record *rec)
2324 {
2325         int t;
2326         struct record_thread *thread_data = rec->thread_data;
2327
2328         for (t = 1; t < rec->nr_threads; t++)
2329                 record__terminate_thread(&thread_data[t]);
2330
2331         for (t = 0; t < rec->nr_threads; t++) {
2332                 rec->samples += thread_data[t].samples;
2333                 if (!record__threads_enabled(rec))
2334                         continue;
2335                 rec->session->bytes_transferred += thread_data[t].bytes_transferred;
2336                 rec->session->bytes_compressed += thread_data[t].bytes_compressed;
2337                 pr_debug("threads[%d]: samples=%lld, wakes=%ld, ", thread_data[t].tid,
2338                          thread_data[t].samples, thread_data[t].waking);
2339                 if (thread_data[t].bytes_transferred && thread_data[t].bytes_compressed)
2340                         pr_debug("transferred=%" PRIu64 ", compressed=%" PRIu64 "\n",
2341                                  thread_data[t].bytes_transferred, thread_data[t].bytes_compressed);
2342                 else
2343                         pr_debug("written=%" PRIu64 "\n", thread_data[t].bytes_written);
2344         }
2345
2346         return 0;
2347 }
2348
2349 static unsigned long record__waking(struct record *rec)
2350 {
2351         int t;
2352         unsigned long waking = 0;
2353         struct record_thread *thread_data = rec->thread_data;
2354
2355         for (t = 0; t < rec->nr_threads; t++)
2356                 waking += thread_data[t].waking;
2357
2358         return waking;
2359 }
2360
2361 static int __cmd_record(struct record *rec, int argc, const char **argv)
2362 {
2363         int err;
2364         int status = 0;
2365         const bool forks = argc > 0;
2366         struct perf_tool *tool = &rec->tool;
2367         struct record_opts *opts = &rec->opts;
2368         struct perf_data *data = &rec->data;
2369         struct perf_session *session;
2370         bool disabled = false, draining = false;
2371         int fd;
2372         float ratio = 0;
2373         enum evlist_ctl_cmd cmd = EVLIST_CTL_CMD_UNSUPPORTED;
2374
2375         atexit(record__sig_exit);
2376         signal(SIGCHLD, sig_handler);
2377         signal(SIGINT, sig_handler);
2378         signal(SIGTERM, sig_handler);
2379         signal(SIGSEGV, sigsegv_handler);
2380
2381         if (rec->opts.record_namespaces)
2382                 tool->namespace_events = true;
2383
2384         if (rec->opts.record_cgroup) {
2385 #ifdef HAVE_FILE_HANDLE
2386                 tool->cgroup_events = true;
2387 #else
2388                 pr_err("cgroup tracking is not supported\n");
2389                 return -1;
2390 #endif
2391         }
2392
2393         if (rec->opts.auxtrace_snapshot_mode || rec->switch_output.enabled) {
2394                 signal(SIGUSR2, snapshot_sig_handler);
2395                 if (rec->opts.auxtrace_snapshot_mode)
2396                         trigger_on(&auxtrace_snapshot_trigger);
2397                 if (rec->switch_output.enabled)
2398                         trigger_on(&switch_output_trigger);
2399         } else {
2400                 signal(SIGUSR2, SIG_IGN);
2401         }
2402
2403         session = perf_session__new(data, tool);
2404         if (IS_ERR(session)) {
2405                 pr_err("Perf session creation failed.\n");
2406                 return PTR_ERR(session);
2407         }
2408
2409         if (record__threads_enabled(rec)) {
2410                 if (perf_data__is_pipe(&rec->data)) {
2411                         pr_err("Parallel trace streaming is not available in pipe mode.\n");
2412                         return -1;
2413                 }
2414                 if (rec->opts.full_auxtrace) {
2415                         pr_err("Parallel trace streaming is not available in AUX area tracing mode.\n");
2416                         return -1;
2417                 }
2418         }
2419
2420         fd = perf_data__fd(data);
2421         rec->session = session;
2422
2423         if (zstd_init(&session->zstd_data, rec->opts.comp_level) < 0) {
2424                 pr_err("Compression initialization failed.\n");
2425                 return -1;
2426         }
2427 #ifdef HAVE_EVENTFD_SUPPORT
2428         done_fd = eventfd(0, EFD_NONBLOCK);
2429         if (done_fd < 0) {
2430                 pr_err("Failed to create wakeup eventfd, error: %m\n");
2431                 status = -1;
2432                 goto out_delete_session;
2433         }
2434         err = evlist__add_wakeup_eventfd(rec->evlist, done_fd);
2435         if (err < 0) {
2436                 pr_err("Failed to add wakeup eventfd to poll list\n");
2437                 status = err;
2438                 goto out_delete_session;
2439         }
2440 #endif // HAVE_EVENTFD_SUPPORT
2441
2442         session->header.env.comp_type  = PERF_COMP_ZSTD;
2443         session->header.env.comp_level = rec->opts.comp_level;
2444
2445         if (rec->opts.kcore &&
2446             !record__kcore_readable(&session->machines.host)) {
2447                 pr_err("ERROR: kcore is not readable.\n");
2448                 return -1;
2449         }
2450
2451         if (record__init_clock(rec))
2452                 return -1;
2453
2454         record__init_features(rec);
2455
2456         if (forks) {
2457                 err = evlist__prepare_workload(rec->evlist, &opts->target, argv, data->is_pipe,
2458                                                workload_exec_failed_signal);
2459                 if (err < 0) {
2460                         pr_err("Couldn't run the workload!\n");
2461                         status = err;
2462                         goto out_delete_session;
2463                 }
2464         }
2465
2466         /*
2467          * If we have just a single event and are sending data
2468          * through a pipe, we need to force ID allocation,
2469          * because we synthesize the event name through the pipe
2470          * and need the ID for that.
2471          */
2472         if (data->is_pipe && rec->evlist->core.nr_entries == 1)
2473                 rec->opts.sample_id = true;
2474
2475         if (rec->timestamp_filename && perf_data__is_pipe(data)) {
2476                 rec->timestamp_filename = false;
2477                 pr_warning("WARNING: --timestamp-filename option is not available in pipe mode.\n");
2478         }
2479
2480         evlist__uniquify_name(rec->evlist);
2481
2482         /* Debug message used by test scripts */
2483         pr_debug3("perf record opening and mmapping events\n");
2484         if (record__open(rec) != 0) {
2485                 err = -1;
2486                 goto out_free_threads;
2487         }
2488         /* Debug message used by test scripts */
2489         pr_debug3("perf record done opening and mmapping events\n");
2490         session->header.env.comp_mmap_len = session->evlist->core.mmap_len;
2491
2492         if (rec->opts.kcore) {
2493                 err = record__kcore_copy(&session->machines.host, data);
2494                 if (err) {
2495                         pr_err("ERROR: Failed to copy kcore\n");
2496                         goto out_free_threads;
2497                 }
2498         }
2499
2500         /*
2501          * Normally perf_session__new would do this, but it doesn't have the
2502          * evlist.
2503          */
2504         if (rec->tool.ordered_events && !evlist__sample_id_all(rec->evlist)) {
2505                 pr_warning("WARNING: No sample_id_all support, falling back to unordered processing\n");
2506                 rec->tool.ordered_events = false;
2507         }
2508
2509         if (evlist__nr_groups(rec->evlist) == 0)
2510                 perf_header__clear_feat(&session->header, HEADER_GROUP_DESC);
2511
2512         if (data->is_pipe) {
2513                 err = perf_header__write_pipe(fd);
2514                 if (err < 0)
2515                         goto out_free_threads;
2516         } else {
2517                 err = perf_session__write_header(session, rec->evlist, fd, false);
2518                 if (err < 0)
2519                         goto out_free_threads;
2520         }
2521
2522         err = -1;
2523         if (!rec->no_buildid
2524             && !perf_header__has_feat(&session->header, HEADER_BUILD_ID)) {
2525                 pr_err("Couldn't generate buildids. "
2526                        "Use --no-buildid to profile anyway.\n");
2527                 goto out_free_threads;
2528         }
2529
2530         err = record__setup_sb_evlist(rec);
2531         if (err)
2532                 goto out_free_threads;
2533
2534         err = record__synthesize(rec, false);
2535         if (err < 0)
2536                 goto out_free_threads;
2537
2538         if (rec->realtime_prio) {
2539                 struct sched_param param;
2540
2541                 param.sched_priority = rec->realtime_prio;
2542                 if (sched_setscheduler(0, SCHED_FIFO, &param)) {
2543                         pr_err("Could not set realtime priority.\n");
2544                         err = -1;
2545                         goto out_free_threads;
2546                 }
2547         }
2548
2549         if (record__start_threads(rec))
2550                 goto out_free_threads;
2551
2552         /*
2553          * When perf is starting the traced process, all the events
2554          * (apart from group members) have enable_on_exec=1 set,
2555          * so don't spoil it by prematurely enabling them.
2556          */
2557         if (!target__none(&opts->target) && !opts->target.initial_delay)
2558                 evlist__enable(rec->evlist);
2559
2560         /*
2561          * Let the child rip
2562          */
2563         if (forks) {
2564                 struct machine *machine = &session->machines.host;
2565                 union perf_event *event;
2566                 pid_t tgid;
2567
2568                 event = malloc(sizeof(event->comm) + machine->id_hdr_size);
2569                 if (event == NULL) {
2570                         err = -ENOMEM;
2571                         goto out_child;
2572                 }
2573
2574                 /*
2575                  * Some H/W events are generated before the COMM event,
2576                  * which is emitted during exec(), so perf script
2577                  * cannot see a correct process name for those events.
2578                  * Synthesize a COMM event to prevent that.
2579                  */
2580                 tgid = perf_event__synthesize_comm(tool, event,
2581                                                    rec->evlist->workload.pid,
2582                                                    process_synthesized_event,
2583                                                    machine);
2584                 free(event);
2585
2586                 if (tgid == -1)
2587                         goto out_child;
2588
2589                 event = malloc(sizeof(event->namespaces) +
2590                                (NR_NAMESPACES * sizeof(struct perf_ns_link_info)) +
2591                                machine->id_hdr_size);
2592                 if (event == NULL) {
2593                         err = -ENOMEM;
2594                         goto out_child;
2595                 }
2596
2597                 /*
2598                  * Synthesize NAMESPACES event for the command specified.
2599                  */
2600                 perf_event__synthesize_namespaces(tool, event,
2601                                                   rec->evlist->workload.pid,
2602                                                   tgid, process_synthesized_event,
2603                                                   machine);
2604                 free(event);
2605
2606                 evlist__start_workload(rec->evlist);
2607         }
2608
2609         if (opts->target.initial_delay) {
2610                 pr_info(EVLIST_DISABLED_MSG);
2611                 if (opts->target.initial_delay > 0) {
2612                         usleep(opts->target.initial_delay * USEC_PER_MSEC);
2613                         evlist__enable(rec->evlist);
2614                         pr_info(EVLIST_ENABLED_MSG);
2615                 }
2616         }
2617
2618         err = event_enable_timer__start(rec->evlist->eet);
2619         if (err)
2620                 goto out_child;
2621
2622         /* Debug message used by test scripts */
2623         pr_debug3("perf record has started\n");
2624         fflush(stderr);
2625
2626         trigger_ready(&auxtrace_snapshot_trigger);
2627         trigger_ready(&switch_output_trigger);
2628         perf_hooks__invoke_record_start();
2629
2630         /*
2631          * Must write FINISHED_INIT so it will be seen after all other
2632          * synthesized user events, but before any regular events.
2633          */
2634         err = write_finished_init(rec, false);
2635         if (err < 0)
2636                 goto out_child;
2637
2638         for (;;) {
2639                 unsigned long long hits = thread->samples;
2640
2641                 /*
2642                  * rec->evlist->bkw_mmap_state may be BKW_MMAP_EMPTY
2643                  * here: when done == true and hits != rec->samples
2644                  * in the previous round.
2645                  *
2646                  * evlist__toggle_bkw_mmap ensures we never convert
2647                  * BKW_MMAP_EMPTY to BKW_MMAP_DATA_PENDING.
2648                  */
2649                 if (trigger_is_hit(&switch_output_trigger) || done || draining)
2650                         evlist__toggle_bkw_mmap(rec->evlist, BKW_MMAP_DATA_PENDING);
2651
2652                 if (record__mmap_read_all(rec, false) < 0) {
2653                         trigger_error(&auxtrace_snapshot_trigger);
2654                         trigger_error(&switch_output_trigger);
2655                         err = -1;
2656                         goto out_child;
2657                 }
2658
2659                 if (auxtrace_record__snapshot_started) {
2660                         auxtrace_record__snapshot_started = 0;
2661                         if (!trigger_is_error(&auxtrace_snapshot_trigger))
2662                                 record__read_auxtrace_snapshot(rec, false);
2663                         if (trigger_is_error(&auxtrace_snapshot_trigger)) {
2664                                 pr_err("AUX area tracing snapshot failed\n");
2665                                 err = -1;
2666                                 goto out_child;
2667                         }
2668                 }
2669
2670                 if (trigger_is_hit(&switch_output_trigger)) {
2671                         /*
2672                          * If switch_output_trigger is hit, the data in the
2673                          * overwritable ring buffer should have been collected,
2674                          * so bkw_mmap_state should be set to BKW_MMAP_EMPTY.
2675                          *
2676                          * If SIGUSR2 was raised after or during record__mmap_read_all(),
2677                          * record__mmap_read_all() didn't collect data from the
2678                          * overwritable ring buffer. Read again.
2679                          */
2680                         if (rec->evlist->bkw_mmap_state == BKW_MMAP_RUNNING)
2681                                 continue;
2682                         trigger_ready(&switch_output_trigger);
2683
2684                         /*
2685                          * Reenable events in the overwrite ring buffer after
2686                          * record__mmap_read_all(): we should have collected
2687                          * data from it.
2688                          */
2689                         evlist__toggle_bkw_mmap(rec->evlist, BKW_MMAP_RUNNING);
2690
2691                         if (!quiet)
2692                                 fprintf(stderr, "[ perf record: dump data: Woken up %ld times ]\n",
2693                                         record__waking(rec));
2694                         thread->waking = 0;
2695                         fd = record__switch_output(rec, false);
2696                         if (fd < 0) {
2697                                 pr_err("Failed to switch to new file\n");
2698                                 trigger_error(&switch_output_trigger);
2699                                 err = fd;
2700                                 goto out_child;
2701                         }
2702
2703                         /* re-arm the alarm */
2704                         if (rec->switch_output.time)
2705                                 alarm(rec->switch_output.time);
2706                 }
2707
2708                 if (hits == thread->samples) {
2709                         if (done || draining)
2710                                 break;
2711                         err = fdarray__poll(&thread->pollfd, -1);
2712                         /*
2713                          * Propagate the error only if there is one. Ignore a
2714                          * positive number of returned events and EINTR.
2715                          */
2716                         if (err > 0 || (err < 0 && errno == EINTR))
2717                                 err = 0;
2718                         thread->waking++;
2719
2720                         if (fdarray__filter(&thread->pollfd, POLLERR | POLLHUP,
2721                                             record__thread_munmap_filtered, NULL) == 0)
2722                                 draining = true;
2723
2724                         err = record__update_evlist_pollfd_from_thread(rec, rec->evlist, thread);
2725                         if (err)
2726                                 goto out_child;
2727                 }
2728
2729                 if (evlist__ctlfd_process(rec->evlist, &cmd) > 0) {
2730                         switch (cmd) {
2731                         case EVLIST_CTL_CMD_SNAPSHOT:
2732                                 hit_auxtrace_snapshot_trigger(rec);
2733                                 evlist__ctlfd_ack(rec->evlist);
2734                                 break;
2735                         case EVLIST_CTL_CMD_STOP:
2736                                 done = 1;
2737                                 break;
2738                         case EVLIST_CTL_CMD_ACK:
2739                         case EVLIST_CTL_CMD_UNSUPPORTED:
2740                         case EVLIST_CTL_CMD_ENABLE:
2741                         case EVLIST_CTL_CMD_DISABLE:
2742                         case EVLIST_CTL_CMD_EVLIST:
2743                         case EVLIST_CTL_CMD_PING:
2744                         default:
2745                                 break;
2746                         }
2747                 }
2748
2749                 err = event_enable_timer__process(rec->evlist->eet);
2750                 if (err < 0)
2751                         goto out_child;
2752                 if (err) {
2753                         err = 0;
2754                         done = 1;
2755                 }
2756
2757                 /*
2758                  * When perf is starting the traced process, the events die
2759                  * with the process at the end and we wait for that. Thus
2760                  * there is no need to disable events in this case.
2761                  */
2762                 if (done && !disabled && !target__none(&opts->target)) {
2763                         trigger_off(&auxtrace_snapshot_trigger);
2764                         evlist__disable(rec->evlist);
2765                         disabled = true;
2766                 }
2767         }
2768
2769         trigger_off(&auxtrace_snapshot_trigger);
2770         trigger_off(&switch_output_trigger);
2771
2772         if (opts->auxtrace_snapshot_on_exit)
2773                 record__auxtrace_snapshot_exit(rec);
2774
2775         if (forks && workload_exec_errno) {
2776                 char msg[STRERR_BUFSIZE], strevsels[2048];
2777                 const char *emsg = str_error_r(workload_exec_errno, msg, sizeof(msg));
2778
2779                 evlist__scnprintf_evsels(rec->evlist, sizeof(strevsels), strevsels);
2780
2781                 pr_err("Failed to collect '%s' for the '%s' workload: %s\n",
2782                         strevsels, argv[0], emsg);
2783                 err = -1;
2784                 goto out_child;
2785         }
2786
2787         if (!quiet)
2788                 fprintf(stderr, "[ perf record: Woken up %ld times to write data ]\n",
2789                         record__waking(rec));
2790
2791         write_finished_init(rec, true);
2792
2793         if (target__none(&rec->opts.target))
2794                 record__synthesize_workload(rec, true);
2795
2796 out_child:
2797         record__stop_threads(rec);
2798         record__mmap_read_all(rec, true);
2799 out_free_threads:
2800         record__free_thread_data(rec);
2801         evlist__finalize_ctlfd(rec->evlist);
2802         record__aio_mmap_read_sync(rec);
2803
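        /*
         * bytes_transferred counts the uncompressed data handed to the
         * compressor and bytes_compressed what actually hit the file (see
         * zstd_compress()), so e.g. 100 MB in and 25 MB out reports a
         * compression ratio of 4.000 below.
         */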
2804         if (rec->session->bytes_transferred && rec->session->bytes_compressed) {
2805                 ratio = (float)rec->session->bytes_transferred/(float)rec->session->bytes_compressed;
2806                 session->header.env.comp_ratio = ratio + 0.5;
2807         }
2808
2809         if (forks) {
2810                 int exit_status;
2811
2812                 if (!child_finished)
2813                         kill(rec->evlist->workload.pid, SIGTERM);
2814
2815                 wait(&exit_status);
2816
2817                 if (err < 0)
2818                         status = err;
2819                 else if (WIFEXITED(exit_status))
2820                         status = WEXITSTATUS(exit_status);
2821                 else if (WIFSIGNALED(exit_status))
2822                         signr = WTERMSIG(exit_status);
2823         } else
2824                 status = err;
2825
2826         if (rec->off_cpu)
2827                 rec->bytes_written += off_cpu_write(rec->session);
2828
2829         record__read_lost_samples(rec);
2830         record__synthesize(rec, true);
2831         /* this will be recalculated during process_buildids() */
2832         rec->samples = 0;
2833
2834         if (!err) {
2835                 if (!rec->timestamp_filename) {
2836                         record__finish_output(rec);
2837                 } else {
2838                         fd = record__switch_output(rec, true);
2839                         if (fd < 0) {
2840                                 status = fd;
2841                                 goto out_delete_session;
2842                         }
2843                 }
2844         }
2845
2846         perf_hooks__invoke_record_end();
2847
2848         if (!err && !quiet) {
2849                 char samples[128];
2850                 const char *postfix = rec->timestamp_filename ?
2851                                         ".<timestamp>" : "";
2852
2853                 if (rec->samples && !rec->opts.full_auxtrace)
2854                         scnprintf(samples, sizeof(samples),
2855                                   " (%" PRIu64 " samples)", rec->samples);
2856                 else
2857                         samples[0] = '\0';
2858
2859                 fprintf(stderr, "[ perf record: Captured and wrote %.3f MB %s%s%s",
2860                         perf_data__size(data) / 1024.0 / 1024.0,
2861                         data->path, postfix, samples);
2862                 if (ratio) {
2863                         fprintf(stderr, ", compressed (original %.3f MB, ratio is %.3f)",
2864                                         rec->session->bytes_transferred / 1024.0 / 1024.0,
2865                                         ratio);
2866                 }
2867                 fprintf(stderr, " ]\n");
2868         }
2869
2870 out_delete_session:
2871 #ifdef HAVE_EVENTFD_SUPPORT
2872         if (done_fd >= 0) {
2873                 fd = done_fd;
2874                 done_fd = -1;
2875
2876                 close(fd);
2877         }
2878 #endif
2879         zstd_fini(&session->zstd_data);
2880         perf_session__delete(session);
2881
2882         if (!opts->no_bpf_event)
2883                 evlist__stop_sb_thread(rec->sb_evlist);
2884         return status;
2885 }
2886
2887 static void callchain_debug(struct callchain_param *callchain)
2888 {
2889         static const char *str[CALLCHAIN_MAX] = { "NONE", "FP", "DWARF", "LBR" };
2890
2891         pr_debug("callchain: type %s\n", str[callchain->record_mode]);
2892
2893         if (callchain->record_mode == CALLCHAIN_DWARF)
2894                 pr_debug("callchain: stack dump size %d\n",
2895                          callchain->dump_size);
2896 }
2897
2898 int record_opts__parse_callchain(struct record_opts *record,
2899                                  struct callchain_param *callchain,
2900                                  const char *arg, bool unset)
2901 {
2902         int ret;
2903         callchain->enabled = !unset;
2904
2905         /* --no-call-graph */
2906         if (unset) {
2907                 callchain->record_mode = CALLCHAIN_NONE;
2908                 pr_debug("callchain: disabled\n");
2909                 return 0;
2910         }
2911
2912         ret = parse_callchain_record_opt(arg, callchain);
2913         if (!ret) {
2914                 /* Enable data address sampling for DWARF unwind. */
2915                 if (callchain->record_mode == CALLCHAIN_DWARF)
2916                         record->sample_address = true;
2917                 callchain_debug(callchain);
2918         }
2919
2920         return ret;
2921 }
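
/*
 * Illustrative usage of the option parsed above (the workload name and the
 * dump size are placeholders):
 *
 *   perf record --call-graph=dwarf,8192 -- ./workload
 *
 * picks DWARF based unwinding with an 8192 byte stack dump per sample; as
 * the code shows, DWARF mode also turns on data address sampling
 * (record->sample_address).
 */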
2922
2923 int record_parse_callchain_opt(const struct option *opt,
2924                                const char *arg,
2925                                int unset)
2926 {
2927         return record_opts__parse_callchain(opt->value, &callchain_param, arg, unset);
2928 }
2929
2930 int record_callchain_opt(const struct option *opt,
2931                          const char *arg __maybe_unused,
2932                          int unset __maybe_unused)
2933 {
2934         struct callchain_param *callchain = opt->value;
2935
2936         callchain->enabled = true;
2937
2938         if (callchain->record_mode == CALLCHAIN_NONE)
2939                 callchain->record_mode = CALLCHAIN_FP;
2940
2941         callchain_debug(callchain);
2942         return 0;
2943 }
2944
2945 static int perf_record_config(const char *var, const char *value, void *cb)
2946 {
2947         struct record *rec = cb;
2948
2949         if (!strcmp(var, "record.build-id")) {
2950                 if (!strcmp(value, "cache"))
2951                         rec->no_buildid_cache = false;
2952                 else if (!strcmp(value, "no-cache"))
2953                         rec->no_buildid_cache = true;
2954                 else if (!strcmp(value, "skip"))
2955                         rec->no_buildid = true;
2956                 else if (!strcmp(value, "mmap"))
2957                         rec->buildid_mmap = true;
2958                 else
2959                         return -1;
2960                 return 0;
2961         }
2962         if (!strcmp(var, "record.call-graph")) {
2963                 var = "call-graph.record-mode";
2964                 return perf_default_config(var, value, cb);
2965         }
2966 #ifdef HAVE_AIO_SUPPORT
2967         if (!strcmp(var, "record.aio")) {
2968                 rec->opts.nr_cblocks = strtol(value, NULL, 0);
2969                 if (!rec->opts.nr_cblocks)
2970                         rec->opts.nr_cblocks = nr_cblocks_default;
2971         }
2972 #endif
2973         if (!strcmp(var, "record.debuginfod")) {
2974                 rec->debuginfod.urls = strdup(value);
2975                 if (!rec->debuginfod.urls)
2976                         return -ENOMEM;
2977                 rec->debuginfod.set = true;
2978         }
2979
2980         return 0;
2981 }
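
/*
 * The keys handled above map to ~/.perfconfig entries; a sketch (the
 * debuginfod URL is just an example value):
 *
 *   [record]
 *       build-id = mmap        # cache | no-cache | skip | mmap
 *       call-graph = dwarf     # forwarded as call-graph.record-mode
 *       aio = 4                # only with HAVE_AIO_SUPPORT
 *       debuginfod = https://debuginfod.example.com
 */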
2982
2983 static int record__parse_event_enable_time(const struct option *opt, const char *str, int unset)
2984 {
2985         struct record *rec = (struct record *)opt->value;
2986
2987         return evlist__parse_event_enable_time(rec->evlist, &rec->opts, str, unset);
2988 }
2989
2990 static int record__parse_affinity(const struct option *opt, const char *str, int unset)
2991 {
2992         struct record_opts *opts = (struct record_opts *)opt->value;
2993
2994         if (unset || !str)
2995                 return 0;
2996
2997         if (!strcasecmp(str, "node"))
2998                 opts->affinity = PERF_AFFINITY_NODE;
2999         else if (!strcasecmp(str, "cpu"))
3000                 opts->affinity = PERF_AFFINITY_CPU;
3001
3002         return 0;
3003 }
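
/*
 * Example: '--affinity=node' binds the trace reading thread to the NUMA node
 * of the mmap buffer being processed and '--affinity=cpu' to that buffer's
 * CPU; any other value leaves the PERF_AFFINITY_SYS default untouched.
 */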
3004
3005 static int record__mmap_cpu_mask_alloc(struct mmap_cpu_mask *mask, int nr_bits)
3006 {
3007         mask->nbits = nr_bits;
3008         mask->bits = bitmap_zalloc(mask->nbits);
3009         if (!mask->bits)
3010                 return -ENOMEM;
3011
3012         return 0;
3013 }
3014
3015 static void record__mmap_cpu_mask_free(struct mmap_cpu_mask *mask)
3016 {
3017         bitmap_free(mask->bits);
3018         mask->nbits = 0;
3019 }
3020
3021 static int record__thread_mask_alloc(struct thread_mask *mask, int nr_bits)
3022 {
3023         int ret;
3024
3025         ret = record__mmap_cpu_mask_alloc(&mask->maps, nr_bits);
3026         if (ret) {
3027                 mask->affinity.bits = NULL;
3028                 return ret;
3029         }
3030
3031         ret = record__mmap_cpu_mask_alloc(&mask->affinity, nr_bits);
3032         if (ret) {
3033                 record__mmap_cpu_mask_free(&mask->maps);
3034                 mask->maps.bits = NULL;
3035         }
3036
3037         return ret;
3038 }
3039
3040 static void record__thread_mask_free(struct thread_mask *mask)
3041 {
3042         record__mmap_cpu_mask_free(&mask->maps);
3043         record__mmap_cpu_mask_free(&mask->affinity);
3044 }
3045
3046 static int record__parse_threads(const struct option *opt, const char *str, int unset)
3047 {
3048         int s;
3049         struct record_opts *opts = opt->value;
3050
3051         if (unset || !str || !strlen(str)) {
3052                 opts->threads_spec = THREAD_SPEC__CPU;
3053         } else {
3054                 for (s = 1; s < THREAD_SPEC__MAX; s++) {
3055                         if (s == THREAD_SPEC__USER) {
3056                                 opts->threads_user_spec = strdup(str);
3057                                 if (!opts->threads_user_spec)
3058                                         return -ENOMEM;
3059                                 opts->threads_spec = THREAD_SPEC__USER;
3060                                 break;
3061                         }
3062                         if (!strncasecmp(str, thread_spec_tags[s], strlen(thread_spec_tags[s]))) {
3063                                 opts->threads_spec = s;
3064                                 break;
3065                         }
3066                 }
3067         }
3068
3069         if (opts->threads_spec == THREAD_SPEC__USER)
3070                 pr_debug("threads_spec: %s\n", opts->threads_user_spec);
3071         else
3072                 pr_debug("threads_spec: %s\n", thread_spec_tags[opts->threads_spec]);
3073
3074         return 0;
3075 }
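
/*
 * Illustrative spellings accepted here: a bare '--threads' falls back to the
 * 'cpu' spec, '--threads=core', 'package' or 'numa' select a topology based
 * split, and anything else is kept as a user spec of the form
 * <maps cpus>/<affinity cpus>[:<maps cpus>/<affinity cpus>...], e.g.
 * '--threads=0-3/0:4-7/4', parsed later by record__init_thread_user_masks().
 */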
3076
3077 static int parse_output_max_size(const struct option *opt,
3078                                  const char *str, int unset)
3079 {
3080         unsigned long *s = (unsigned long *)opt->value;
3081         static struct parse_tag tags_size[] = {
3082                 { .tag  = 'B', .mult = 1       },
3083                 { .tag  = 'K', .mult = 1 << 10 },
3084                 { .tag  = 'M', .mult = 1 << 20 },
3085                 { .tag  = 'G', .mult = 1 << 30 },
3086                 { .tag  = 0 },
3087         };
3088         unsigned long val;
3089
3090         if (unset) {
3091                 *s = 0;
3092                 return 0;
3093         }
3094
3095         val = parse_tag_value(str, tags_size);
3096         if (val != (unsigned long) -1) {
3097                 *s = val;
3098                 return 0;
3099         }
3100
3101         return -1;
3102 }
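
/*
 * Example: '--max-size=1G' (suffixes B/K/M/G, per tags_size above) stops the
 * recording once the output file grows past roughly 1 GiB.
 */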
3103
3104 static int record__parse_mmap_pages(const struct option *opt,
3105                                     const char *str,
3106                                     int unset __maybe_unused)
3107 {
3108         struct record_opts *opts = opt->value;
3109         char *s, *p;
3110         unsigned int mmap_pages;
3111         int ret;
3112
3113         if (!str)
3114                 return -EINVAL;
3115
3116         s = strdup(str);
3117         if (!s)
3118                 return -ENOMEM;
3119
3120         p = strchr(s, ',');
3121         if (p)
3122                 *p = '\0';
3123
3124         if (*s) {
3125                 ret = __evlist__parse_mmap_pages(&mmap_pages, s);
3126                 if (ret)
3127                         goto out_free;
3128                 opts->mmap_pages = mmap_pages;
3129         }
3130
3131         if (!p) {
3132                 ret = 0;
3133                 goto out_free;
3134         }
3135
3136         ret = __evlist__parse_mmap_pages(&mmap_pages, p + 1);
3137         if (ret)
3138                 goto out_free;
3139
3140         opts->auxtrace_mmap_pages = mmap_pages;
3141
3142 out_free:
3143         free(s);
3144         return ret;
3145 }
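
/*
 * Example: '-m 512,128' requests 512 data mmap pages and 128 AUX area
 * tracing pages per buffer; with no comma only opts->mmap_pages is set, and
 * with a leading comma (e.g. '-m ,128') only the AUX area size changes.
 */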
3146
3147 void __weak arch__add_leaf_frame_record_opts(struct record_opts *opts __maybe_unused)
3148 {
3149 }
3150
3151 static int parse_control_option(const struct option *opt,
3152                                 const char *str,
3153                                 int unset __maybe_unused)
3154 {
3155         struct record_opts *opts = opt->value;
3156
3157         return evlist__parse_control(str, &opts->ctl_fd, &opts->ctl_fd_ack, &opts->ctl_fd_close);
3158 }
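
/*
 * A sketch of the fifo form of --control (file names are placeholders,
 * typically combined with '-D -1' so events start disabled):
 *
 *   mkfifo ctl.fifo ack.fifo
 *   perf record -D -1 --control=fifo:ctl.fifo,ack.fifo -a &
 *   echo enable  > ctl.fifo      # start counting
 *   echo disable > ctl.fifo      # stop counting
 *
 * The 'fd:ctl-fd[,ack-fd]' form works the same way with already open file
 * descriptors; see the option help further down for the full command list.
 */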
3159
3160 static void switch_output_size_warn(struct record *rec)
3161 {
3162         u64 wakeup_size = evlist__mmap_size(rec->opts.mmap_pages);
3163         struct switch_output *s = &rec->switch_output;
3164
3165         wakeup_size /= 2;
3166
3167         if (s->size < wakeup_size) {
3168                 char buf[100];
3169
3170                 unit_number__scnprintf(buf, sizeof(buf), wakeup_size);
3171                 pr_warning("WARNING: switch-output data size lower than "
3172                            "wakeup kernel buffer size (%s), "
3173                            "expect bigger perf.data sizes\n", buf);
3174         }
3175 }
3176
3177 static int switch_output_setup(struct record *rec)
3178 {
3179         struct switch_output *s = &rec->switch_output;
3180         static struct parse_tag tags_size[] = {
3181                 { .tag  = 'B', .mult = 1       },
3182                 { .tag  = 'K', .mult = 1 << 10 },
3183                 { .tag  = 'M', .mult = 1 << 20 },
3184                 { .tag  = 'G', .mult = 1 << 30 },
3185                 { .tag  = 0 },
3186         };
3187         static struct parse_tag tags_time[] = {
3188                 { .tag  = 's', .mult = 1        },
3189                 { .tag  = 'm', .mult = 60       },
3190                 { .tag  = 'h', .mult = 60*60    },
3191                 { .tag  = 'd', .mult = 60*60*24 },
3192                 { .tag  = 0 },
3193         };
3194         unsigned long val;
3195
3196         /*
3197          * If we're using --switch-output-event, then we imply
3198          * --switch-output=signal, as we'll send a SIGUSR2 from the side band
3199          * thread to its parent.
3200          */
3201         if (rec->switch_output_event_set) {
3202                 if (record__threads_enabled(rec)) {
3203                         pr_warning("WARNING: --switch-output-event option is not available in parallel streaming mode.\n");
3204                         return 0;
3205                 }
3206                 goto do_signal;
3207         }
3208
3209         if (!s->set)
3210                 return 0;
3211
3212         if (record__threads_enabled(rec)) {
3213                 pr_warning("WARNING: --switch-output option is not available in parallel streaming mode.\n");
3214                 return 0;
3215         }
3216
3217         if (!strcmp(s->str, "signal")) {
3218 do_signal:
3219                 s->signal = true;
3220                 pr_debug("switch-output with SIGUSR2 signal\n");
3221                 goto enabled;
3222         }
3223
3224         val = parse_tag_value(s->str, tags_size);
3225         if (val != (unsigned long) -1) {
3226                 s->size = val;
3227                 pr_debug("switch-output with %s size threshold\n", s->str);
3228                 goto enabled;
3229         }
3230
3231         val = parse_tag_value(s->str, tags_time);
3232         if (val != (unsigned long) -1) {
3233                 s->time = val;
3234                 pr_debug("switch-output with %s time threshold (%lu seconds)\n",
3235                          s->str, s->time);
3236                 goto enabled;
3237         }
3238
3239         return -1;
3240
3241 enabled:
3242         rec->timestamp_filename = true;
3243         s->enabled              = true;
3244
3245         if (s->size && !rec->opts.no_buffering)
3246                 switch_output_size_warn(rec);
3247
3248         return 0;
3249 }
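
/*
 * Examples of the forms accepted above (sizes and times are illustrative):
 *
 *   --switch-output  /  --switch-output=signal    rotate on SIGUSR2
 *   --switch-output=100M                          rotate every ~100 MB written
 *   --switch-output=30s                           rotate every 30 seconds
 *
 * Every form also sets rec->timestamp_filename, so each chunk ends up in its
 * own perf.data.<timestamp> file.
 */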
3250
3251 static const char * const __record_usage[] = {
3252         "perf record [<options>] [<command>]",
3253         "perf record [<options>] -- <command> [<options>]",
3254         NULL
3255 };
3256 const char * const *record_usage = __record_usage;
3257
3258 static int build_id__process_mmap(struct perf_tool *tool, union perf_event *event,
3259                                   struct perf_sample *sample, struct machine *machine)
3260 {
3261         /*
3262          * We already have the kernel maps, put in place via perf_session__create_kernel_maps(),
3263          * so there is no need to add them twice.
3264          */
3265         if (!(event->header.misc & PERF_RECORD_MISC_USER))
3266                 return 0;
3267         return perf_event__process_mmap(tool, event, sample, machine);
3268 }
3269
3270 static int build_id__process_mmap2(struct perf_tool *tool, union perf_event *event,
3271                                    struct perf_sample *sample, struct machine *machine)
3272 {
3273         /*
3274          * We already have the kernel maps, put in place via perf_session__create_kernel_maps(),
3275          * so there is no need to add them twice.
3276          */
3277         if (!(event->header.misc & PERF_RECORD_MISC_USER))
3278                 return 0;
3279
3280         return perf_event__process_mmap2(tool, event, sample, machine);
3281 }
3282
3283 static int process_timestamp_boundary(struct perf_tool *tool,
3284                                       union perf_event *event __maybe_unused,
3285                                       struct perf_sample *sample,
3286                                       struct machine *machine __maybe_unused)
3287 {
3288         struct record *rec = container_of(tool, struct record, tool);
3289
3290         set_timestamp_boundary(rec, sample->time);
3291         return 0;
3292 }
3293
3294 static int parse_record_synth_option(const struct option *opt,
3295                                      const char *str,
3296                                      int unset __maybe_unused)
3297 {
3298         struct record_opts *opts = opt->value;
3299         char *p = strdup(str);
3300
3301         if (p == NULL)
3302                 return -1;
3303
3304         opts->synth = parse_synth_opt(p);
3305         free(p);
3306
3307         if (opts->synth < 0) {
3308                 pr_err("Invalid synth option: %s\n", str);
3309                 return -1;
3310         }
3311         return 0;
3312 }
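
/*
 * Example: '--synth=no' disables synthesis of records for pre-existing
 * tasks/mmaps/cgroups, while the default behaves like '--synth=all'
 * (PERF_SYNTH_ALL in the struct record initializer below).
 */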
3313
3314 /*
3315  * XXX Ideally this would be local to cmd_record() and passed to a record__new(),
3316  * because we need to have access to it in record__exit(), which is called
3317  * after cmd_record() exits, but since record_options needs to be accessible to
3318  * builtin-script, leave it here.
3319  *
3320  * At least we don't touch it in all the other functions here directly.
3321  *
3322  * Just say no to tons of global variables, sigh.
3323  */
3324 static struct record record = {
3325         .opts = {
3326                 .sample_time         = true,
3327                 .mmap_pages          = UINT_MAX,
3328                 .user_freq           = UINT_MAX,
3329                 .user_interval       = ULLONG_MAX,
3330                 .freq                = 4000,
3331                 .target              = {
3332                         .uses_mmap   = true,
3333                         .default_per_cpu = true,
3334                 },
3335                 .mmap_flush          = MMAP_FLUSH_DEFAULT,
3336                 .nr_threads_synthesize = 1,
3337                 .ctl_fd              = -1,
3338                 .ctl_fd_ack          = -1,
3339                 .synth               = PERF_SYNTH_ALL,
3340         },
3341         .tool = {
3342                 .sample         = process_sample_event,
3343                 .fork           = perf_event__process_fork,
3344                 .exit           = perf_event__process_exit,
3345                 .comm           = perf_event__process_comm,
3346                 .namespaces     = perf_event__process_namespaces,
3347                 .mmap           = build_id__process_mmap,
3348                 .mmap2          = build_id__process_mmap2,
3349                 .itrace_start   = process_timestamp_boundary,
3350                 .aux            = process_timestamp_boundary,
3351                 .ordered_events = true,
3352         },
3353 };
3354
3355 const char record_callchain_help[] = CALLCHAIN_RECORD_HELP
3356         "\n\t\t\t\tDefault: fp";
3357
3358 static bool dry_run;
3359
3360 static struct parse_events_option_args parse_events_option_args = {
3361         .evlistp = &record.evlist,
3362 };
3363
3364 static struct parse_events_option_args switch_output_parse_events_option_args = {
3365         .evlistp = &record.sb_evlist,
3366 };
3367
3368 /*
3369  * XXX Will stay a global variable till we fix builtin-script.c to stop messing
3370  * with it and switch to using the library functions in perf_evlist that came
3371  * from builtin-record.c, i.e. use record_opts,
3372  * evlist__prepare_workload, etc. instead of fork+exec'ing 'perf record',
3373  * using pipes, etc.
3374  */
3375 static struct option __record_options[] = {
3376         OPT_CALLBACK('e', "event", &parse_events_option_args, "event",
3377                      "event selector. use 'perf list' to list available events",
3378                      parse_events_option),
3379         OPT_CALLBACK(0, "filter", &record.evlist, "filter",
3380                      "event filter", parse_filter),
3381         OPT_CALLBACK_NOOPT(0, "exclude-perf", &record.evlist,
3382                            NULL, "don't record events from perf itself",
3383                            exclude_perf),
3384         OPT_STRING('p', "pid", &record.opts.target.pid, "pid",
3385                     "record events on existing process id"),
3386         OPT_STRING('t', "tid", &record.opts.target.tid, "tid",
3387                     "record events on existing thread id"),
3388         OPT_INTEGER('r', "realtime", &record.realtime_prio,
3389                     "collect data with this RT SCHED_FIFO priority"),
3390         OPT_BOOLEAN(0, "no-buffering", &record.opts.no_buffering,
3391                     "collect data without buffering"),
3392         OPT_BOOLEAN('R', "raw-samples", &record.opts.raw_samples,
3393                     "collect raw sample records from all opened counters"),
3394         OPT_BOOLEAN('a', "all-cpus", &record.opts.target.system_wide,
3395                             "system-wide collection from all CPUs"),
3396         OPT_STRING('C', "cpu", &record.opts.target.cpu_list, "cpu",
3397                     "list of cpus to monitor"),
3398         OPT_U64('c', "count", &record.opts.user_interval, "event period to sample"),
3399         OPT_STRING('o', "output", &record.data.path, "file",
3400                     "output file name"),
3401         OPT_BOOLEAN_SET('i', "no-inherit", &record.opts.no_inherit,
3402                         &record.opts.no_inherit_set,
3403                         "child tasks do not inherit counters"),
3404         OPT_BOOLEAN(0, "tail-synthesize", &record.opts.tail_synthesize,
3405                     "synthesize non-sample events at the end of output"),
3406         OPT_BOOLEAN(0, "overwrite", &record.opts.overwrite, "use overwrite mode"),
3407         OPT_BOOLEAN(0, "no-bpf-event", &record.opts.no_bpf_event, "do not record bpf events"),
3408         OPT_BOOLEAN(0, "strict-freq", &record.opts.strict_freq,
3409                     "Fail if the specified frequency can't be used"),
3410         OPT_CALLBACK('F', "freq", &record.opts, "freq or 'max'",
3411                      "profile at this frequency",
3412                       record__parse_freq),
3413         OPT_CALLBACK('m', "mmap-pages", &record.opts, "pages[,pages]",
3414                      "number of mmap data pages and AUX area tracing mmap pages",
3415                      record__parse_mmap_pages),
3416         OPT_CALLBACK(0, "mmap-flush", &record.opts, "number",
3417                      "Minimal number of bytes that is extracted from mmap data pages (default: 1)",
3418                      record__mmap_flush_parse),
3419         OPT_CALLBACK_NOOPT('g', NULL, &callchain_param,
3420                            NULL, "enables call-graph recording" ,
3421                            &record_callchain_opt),
3422         OPT_CALLBACK(0, "call-graph", &record.opts,
3423                      "record_mode[,record_size]", record_callchain_help,
3424                      &record_parse_callchain_opt),
3425         OPT_INCR('v', "verbose", &verbose,
3426                     "be more verbose (show counter open errors, etc)"),
3427         OPT_BOOLEAN('q', "quiet", &quiet, "don't print any warnings or messages"),
3428         OPT_BOOLEAN('s', "stat", &record.opts.inherit_stat,
3429                     "per thread counts"),
3430         OPT_BOOLEAN('d', "data", &record.opts.sample_address, "Record the sample addresses"),
3431         OPT_BOOLEAN(0, "phys-data", &record.opts.sample_phys_addr,
3432                     "Record the sample physical addresses"),
3433         OPT_BOOLEAN(0, "data-page-size", &record.opts.sample_data_page_size,
3434                     "Record the sampled data address data page size"),
3435         OPT_BOOLEAN(0, "code-page-size", &record.opts.sample_code_page_size,
3436                     "Record the sampled code address (ip) page size"),
3437         OPT_BOOLEAN(0, "sample-cpu", &record.opts.sample_cpu, "Record the sample cpu"),
3438         OPT_BOOLEAN(0, "sample-identifier", &record.opts.sample_identifier,
3439                     "Record the sample identifier"),
3440         OPT_BOOLEAN_SET('T', "timestamp", &record.opts.sample_time,
3441                         &record.opts.sample_time_set,
3442                         "Record the sample timestamps"),
3443         OPT_BOOLEAN_SET('P', "period", &record.opts.period, &record.opts.period_set,
3444                         "Record the sample period"),
3445         OPT_BOOLEAN('n', "no-samples", &record.opts.no_samples,
3446                     "don't sample"),
3447         OPT_BOOLEAN_SET('N', "no-buildid-cache", &record.no_buildid_cache,
3448                         &record.no_buildid_cache_set,
3449                         "do not update the buildid cache"),
3450         OPT_BOOLEAN_SET('B', "no-buildid", &record.no_buildid,
3451                         &record.no_buildid_set,
3452                         "do not collect buildids in perf.data"),
3453         OPT_CALLBACK('G', "cgroup", &record.evlist, "name",
3454                      "monitor event in cgroup name only",
3455                      parse_cgroups),
3456         OPT_CALLBACK('D', "delay", &record, "ms",
3457                      "ms to wait before starting measurement after program start (-1: start with events disabled), "
3458                      "or ranges of time to enable events e.g. '-D 10-20,30-40'",
3459                      record__parse_event_enable_time),
3460         OPT_BOOLEAN(0, "kcore", &record.opts.kcore, "copy /proc/kcore"),
3461         OPT_STRING('u', "uid", &record.opts.target.uid_str, "user",
3462                    "user to profile"),
3463
3464         OPT_CALLBACK_NOOPT('b', "branch-any", &record.opts.branch_stack,
3465                      "branch any", "sample any taken branches",
3466                      parse_branch_stack),
3467
3468         OPT_CALLBACK('j', "branch-filter", &record.opts.branch_stack,
3469                      "branch filter mask", "branch stack filter modes",
3470                      parse_branch_stack),
3471         OPT_BOOLEAN('W', "weight", &record.opts.sample_weight,
3472                     "sample by weight (on special events only)"),
3473         OPT_BOOLEAN(0, "transaction", &record.opts.sample_transaction,
3474                     "sample transaction flags (special events only)"),
3475         OPT_BOOLEAN(0, "per-thread", &record.opts.target.per_thread,
3476                     "use per-thread mmaps"),
3477         OPT_CALLBACK_OPTARG('I', "intr-regs", &record.opts.sample_intr_regs, NULL, "any register",
3478                     "sample selected machine registers on interrupt,"
3479                     " use '-I?' to list register names", parse_intr_regs),
3480         OPT_CALLBACK_OPTARG(0, "user-regs", &record.opts.sample_user_regs, NULL, "any register",
3481                     "sample selected machine registers on interrupt,"
3482                     " use '--user-regs=?' to list register names", parse_user_regs),
3483         OPT_BOOLEAN(0, "running-time", &record.opts.running_time,
3484                     "Record running/enabled time of read (:S) events"),
3485         OPT_CALLBACK('k', "clockid", &record.opts,
3486         "clockid", "clockid to use for events, see clock_gettime()",
3487         parse_clockid),
3488         OPT_STRING_OPTARG('S', "snapshot", &record.opts.auxtrace_snapshot_opts,
3489                           "opts", "AUX area tracing Snapshot Mode", ""),
3490         OPT_STRING_OPTARG(0, "aux-sample", &record.opts.auxtrace_sample_opts,
3491                           "opts", "sample AUX area", ""),
3492         OPT_UINTEGER(0, "proc-map-timeout", &proc_map_timeout,
3493                         "per thread proc mmap processing timeout in ms"),
3494         OPT_BOOLEAN(0, "namespaces", &record.opts.record_namespaces,
3495                     "Record namespaces events"),
3496         OPT_BOOLEAN(0, "all-cgroups", &record.opts.record_cgroup,
3497                     "Record cgroup events"),
3498         OPT_BOOLEAN_SET(0, "switch-events", &record.opts.record_switch_events,
3499                         &record.opts.record_switch_events_set,
3500                         "Record context switch events"),
3501         OPT_BOOLEAN_FLAG(0, "all-kernel", &record.opts.all_kernel,
3502                          "Configure all used events to run in kernel space.",
3503                          PARSE_OPT_EXCLUSIVE),
3504         OPT_BOOLEAN_FLAG(0, "all-user", &record.opts.all_user,
3505                          "Configure all used events to run in user space.",
3506                          PARSE_OPT_EXCLUSIVE),
3507         OPT_BOOLEAN(0, "kernel-callchains", &record.opts.kernel_callchains,
3508                     "collect kernel callchains"),
3509         OPT_BOOLEAN(0, "user-callchains", &record.opts.user_callchains,
3510                     "collect user callchains"),
3511         OPT_STRING(0, "vmlinux", &symbol_conf.vmlinux_name,
3512                    "file", "vmlinux pathname"),
3513         OPT_BOOLEAN(0, "buildid-all", &record.buildid_all,
3514                     "Record build-id of all DSOs regardless of hits"),
3515         OPT_BOOLEAN(0, "buildid-mmap", &record.buildid_mmap,
3516                     "Record build-id in map events"),
3517         OPT_BOOLEAN(0, "timestamp-filename", &record.timestamp_filename,
3518                     "append timestamp to output filename"),
3519         OPT_BOOLEAN(0, "timestamp-boundary", &record.timestamp_boundary,
3520                     "Record timestamp boundary (time of first/last samples)"),
3521         OPT_STRING_OPTARG_SET(0, "switch-output", &record.switch_output.str,
3522                           &record.switch_output.set, "signal or size[BKMG] or time[smhd]",
3523                           "Switch output when receiving SIGUSR2 (signal) or crossing a size or time threshold",
3524                           "signal"),
3525         OPT_CALLBACK_SET(0, "switch-output-event", &switch_output_parse_events_option_args,
3526                          &record.switch_output_event_set, "switch output event",
3527                          "switch output event selector. use 'perf list' to list available events",
3528                          parse_events_option_new_evlist),
3529         OPT_INTEGER(0, "switch-max-files", &record.switch_output.num_files,
3530                    "Limit number of switch output generated files"),
3531         OPT_BOOLEAN(0, "dry-run", &dry_run,
3532                     "Parse options then exit"),
3533 #ifdef HAVE_AIO_SUPPORT
3534         OPT_CALLBACK_OPTARG(0, "aio", &record.opts,
3535                      &nr_cblocks_default, "n", "Use <n> control blocks in asynchronous trace writing mode (default: 1, max: 4)",
3536                      record__aio_parse),
3537 #endif
3538         OPT_CALLBACK(0, "affinity", &record.opts, "node|cpu",
3539                      "Set affinity mask of trace reading thread to NUMA node cpu mask or cpu of processed mmap buffer",
3540                      record__parse_affinity),
3541 #ifdef HAVE_ZSTD_SUPPORT
3542         OPT_CALLBACK_OPTARG('z', "compression-level", &record.opts, &comp_level_default, "n",
3543                             "Compress records using specified level (default: 1 - fastest compression, 22 - greatest compression)",
3544                             record__parse_comp_level),
3545 #endif
3546         OPT_CALLBACK(0, "max-size", &record.output_max_size,
3547                      "size", "Limit the maximum size of the output file", parse_output_max_size),
3548         OPT_UINTEGER(0, "num-thread-synthesize",
3549                      &record.opts.nr_threads_synthesize,
3550                      "number of threads to run for event synthesis"),
3551 #ifdef HAVE_LIBPFM
3552         OPT_CALLBACK(0, "pfm-events", &record.evlist, "event",
3553                 "libpfm4 event selector. use 'perf list' to list available events",
3554                 parse_libpfm_events_option),
3555 #endif
3556         OPT_CALLBACK(0, "control", &record.opts, "fd:ctl-fd[,ack-fd] or fifo:ctl-fifo[,ack-fifo]",
3557                      "Listen on ctl-fd descriptor for command to control measurement ('enable': enable events, 'disable': disable events,\n"
3558                      "\t\t\t  'snapshot': AUX area tracing snapshot).\n"
3559                      "\t\t\t  Optionally send control command completion ('ack\\n') to ack-fd descriptor.\n"
3560                      "\t\t\t  Alternatively, ctl-fifo / ack-fifo will be opened and used as ctl-fd / ack-fd.",
3561                       parse_control_option),
3562         OPT_CALLBACK(0, "synth", &record.opts, "no|all|task|mmap|cgroup",
3563                      "Fine-tune event synthesis: default=all", parse_record_synth_option),
3564         OPT_STRING_OPTARG_SET(0, "debuginfod", &record.debuginfod.urls,
3565                           &record.debuginfod.set, "debuginfod urls",
3566                           "Enable debuginfod data retrieval from DEBUGINFOD_URLS or specified urls",
3567                           "system"),
3568         OPT_CALLBACK_OPTARG(0, "threads", &record.opts, NULL, "spec",
3569                             "write collected trace data into several data files using parallel threads",
3570                             record__parse_threads),
3571         OPT_BOOLEAN(0, "off-cpu", &record.off_cpu, "Enable off-cpu analysis"),
3572         OPT_END()
3573 };
3574
3575 struct option *record_options = __record_options;
3576
3577 static int record__mmap_cpu_mask_init(struct mmap_cpu_mask *mask, struct perf_cpu_map *cpus)
3578 {
3579         struct perf_cpu cpu;
3580         int idx;
3581
3582         if (cpu_map__is_dummy(cpus))
3583                 return 0;
3584
3585         perf_cpu_map__for_each_cpu_skip_any(cpu, idx, cpus) {
3586                 /* Return ENODEV if input cpu is greater than max cpu */
3587                 if ((unsigned long)cpu.cpu > mask->nbits)
3588                         return -ENODEV;
3589                 __set_bit(cpu.cpu, mask->bits);
3590         }
3591
3592         return 0;
3593 }
3594
3595 static int record__mmap_cpu_mask_init_spec(struct mmap_cpu_mask *mask, const char *mask_spec)
3596 {
3597         struct perf_cpu_map *cpus;
3598
3599         cpus = perf_cpu_map__new(mask_spec);
3600         if (!cpus)
3601                 return -ENOMEM;
3602
3603         bitmap_zero(mask->bits, mask->nbits);
3604         if (record__mmap_cpu_mask_init(mask, cpus))
3605                 return -ENODEV;
3606
3607         perf_cpu_map__put(cpus);
3608
3609         return 0;
3610 }
3611
3612 static void record__free_thread_masks(struct record *rec, int nr_threads)
3613 {
3614         int t;
3615
3616         if (rec->thread_masks)
3617                 for (t = 0; t < nr_threads; t++)
3618                         record__thread_mask_free(&rec->thread_masks[t]);
3619
3620         zfree(&rec->thread_masks);
3621 }
3622
3623 static int record__alloc_thread_masks(struct record *rec, int nr_threads, int nr_bits)
3624 {
3625         int t, ret;
3626
3627         rec->thread_masks = zalloc(nr_threads * sizeof(*(rec->thread_masks)));
3628         if (!rec->thread_masks) {
3629                 pr_err("Failed to allocate thread masks\n");
3630                 return -ENOMEM;
3631         }
3632
3633         for (t = 0; t < nr_threads; t++) {
3634                 ret = record__thread_mask_alloc(&rec->thread_masks[t], nr_bits);
3635                 if (ret) {
3636                         pr_err("Failed to allocate thread masks[%d]\n", t);
3637                         goto out_free;
3638                 }
3639         }
3640
3641         return 0;
3642
3643 out_free:
3644         record__free_thread_masks(rec, nr_threads);
3645
3646         return ret;
3647 }
3648
3649 static int record__init_thread_cpu_masks(struct record *rec, struct perf_cpu_map *cpus)
3650 {
3651         int t, ret, nr_cpus = perf_cpu_map__nr(cpus);
3652
3653         ret = record__alloc_thread_masks(rec, nr_cpus, cpu__max_cpu().cpu);
3654         if (ret)
3655                 return ret;
3656
3657         rec->nr_threads = nr_cpus;
3658         pr_debug("nr_threads: %d\n", rec->nr_threads);
3659
3660         for (t = 0; t < rec->nr_threads; t++) {
3661                 __set_bit(perf_cpu_map__cpu(cpus, t).cpu, rec->thread_masks[t].maps.bits);
3662                 __set_bit(perf_cpu_map__cpu(cpus, t).cpu, rec->thread_masks[t].affinity.bits);
3663                 if (verbose > 0) {
3664                         pr_debug("thread_masks[%d]: ", t);
3665                         mmap_cpu_mask__scnprintf(&rec->thread_masks[t].maps, "maps");
3666                         pr_debug("thread_masks[%d]: ", t);
3667                         mmap_cpu_mask__scnprintf(&rec->thread_masks[t].affinity, "affinity");
3668                 }
3669         }
3670
3671         return 0;
3672 }
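
/*
 * For instance, with the default '--threads' (CPU spec) and an event list
 * covering CPUs 0-3, four record threads are created and thread t gets both
 * its maps and its affinity mask set to just CPU t.
 */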
3673
3674 static int record__init_thread_masks_spec(struct record *rec, struct perf_cpu_map *cpus,
3675                                           const char **maps_spec, const char **affinity_spec,
3676                                           u32 nr_spec)
3677 {
3678         u32 s;
3679         int ret = 0, t = 0;
3680         struct mmap_cpu_mask cpus_mask;
3681         struct thread_mask thread_mask, full_mask, *thread_masks;
3682
3683         ret = record__mmap_cpu_mask_alloc(&cpus_mask, cpu__max_cpu().cpu);
3684         if (ret) {
3685                 pr_err("Failed to allocate CPUs mask\n");
3686                 return ret;
3687         }
3688
3689         ret = record__mmap_cpu_mask_init(&cpus_mask, cpus);
3690         if (ret) {
3691                 pr_err("Failed to init cpu mask\n");
3692                 goto out_free_cpu_mask;
3693         }
3694
3695         ret = record__thread_mask_alloc(&full_mask, cpu__max_cpu().cpu);
3696         if (ret) {
3697                 pr_err("Failed to allocate full mask\n");
3698                 goto out_free_cpu_mask;
3699         }
3700
3701         ret = record__thread_mask_alloc(&thread_mask, cpu__max_cpu().cpu);
3702         if (ret) {
3703                 pr_err("Failed to allocate thread mask\n");
3704                 goto out_free_full_and_cpu_masks;
3705         }
3706
3707         for (s = 0; s < nr_spec; s++) {
3708                 ret = record__mmap_cpu_mask_init_spec(&thread_mask.maps, maps_spec[s]);
3709                 if (ret) {
3710                         pr_err("Failed to initialize maps thread mask\n");
3711                         goto out_free;
3712                 }
3713                 ret = record__mmap_cpu_mask_init_spec(&thread_mask.affinity, affinity_spec[s]);
3714                 if (ret) {
3715                         pr_err("Failed to initialize affinity thread mask\n");
3716                         goto out_free;
3717                 }
3718
3719                 /* ignore invalid CPUs but do not allow empty masks */
3720                 if (!bitmap_and(thread_mask.maps.bits, thread_mask.maps.bits,
3721                                 cpus_mask.bits, thread_mask.maps.nbits)) {
3722                         pr_err("Empty maps mask: %s\n", maps_spec[s]);
3723                         ret = -EINVAL;
3724                         goto out_free;
3725                 }
3726                 if (!bitmap_and(thread_mask.affinity.bits, thread_mask.affinity.bits,
3727                                 cpus_mask.bits, thread_mask.affinity.nbits)) {
3728                         pr_err("Empty affinity mask: %s\n", affinity_spec[s]);
3729                         ret = -EINVAL;
3730                         goto out_free;
3731                 }
3732
3733                 /* do not allow intersection with other masks (full_mask) */
3734                 if (bitmap_intersects(thread_mask.maps.bits, full_mask.maps.bits,
3735                                       thread_mask.maps.nbits)) {
3736                         pr_err("Intersecting maps mask: %s\n", maps_spec[s]);
3737                         ret = -EINVAL;
3738                         goto out_free;
3739                 }
3740                 if (bitmap_intersects(thread_mask.affinity.bits, full_mask.affinity.bits,
3741                                       thread_mask.affinity.nbits)) {
3742                         pr_err("Intersecting affinity mask: %s\n", affinity_spec[s]);
3743                         ret = -EINVAL;
3744                         goto out_free;
3745                 }
3746
3747                 bitmap_or(full_mask.maps.bits, full_mask.maps.bits,
3748                           thread_mask.maps.bits, full_mask.maps.nbits);
3749                 bitmap_or(full_mask.affinity.bits, full_mask.affinity.bits,
3750                           thread_mask.affinity.bits, full_mask.maps.nbits);
3751
3752                 thread_masks = realloc(rec->thread_masks, (t + 1) * sizeof(struct thread_mask));
3753                 if (!thread_masks) {
3754                         pr_err("Failed to reallocate thread masks\n");
3755                         ret = -ENOMEM;
3756                         goto out_free;
3757                 }
3758                 rec->thread_masks = thread_masks;
3759                 rec->thread_masks[t] = thread_mask;
3760                 if (verbose > 0) {
3761                         pr_debug("thread_masks[%d]: ", t);
3762                         mmap_cpu_mask__scnprintf(&rec->thread_masks[t].maps, "maps");
3763                         pr_debug("thread_masks[%d]: ", t);
3764                         mmap_cpu_mask__scnprintf(&rec->thread_masks[t].affinity, "affinity");
3765                 }
3766                 t++;
3767                 ret = record__thread_mask_alloc(&thread_mask, cpu__max_cpu().cpu);
3768                 if (ret) {
3769                         pr_err("Failed to allocate thread mask\n");
3770                         goto out_free_full_and_cpu_masks;
3771                 }
3772         }
3773         rec->nr_threads = t;
3774         pr_debug("nr_threads: %d\n", rec->nr_threads);
3775         if (!rec->nr_threads)
3776                 ret = -EINVAL;
3777
3778 out_free:
3779         record__thread_mask_free(&thread_mask);
3780 out_free_full_and_cpu_masks:
3781         record__thread_mask_free(&full_mask);
3782 out_free_cpu_mask:
3783         record__mmap_cpu_mask_free(&cpus_mask);
3784
3785         return ret;
3786 }
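
/*
 * Worked example (assuming CPUs 0-7 are available): maps_spec = {"0-3", "4-7"}
 * with affinity_spec = {"0", "4"} yields two record threads with maps masks
 * 0-3 and 4-7 and affinity masks 0 and 4.  A spec whose mask becomes empty
 * after intersecting with the available CPUs, or one that overlaps a previous
 * spec, is rejected with -EINVAL as coded above.
 */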
3787
3788 static int record__init_thread_core_masks(struct record *rec, struct perf_cpu_map *cpus)
3789 {
3790         int ret;
3791         struct cpu_topology *topo;
3792
3793         topo = cpu_topology__new();
3794         if (!topo) {
3795                 pr_err("Failed to allocate CPU topology\n");
3796                 return -ENOMEM;
3797         }
3798
3799         ret = record__init_thread_masks_spec(rec, cpus, topo->core_cpus_list,
3800                                              topo->core_cpus_list, topo->core_cpus_lists);
3801         cpu_topology__delete(topo);
3802
3803         return ret;
3804 }
3805
3806 static int record__init_thread_package_masks(struct record *rec, struct perf_cpu_map *cpus)
3807 {
3808         int ret;
3809         struct cpu_topology *topo;
3810
3811         topo = cpu_topology__new();
3812         if (!topo) {
3813                 pr_err("Failed to allocate CPU topology\n");
3814                 return -ENOMEM;
3815         }
3816
3817         ret = record__init_thread_masks_spec(rec, cpus, topo->package_cpus_list,
3818                                              topo->package_cpus_list, topo->package_cpus_lists);
3819         cpu_topology__delete(topo);
3820
3821         return ret;
3822 }
3823
3824 static int record__init_thread_numa_masks(struct record *rec, struct perf_cpu_map *cpus)
3825 {
3826         u32 s;
3827         int ret;
3828         const char **spec;
3829         struct numa_topology *topo;
3830
3831         topo = numa_topology__new();
3832         if (!topo) {
3833                 pr_err("Failed to allocate NUMA topology\n");
3834                 return -ENOMEM;
3835         }
3836
3837         spec = zalloc(topo->nr * sizeof(char *));
3838         if (!spec) {
3839                 pr_err("Failed to allocate NUMA spec\n");
3840                 ret = -ENOMEM;
3841                 goto out_delete_topo;
3842         }
3843         for (s = 0; s < topo->nr; s++)
3844                 spec[s] = topo->nodes[s].cpus;
3845
3846         ret = record__init_thread_masks_spec(rec, cpus, spec, spec, topo->nr);
3847
3848         zfree(&spec);
3849
3850 out_delete_topo:
3851         numa_topology__delete(topo);
3852
3853         return ret;
3854 }
3855
3856 static int record__init_thread_user_masks(struct record *rec, struct perf_cpu_map *cpus)
3857 {
3858         int t, ret;
3859         u32 s, nr_spec = 0;
3860         char **maps_spec = NULL, **affinity_spec = NULL, **tmp_spec;
3861         char *user_spec, *spec, *spec_ptr, *mask, *mask_ptr, *dup_mask = NULL;
3862
3863         for (t = 0, user_spec = (char *)rec->opts.threads_user_spec; ; t++, user_spec = NULL) {
3864                 spec = strtok_r(user_spec, ":", &spec_ptr);
3865                 if (spec == NULL)
3866                         break;
3867                 pr_debug2("threads_spec[%d]: %s\n", t, spec);
3868                 mask = strtok_r(spec, "/", &mask_ptr);
3869                 if (mask == NULL)
3870                         break;
3871                 pr_debug2("  maps mask: %s\n", mask);
3872                 tmp_spec = realloc(maps_spec, (nr_spec + 1) * sizeof(char *));
3873                 if (!tmp_spec) {
3874                         pr_err("Failed to reallocate maps spec\n");
3875                         ret = -ENOMEM;
3876                         goto out_free;
3877                 }
3878                 maps_spec = tmp_spec;
3879                 maps_spec[nr_spec] = dup_mask = strdup(mask);
3880                 if (!maps_spec[nr_spec]) {
3881                         pr_err("Failed to allocate maps spec[%d]\n", nr_spec);
3882                         ret = -ENOMEM;
3883                         goto out_free;
3884                 }
3885                 mask = strtok_r(NULL, "/", &mask_ptr);
3886                 if (mask == NULL) {
3887                         pr_err("Invalid thread maps or affinity specs\n");
3888                         ret = -EINVAL;
3889                         goto out_free;
3890                 }
3891                 pr_debug2("  affinity mask: %s\n", mask);
3892                 tmp_spec = realloc(affinity_spec, (nr_spec + 1) * sizeof(char *));
3893                 if (!tmp_spec) {
3894                         pr_err("Failed to reallocate affinity spec\n");
3895                         ret = -ENOMEM;
3896                         goto out_free;
3897                 }
3898                 affinity_spec = tmp_spec;
3899                 affinity_spec[nr_spec] = strdup(mask);
3900                 if (!affinity_spec[nr_spec]) {
3901                         pr_err("Failed to allocate affinity spec[%d]\n", nr_spec);
3902                         ret = -ENOMEM;
3903                         goto out_free;
3904                 }
3905                 dup_mask = NULL;
3906                 nr_spec++;
3907         }
3908
3909         ret = record__init_thread_masks_spec(rec, cpus, (const char **)maps_spec,
3910                                              (const char **)affinity_spec, nr_spec);
3911
3912 out_free:
3913         free(dup_mask);
3914         for (s = 0; s < nr_spec; s++) {
3915                 if (maps_spec)
3916                         free(maps_spec[s]);
3917                 if (affinity_spec)
3918                         free(affinity_spec[s]);
3919         }
3920         free(affinity_spec);
3921         free(maps_spec);
3922
3923         return ret;
3924 }
3925
3926 static int record__init_thread_default_masks(struct record *rec, struct perf_cpu_map *cpus)
3927 {
3928         int ret;
3929
3930         ret = record__alloc_thread_masks(rec, 1, cpu__max_cpu().cpu);
3931         if (ret)
3932                 return ret;
3933
3934         if (record__mmap_cpu_mask_init(&rec->thread_masks->maps, cpus))
3935                 return -ENODEV;
3936
3937         rec->nr_threads = 1;
3938
3939         return 0;
3940 }
3941
3942 static int record__init_thread_masks(struct record *rec)
3943 {
3944         int ret = 0;
3945         struct perf_cpu_map *cpus = rec->evlist->core.all_cpus;
3946
3947         if (!record__threads_enabled(rec))
3948                 return record__init_thread_default_masks(rec, cpus);
3949
3950         if (evlist__per_thread(rec->evlist)) {
3951                 pr_err("--per-thread option is mutually exclusive to parallel streaming mode.\n");
3952                 return -EINVAL;
3953         }
3954
3955         switch (rec->opts.threads_spec) {
3956         case THREAD_SPEC__CPU:
3957                 ret = record__init_thread_cpu_masks(rec, cpus);
3958                 break;
3959         case THREAD_SPEC__CORE:
3960                 ret = record__init_thread_core_masks(rec, cpus);
3961                 break;
3962         case THREAD_SPEC__PACKAGE:
3963                 ret = record__init_thread_package_masks(rec, cpus);
3964                 break;
3965         case THREAD_SPEC__NUMA:
3966                 ret = record__init_thread_numa_masks(rec, cpus);
3967                 break;
3968         case THREAD_SPEC__USER:
3969                 ret = record__init_thread_user_masks(rec, cpus);
3970                 break;
3971         default:
3972                 break;
3973         }
3974
3975         return ret;
3976 }
3977
3978 int cmd_record(int argc, const char **argv)
3979 {
3980         int err;
3981         struct record *rec = &record;
3982         char errbuf[BUFSIZ];
3983
3984         setlocale(LC_ALL, "");
3985
3986 #ifndef HAVE_BPF_SKEL
3987 # define set_nobuild(s, l, m, c) set_option_nobuild(record_options, s, l, m, c)
3988         set_nobuild('\0', "off-cpu", "no BUILD_BPF_SKEL=1", true);
3989 # undef set_nobuild
3990 #endif
3991
3992         /* Disable eager loading of kernel symbols that adds overhead to perf record. */
3993         symbol_conf.lazy_load_kernel_maps = true;
3994         rec->opts.affinity = PERF_AFFINITY_SYS;
3995
3996         rec->evlist = evlist__new();
3997         if (rec->evlist == NULL)
3998                 return -ENOMEM;
3999
4000         err = perf_config(perf_record_config, rec);
4001         if (err)
4002                 return err;
4003
4004         argc = parse_options(argc, argv, record_options, record_usage,
4005                             PARSE_OPT_STOP_AT_NON_OPTION);
4006         if (quiet)
4007                 perf_quiet_option();
4008
4009         err = symbol__validate_sym_arguments();
4010         if (err)
4011                 return err;
4012
4013         perf_debuginfod_setup(&record.debuginfod);
4014
4015         /* Make system wide (-a) the default target. */
4016         if (!argc && target__none(&rec->opts.target))
4017                 rec->opts.target.system_wide = true;
4018
4019         if (nr_cgroups && !rec->opts.target.system_wide) {
4020                 usage_with_options_msg(record_usage, record_options,
4021                         "cgroup monitoring only available in system-wide mode");
4022
4023         }
4024
4025         if (rec->buildid_mmap) {
4026                 if (!perf_can_record_build_id()) {
4027                         pr_err("Failed: no support to record build id in mmap events, update your kernel.\n");
4028                         err = -EINVAL;
4029                         goto out_opts;
4030                 }
4031                 pr_debug("Enabling build id in mmap2 events.\n");
4032                 /* Enable mmap build id synthesizing. */
4033                 symbol_conf.buildid_mmap2 = true;
4034                 /* Enable perf_event_attr::build_id bit. */
4035                 rec->opts.build_id = true;
4036                 /* Disable build id cache. */
4037                 rec->no_buildid = true;
4038         }
4039
4040         if (rec->opts.record_cgroup && !perf_can_record_cgroup()) {
4041                 pr_err("Kernel has no cgroup sampling support.\n");
4042                 err = -EINVAL;
4043                 goto out_opts;
4044         }
4045
4046         if (rec->opts.kcore)
4047                 rec->opts.text_poke = true;
4048
4049         if (rec->opts.kcore || record__threads_enabled(rec))
4050                 rec->data.is_dir = true;
4051
4052         if (record__threads_enabled(rec)) {
4053                 if (rec->opts.affinity != PERF_AFFINITY_SYS) {
4054                         pr_err("--affinity option is mutually exclusive to parallel streaming mode.\n");
4055                         goto out_opts;
4056                 }
4057                 if (record__aio_enabled(rec)) {
4058                         pr_err("Asynchronous streaming mode (--aio) is mutually exclusive to parallel streaming mode.\n");
4059                         goto out_opts;
4060                 }
4061         }
4062
4063         if (rec->opts.comp_level != 0) {
4064                 pr_debug("Compression enabled, disabling build id collection at the end of the session.\n");
4065                 rec->no_buildid = true;
4066         }
4067
4068         if (rec->opts.record_switch_events &&
4069             !perf_can_record_switch_events()) {
4070                 ui__error("kernel does not support recording context switch events\n");
4071                 parse_options_usage(record_usage, record_options, "switch-events", 0);
4072                 err = -EINVAL;
4073                 goto out_opts;
4074         }
4075
4076         if (switch_output_setup(rec)) {
4077                 parse_options_usage(record_usage, record_options, "switch-output", 0);
4078                 err = -EINVAL;
4079                 goto out_opts;
4080         }
4081
4082         if (rec->switch_output.time) {
4083                 signal(SIGALRM, alarm_sig_handler);
4084                 alarm(rec->switch_output.time);
4085         }
4086
4087         if (rec->switch_output.num_files) {
4088                 rec->switch_output.filenames = calloc(rec->switch_output.num_files,
4089                                                       sizeof(char *));
4090                 if (!rec->switch_output.filenames) {
4091                         err = -EINVAL;
4092                         goto out_opts;
4093                 }
4094         }
4095
4096         if (rec->timestamp_filename && record__threads_enabled(rec)) {
4097                 rec->timestamp_filename = false;
4098                 pr_warning("WARNING: --timestamp-filename option is not available in parallel streaming mode.\n");
4099         }
4100
4101         /*
4102          * Allow aliases to facilitate the lookup of symbols for address
4103          * filters. Refer to auxtrace_parse_filters().
4104          */
4105         symbol_conf.allow_aliases = true;
4106
4107         symbol__init(NULL);
4108
4109         err = record__auxtrace_init(rec);
4110         if (err)
4111                 goto out;
4112
4113         if (dry_run)
4114                 goto out;
4115
4116         err = -ENOMEM;
4117
4118         if (rec->no_buildid_cache || rec->no_buildid) {
4119                 disable_buildid_cache();
4120         } else if (rec->switch_output.enabled) {
4121                 /*
4122                  * In 'perf record --switch-output', disable buildid
4123                  * generation by default to reduce data file switching
4124                  * overhead. Still generate buildids if they are required
4125                  * explicitly using
4126                  *
4127                  *  perf record --switch-output --no-no-buildid \
4128                  *              --no-no-buildid-cache
4129                  *
4130                  * The following code is equivalent to:
4131                  *
4132                  * if ((rec->no_buildid || !rec->no_buildid_set) &&
4133                  *     (rec->no_buildid_cache || !rec->no_buildid_cache_set))
4134                  *         disable_buildid_cache();
4135                  */
4136                 bool disable = true;
4137
4138                 if (rec->no_buildid_set && !rec->no_buildid)
4139                         disable = false;
4140                 if (rec->no_buildid_cache_set && !rec->no_buildid_cache)
4141                         disable = false;
4142                 if (disable) {
4143                         rec->no_buildid = true;
4144                         rec->no_buildid_cache = true;
4145                         disable_buildid_cache();
4146                 }
4147         }
4148
4149         if (record.opts.overwrite)
4150                 record.opts.tail_synthesize = true;
4151
4152         if (rec->evlist->core.nr_entries == 0) {
4153                 bool can_profile_kernel = perf_event_paranoid_check(1);
4154
4155                 err = parse_event(rec->evlist, can_profile_kernel ? "cycles:P" : "cycles:Pu");
4156                 if (err)
4157                         goto out;
4158         }
4159
4160         if (rec->opts.target.tid && !rec->opts.no_inherit_set)
4161                 rec->opts.no_inherit = true;
4162
4163         err = target__validate(&rec->opts.target);
4164         if (err) {
4165                 target__strerror(&rec->opts.target, err, errbuf, BUFSIZ);
4166                 ui__warning("%s\n", errbuf);
4167         }
4168
4169         err = target__parse_uid(&rec->opts.target);
4170         if (err) {
4171                 int saved_errno = errno;
4172
4173                 target__strerror(&rec->opts.target, err, errbuf, BUFSIZ);
4174                 ui__error("%s", errbuf);
4175
4176                 err = -saved_errno;
4177                 goto out;
4178         }
4179
4180         /* Enable ignoring missing threads when -u/-p option is defined. */
4181         rec->opts.ignore_missing_thread = rec->opts.target.uid != UINT_MAX || rec->opts.target.pid;
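             /*
              * Threads gathered from /proc for a uid or pid target may
              * exit before perf_event_open() reaches them; ignoring such
              * missing threads keeps one stale tid from aborting the
              * whole session.
              */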
4182
4183         evlist__warn_user_requested_cpus(rec->evlist, rec->opts.target.cpu_list);
4184
4185         if (callchain_param.enabled && callchain_param.record_mode == CALLCHAIN_FP)
4186                 arch__add_leaf_frame_record_opts(&rec->opts);
4187
4188         err = -ENOMEM;
4189         if (evlist__create_maps(rec->evlist, &rec->opts.target) < 0) {
4190                 if (rec->opts.target.pid != NULL) {
4191                         pr_err("Couldn't create thread/CPU maps: %s\n",
4192                                 errno == ENOENT ? "No such process" : str_error_r(errno, errbuf, sizeof(errbuf)));
4193                         goto out;
4194                 } else {
4195                         usage_with_options(record_usage, record_options);
4196                 }
4197         }
4198
4199         err = auxtrace_record__options(rec->itr, rec->evlist, &rec->opts);
4200         if (err)
4201                 goto out;
4202
4203         /*
4204          * Take the buildids of all DSOs when the file contains AUX
4205          * area tracing data: we do not decode the trace (it would
4206          * take too long), so we cannot tell which DSOs were hit.
4207          */
4208         if (rec->opts.full_auxtrace)
4209                 rec->buildid_all = true;
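             /*
              * buildid_all matches the --buildid-all option: record the
              * build-id of every DSO in the maps instead of only the DSOs
              * hit by decoded samples.
              */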
4210
4211         if (rec->opts.text_poke) {
4212                 err = record__config_text_poke(rec->evlist);
4213                 if (err) {
4214                         pr_err("record__config_text_poke failed, error %d\n", err);
4215                         goto out;
4216                 }
4217         }
4218
4219         if (rec->off_cpu) {
4220                 err = record__config_off_cpu(rec);
4221                 if (err) {
4222                         pr_err("record__config_off_cpu failed, error %d\n", err);
4223                         goto out;
4224                 }
4225         }
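             /*
              * --off-cpu appears to rely on a BPF program tracking sched
              * switches; record__config_off_cpu() presumably sets up the
              * extra event and BPF state needed for that.
              */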
4226
4227         if (record_opts__config(&rec->opts)) {
4228                 err = -EINVAL;
4229                 goto out;
4230         }
4231
4232         err = record__config_tracking_events(rec);
4233         if (err) {
4234                 pr_err("record__config_tracking_events failed, error %d\n", err);
4235                 goto out;
4236         }
4237
4238         err = record__init_thread_masks(rec);
4239         if (err) {
4240                 pr_err("Failed to initialize parallel data streaming masks\n");
4241                 goto out;
4242         }
4243
4244         if (rec->opts.nr_cblocks > nr_cblocks_max)
4245                 rec->opts.nr_cblocks = nr_cblocks_max;
4246         pr_debug("nr_cblocks: %d\n", rec->opts.nr_cblocks);
4247
4248         pr_debug("affinity: %s\n", affinity_tags[rec->opts.affinity]);
4249         pr_debug("mmap flush: %d\n", rec->opts.mmap_flush);
4250
4251         if (rec->opts.comp_level > comp_level_max)
4252                 rec->opts.comp_level = comp_level_max;
4253         pr_debug("comp level: %d\n", rec->opts.comp_level);
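             /*
              * The clamps above keep the user supplied --aio block count
              * and -z/--compression-level value within what this build
              * supports; mmap_flush corresponds to --mmap-flush.
              */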
4254
4255         err = __cmd_record(&record, argc, argv);
4256 out:
4257         evlist__delete(rec->evlist);
4258         symbol__exit();
4259         auxtrace_record__free(rec->itr);
4260 out_opts:
4261         record__free_thread_masks(rec, rec->nr_threads);
4262         rec->nr_threads = 0;
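             /*
              * Close the --control descriptors (e.g. from
              * "--control fifo:ctl,ack") if perf opened them itself.
              */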
4263         evlist__close_control(rec->opts.ctl_fd, rec->opts.ctl_fd_ack, &rec->opts.ctl_fd_close);
4264         return err;
4265 }
4266
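     /*
      * SIGUSR2 is expected to be routed here by __cmd_record() when AUX area
      * snapshot mode or --switch-output=signal is in use: it takes an
      * auxtrace snapshot and/or requests an output file switch.
      */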
4267 static void snapshot_sig_handler(int sig __maybe_unused)
4268 {
4269         struct record *rec = &record;
4270
4271         hit_auxtrace_snapshot_trigger(rec);
4272
4273         if (switch_output_signal(rec))
4274                 trigger_hit(&switch_output_trigger);
4275 }
4276
4277 static void alarm_sig_handler(int sig __maybe_unused)
4278 {
4279         struct record *rec = &record;
4280
4281         if (switch_output_time(rec))
4282                 trigger_hit(&switch_output_trigger);
4283 }