GNU Linux-libre 4.19.211-gnu1
[releases.git] / tools / perf / builtin-trace.c
1 /*
2  * builtin-trace.c
3  *
4  * Builtin 'trace' command:
5  *
6  * Display a continuously updated trace of any workload, CPU, specific PID,
7  * system wide, etc.  Default format is loosely strace like, but any other
8  * event may be specified using --event.
9  *
10  * Copyright (C) 2012, 2013, 2014, 2015 Red Hat Inc, Arnaldo Carvalho de Melo <acme@redhat.com>
11  *
12  * Initially based on the 'trace' prototype by Thomas Gleixner:
13  *
14  * http://lwn.net/Articles/415728/ ("Announcing a new utility: 'trace'")
15  *
16  * Released under the GPL v2. (and only v2, not any later version)
17  */
18
19 #include <traceevent/event-parse.h>
20 #include <api/fs/tracing_path.h>
21 #include "builtin.h"
22 #include "util/cgroup.h"
23 #include "util/color.h"
24 #include "util/debug.h"
25 #include "util/env.h"
26 #include "util/event.h"
27 #include "util/evlist.h"
28 #include <subcmd/exec-cmd.h>
29 #include "util/machine.h"
30 #include "util/path.h"
31 #include "util/session.h"
32 #include "util/thread.h"
33 #include <subcmd/parse-options.h>
34 #include "util/strlist.h"
35 #include "util/intlist.h"
36 #include "util/thread_map.h"
37 #include "util/stat.h"
38 #include "trace/beauty/beauty.h"
39 #include "trace-event.h"
40 #include "util/parse-events.h"
41 #include "util/bpf-loader.h"
42 #include "callchain.h"
43 #include "print_binary.h"
44 #include "string2.h"
45 #include "syscalltbl.h"
46 #include "rb_resort.h"
47
48 #include <errno.h>
49 #include <inttypes.h>
50 #include <poll.h>
51 #include <signal.h>
52 #include <stdlib.h>
53 #include <string.h>
54 #include <linux/err.h>
55 #include <linux/filter.h>
56 #include <linux/kernel.h>
57 #include <linux/random.h>
58 #include <linux/stringify.h>
59 #include <linux/time64.h>
60 #include <fcntl.h>
61
62 #include "sane_ctype.h"
63
64 #ifndef O_CLOEXEC
65 # define O_CLOEXEC              02000000
66 #endif
67
68 #ifndef F_LINUX_SPECIFIC_BASE
69 # define F_LINUX_SPECIFIC_BASE  1024
70 #endif
71
/*
 * Per-session state for 'perf trace': the evlist/machine being traced,
 * the syscall-id -> struct syscall table, the user-supplied qualifiers
 * and pid filters, and the output formatting knobs set on the command
 * line.
 */
struct trace {
	struct perf_tool	tool;
	struct syscalltbl	*sctbl;		/* syscall id <-> name table */
	struct {
		int		max;		/* highest syscall id in 'table' */
		struct syscall	*table;		/* indexed by syscall id */
		struct {	/* evsels hooking syscall entry/exit (+BPF-augmented) */
			struct perf_evsel *sys_enter,
					  *sys_exit,
					  *augmented;
		}		events;
	} syscalls;
	struct record_opts	opts;
	struct perf_evlist	*evlist;
	struct machine		*host;
	struct thread		*current;	/* NOTE(review): presumably the last thread sampled — confirm at use sites */
	struct cgroup		*cgroup;
	u64			base_time;
	FILE			*output;	/* where formatted events are printed */
	unsigned long		nr_events;
	struct strlist		*ev_qualifier;	/* syscall names from -e */
	struct {
		size_t		nr;
		int		*entries;	/* ev_qualifier resolved to syscall ids */
	}			ev_qualifier_ids;
	struct {
		size_t		nr;
		pid_t		*entries;	/* pids excluded from tracing */
	}			filter_pids;
	double			duration_filter;
	double			runtime_ms;
	struct {	/* tool self-statistics, shown with show_tool_stats */
		u64		vfs_getname,
				proc_getname;
	} stats;
	unsigned int		max_stack;
	unsigned int		min_stack;
	bool			not_ev_qualifier;	/* ev_qualifier is an exclude list */
	bool			live;
	bool			full_time;
	bool			sched;
	bool			multiple_threads;
	bool			summary;
	bool			summary_only;
	bool			failure_only;
	bool			show_comm;
	bool			print_sample;
	bool			show_tool_stats;
	bool			trace_syscalls;
	bool			kernel_syscallchains;
	bool			force;
	bool			vfs_getname;
	int			trace_pgfaults;
};
126
/*
 * Accessor for one tracepoint field inside a sample's raw_data:
 * 'offset' is the field's byte offset, and the union holds the reader
 * used to fetch it either as an integer or as a pointer into raw_data.
 */
struct tp_field {
	int offset;
	union {
		u64 (*integer)(struct tp_field *field, struct perf_sample *sample);
		void *(*pointer)(struct tp_field *field, struct perf_sample *sample);
	};
};

/*
 * Generate tp_field__u{8,16,32,64}(): read an unsigned integer of the
 * given width from sample->raw_data at the field's offset.  The value
 * is copied out with memcpy rather than dereferenced in place.
 */
#define TP_UINT_FIELD(bits) \
static u64 tp_field__u##bits(struct tp_field *field, struct perf_sample *sample) \
{ \
	u##bits value; \
	memcpy(&value, sample->raw_data + field->offset, sizeof(value)); \
	return value;  \
}

TP_UINT_FIELD(8);
TP_UINT_FIELD(16);
TP_UINT_FIELD(32);
TP_UINT_FIELD(64);

/*
 * Byte-swapping variants, for data recorded on a machine of the
 * opposite endianness (selected via evsel->needs_swap).
 */
#define TP_UINT_FIELD__SWAPPED(bits) \
static u64 tp_field__swapped_u##bits(struct tp_field *field, struct perf_sample *sample) \
{ \
	u##bits value; \
	memcpy(&value, sample->raw_data + field->offset, sizeof(value)); \
	return bswap_##bits(value);\
}

TP_UINT_FIELD__SWAPPED(16);
TP_UINT_FIELD__SWAPPED(32);
TP_UINT_FIELD__SWAPPED(64);
159
160 static int __tp_field__init_uint(struct tp_field *field, int size, int offset, bool needs_swap)
161 {
162         field->offset = offset;
163
164         switch (size) {
165         case 1:
166                 field->integer = tp_field__u8;
167                 break;
168         case 2:
169                 field->integer = needs_swap ? tp_field__swapped_u16 : tp_field__u16;
170                 break;
171         case 4:
172                 field->integer = needs_swap ? tp_field__swapped_u32 : tp_field__u32;
173                 break;
174         case 8:
175                 field->integer = needs_swap ? tp_field__swapped_u64 : tp_field__u64;
176                 break;
177         default:
178                 return -1;
179         }
180
181         return 0;
182 }
183
184 static int tp_field__init_uint(struct tp_field *field, struct format_field *format_field, bool needs_swap)
185 {
186         return __tp_field__init_uint(field, format_field->size, format_field->offset, needs_swap);
187 }
188
189 static void *tp_field__ptr(struct tp_field *field, struct perf_sample *sample)
190 {
191         return sample->raw_data + field->offset;
192 }
193
194 static int __tp_field__init_ptr(struct tp_field *field, int offset)
195 {
196         field->offset = offset;
197         field->pointer = tp_field__ptr;
198         return 0;
199 }
200
201 static int tp_field__init_ptr(struct tp_field *field, struct format_field *format_field)
202 {
203         return __tp_field__init_ptr(field, format_field->offset);
204 }
205
/*
 * Per-evsel (stored in evsel->priv) accessors for the common
 * raw_syscalls fields: the syscall 'id' plus, depending on direction,
 * the entry 'args' or the exit 'ret' value — hence the union.
 */
struct syscall_tp {
	struct tp_field id;
	union {
		struct tp_field args, ret;
	};
};
212
213 static int perf_evsel__init_tp_uint_field(struct perf_evsel *evsel,
214                                           struct tp_field *field,
215                                           const char *name)
216 {
217         struct format_field *format_field = perf_evsel__field(evsel, name);
218
219         if (format_field == NULL)
220                 return -1;
221
222         return tp_field__init_uint(field, format_field, evsel->needs_swap);
223 }
224
/*
 * Convenience: initialize the syscall_tp member named 'name' (e.g. id)
 * in evsel->priv from the tracepoint field of the same name.
 */
#define perf_evsel__init_sc_tp_uint_field(evsel, name) \
	({ struct syscall_tp *sc = evsel->priv;\
	   perf_evsel__init_tp_uint_field(evsel, &sc->name, #name); })
228
/*
 * Look up tracepoint field 'name' in the evsel's format and bind a
 * pointer reader for it.  Returns -1 if the field does not exist.
 */
static int perf_evsel__init_tp_ptr_field(struct perf_evsel *evsel,
					 struct tp_field *field,
					 const char *name)
{
	struct format_field *fmt = perf_evsel__field(evsel, name);

	return fmt ? tp_field__init_ptr(field, fmt) : -1;
}
240
/* Pointer-field counterpart of perf_evsel__init_sc_tp_uint_field(). */
#define perf_evsel__init_sc_tp_ptr_field(evsel, name) \
	({ struct syscall_tp *sc = evsel->priv;\
	   perf_evsel__init_tp_ptr_field(evsel, &sc->name, #name); })
244
/* Free the evsel's private syscall_tp state, then the evsel itself. */
static void perf_evsel__delete_priv(struct perf_evsel *evsel)
{
	zfree(&evsel->priv);
	perf_evsel__delete(evsel);
}
250
251 static int perf_evsel__init_syscall_tp(struct perf_evsel *evsel)
252 {
253         struct syscall_tp *sc = evsel->priv = malloc(sizeof(struct syscall_tp));
254
255         if (evsel->priv != NULL) {
256                 if (perf_evsel__init_tp_uint_field(evsel, &sc->id, "__syscall_nr"))
257                         goto out_delete;
258                 return 0;
259         }
260
261         return -ENOMEM;
262 out_delete:
263         zfree(&evsel->priv);
264         return -ENOENT;
265 }
266
267 static int perf_evsel__init_augmented_syscall_tp(struct perf_evsel *evsel)
268 {
269         struct syscall_tp *sc = evsel->priv = malloc(sizeof(struct syscall_tp));
270
271         if (evsel->priv != NULL) {       /* field, sizeof_field, offsetof_field */
272                 if (__tp_field__init_uint(&sc->id, sizeof(long), sizeof(long long), evsel->needs_swap))
273                         goto out_delete;
274
275                 return 0;
276         }
277
278         return -ENOMEM;
279 out_delete:
280         zfree(&evsel->priv);
281         return -EINVAL;
282 }
283
/*
 * Bind the 'args' pointer reader right after the id field of an
 * augmented syscall event.
 *
 * NOTE(review): assumes evsel->priv was already set up by
 * perf_evsel__init_augmented_syscall_tp() — sc is dereferenced without
 * a NULL check; confirm all callers follow that order.
 */
static int perf_evsel__init_augmented_syscall_tp_args(struct perf_evsel *evsel)
{
	struct syscall_tp *sc = evsel->priv;

	return __tp_field__init_ptr(&sc->args, sc->id.offset + sizeof(u64));
}
290
291 static int perf_evsel__init_raw_syscall_tp(struct perf_evsel *evsel, void *handler)
292 {
293         evsel->priv = malloc(sizeof(struct syscall_tp));
294         if (evsel->priv != NULL) {
295                 if (perf_evsel__init_sc_tp_uint_field(evsel, id))
296                         goto out_delete;
297
298                 evsel->handler = handler;
299                 return 0;
300         }
301
302         return -ENOMEM;
303
304 out_delete:
305         zfree(&evsel->priv);
306         return -ENOENT;
307 }
308
309 static struct perf_evsel *perf_evsel__raw_syscall_newtp(const char *direction, void *handler)
310 {
311         struct perf_evsel *evsel = perf_evsel__newtp("raw_syscalls", direction);
312
313         /* older kernel (e.g., RHEL6) use syscalls:{enter,exit} */
314         if (IS_ERR(evsel))
315                 evsel = perf_evsel__newtp("syscalls", direction);
316
317         if (IS_ERR(evsel))
318                 return NULL;
319
320         if (perf_evsel__init_raw_syscall_tp(evsel, handler))
321                 goto out_delete;
322
323         return evsel;
324
325 out_delete:
326         perf_evsel__delete_priv(evsel);
327         return NULL;
328 }
329
/* Fetch syscall_tp member 'name' from a sample, as an integer ... */
#define perf_evsel__sc_tp_uint(evsel, name, sample) \
	({ struct syscall_tp *fields = evsel->priv; \
	   fields->name.integer(&fields->name, sample); })

/* ... or as a pointer into the sample's raw_data. */
#define perf_evsel__sc_tp_ptr(evsel, name, sample) \
	({ struct syscall_tp *fields = evsel->priv; \
	   fields->name.pointer(&fields->name, sample); })
337
338 size_t strarray__scnprintf(struct strarray *sa, char *bf, size_t size, const char *intfmt, int val)
339 {
340         int idx = val - sa->offset;
341
342         if (idx < 0 || idx >= sa->nr_entries || sa->entries[idx] == NULL)
343                 return scnprintf(bf, size, intfmt, val);
344
345         return scnprintf(bf, size, "%s", sa->entries[idx]);
346 }
347
348 static size_t __syscall_arg__scnprintf_strarray(char *bf, size_t size,
349                                                 const char *intfmt,
350                                                 struct syscall_arg *arg)
351 {
352         return strarray__scnprintf(arg->parm, bf, size, intfmt, arg->val);
353 }
354
355 static size_t syscall_arg__scnprintf_strarray(char *bf, size_t size,
356                                               struct syscall_arg *arg)
357 {
358         return __syscall_arg__scnprintf_strarray(bf, size, "%d", arg);
359 }
360
#define SCA_STRARRAY syscall_arg__scnprintf_strarray

/*
 * A sequence of strarrays, for argument spaces split into several
 * ranges (e.g. generic + Linux-specific fcntl commands).
 */
struct strarrays {
	int		nr_entries;
	struct strarray **entries;
};

/* Define 'strarrays__<array>' wrapping an array of strarray pointers. */
#define DEFINE_STRARRAYS(array) struct strarrays strarrays__##array = { \
	.nr_entries = ARRAY_SIZE(array), \
	.entries = array, \
}
372
373 size_t syscall_arg__scnprintf_strarrays(char *bf, size_t size,
374                                         struct syscall_arg *arg)
375 {
376         struct strarrays *sas = arg->parm;
377         int i;
378
379         for (i = 0; i < sas->nr_entries; ++i) {
380                 struct strarray *sa = sas->entries[i];
381                 int idx = arg->val - sa->offset;
382
383                 if (idx >= 0 && idx < sa->nr_entries) {
384                         if (sa->entries[idx] == NULL)
385                                 break;
386                         return scnprintf(bf, size, "%s", sa->entries[idx]);
387                 }
388         }
389
390         return scnprintf(bf, size, "%d", arg->val);
391 }
392
393 #ifndef AT_FDCWD
394 #define AT_FDCWD        -100
395 #endif
396
397 static size_t syscall_arg__scnprintf_fd_at(char *bf, size_t size,
398                                            struct syscall_arg *arg)
399 {
400         int fd = arg->val;
401
402         if (fd == AT_FDCWD)
403                 return scnprintf(bf, size, "CWD");
404
405         return syscall_arg__scnprintf_fd(bf, size, arg);
406 }
407
408 #define SCA_FDAT syscall_arg__scnprintf_fd_at
409
410 static size_t syscall_arg__scnprintf_close_fd(char *bf, size_t size,
411                                               struct syscall_arg *arg);
412
413 #define SCA_CLOSE_FD syscall_arg__scnprintf_close_fd
414
/* Format an argument as hex with a leading 0x (pointers, addresses). */
size_t syscall_arg__scnprintf_hex(char *bf, size_t size, struct syscall_arg *arg)
{
	return scnprintf(bf, size, "%#lx", arg->val);
}

/* Format an argument as a signed decimal int. */
size_t syscall_arg__scnprintf_int(char *bf, size_t size, struct syscall_arg *arg)
{
	return scnprintf(bf, size, "%d", arg->val);
}

/* Format an argument as a signed decimal long. */
size_t syscall_arg__scnprintf_long(char *bf, size_t size, struct syscall_arg *arg)
{
	return scnprintf(bf, size, "%ld", arg->val);
}
429
/*
 * Symbolic-name tables for enum-like syscall arguments, wrapped with
 * DEFINE_STRARRAY*() so SCA_STRARRAY can map values to names.  Entries
 * are indexed by value minus the strarray offset.
 */

/* bpf(2) commands, starting at BPF_MAP_CREATE == 0 */
static const char *bpf_cmd[] = {
	"MAP_CREATE", "MAP_LOOKUP_ELEM", "MAP_UPDATE_ELEM", "MAP_DELETE_ELEM",
	"MAP_GET_NEXT_KEY", "PROG_LOAD",
};
static DEFINE_STRARRAY(bpf_cmd);

/* epoll_ctl(2) ops: EPOLL_CTL_ADD starts at 1, hence the offset */
static const char *epoll_ctl_ops[] = { "ADD", "DEL", "MOD", };
static DEFINE_STRARRAY_OFFSET(epoll_ctl_ops, 1);

/* {get,set}itimer(2) 'which' values */
static const char *itimers[] = { "REAL", "VIRTUAL", "PROF", };
static DEFINE_STRARRAY(itimers);

/* keyctl(2) operations */
static const char *keyctl_options[] = {
	"GET_KEYRING_ID", "JOIN_SESSION_KEYRING", "UPDATE", "REVOKE", "CHOWN",
	"SETPERM", "DESCRIBE", "CLEAR", "LINK", "UNLINK", "SEARCH", "READ",
	"INSTANTIATE", "NEGATE", "SET_REQKEY_KEYRING", "SET_TIMEOUT",
	"ASSUME_AUTHORITY", "GET_SECURITY", "SESSION_TO_PARENT", "REJECT",
	"INSTANTIATE_IOV", "INVALIDATE", "GET_PERSISTENT",
};
static DEFINE_STRARRAY(keyctl_options);

/* lseek(2) whence values; DATA/HOLE only when the headers define them */
static const char *whences[] = { "SET", "CUR", "END",
#ifdef SEEK_DATA
"DATA",
#endif
#ifdef SEEK_HOLE
"HOLE",
#endif
};
static DEFINE_STRARRAY(whences);

/* fcntl(2) commands in the generic (non Linux-specific) range */
static const char *fcntl_cmds[] = {
	"DUPFD", "GETFD", "SETFD", "GETFL", "SETFL", "GETLK", "SETLK",
	"SETLKW", "SETOWN", "GETOWN", "SETSIG", "GETSIG", "GETLK64",
	"SETLK64", "SETLKW64", "SETOWN_EX", "GETOWN_EX",
	"GETOWNER_UIDS",
};
static DEFINE_STRARRAY(fcntl_cmds);

/* fcntl(2) commands starting at F_LINUX_SPECIFIC_BASE (1024) */
static const char *fcntl_linux_specific_cmds[] = {
	"SETLEASE", "GETLEASE", "NOTIFY", [5] = "CANCELLK", "DUPFD_CLOEXEC",
	"SETPIPE_SZ", "GETPIPE_SZ", "ADD_SEALS", "GET_SEALS",
	"GET_RW_HINT", "SET_RW_HINT", "GET_FILE_RW_HINT", "SET_FILE_RW_HINT",
};

static DEFINE_STRARRAY_OFFSET(fcntl_linux_specific_cmds, F_LINUX_SPECIFIC_BASE);

/* both fcntl ranges, scanned in order by SCA_FCNTL_CMD */
static struct strarray *fcntl_cmds_arrays[] = {
	&strarray__fcntl_cmds,
	&strarray__fcntl_linux_specific_cmds,
};

static DEFINE_STRARRAYS(fcntl_cmds_arrays);

/* {get,set}rlimit(2)/prlimit64(2) resources */
static const char *rlimit_resources[] = {
	"CPU", "FSIZE", "DATA", "STACK", "CORE", "RSS", "NPROC", "NOFILE",
	"MEMLOCK", "AS", "LOCKS", "SIGPENDING", "MSGQUEUE", "NICE", "RTPRIO",
	"RTTIME",
};
static DEFINE_STRARRAY(rlimit_resources);

/* rt_sigprocmask(2) 'how' values */
static const char *sighow[] = { "BLOCK", "UNBLOCK", "SETMASK", };
static DEFINE_STRARRAY(sighow);

/* clock_gettime(2) clock ids */
static const char *clockid[] = {
	"REALTIME", "MONOTONIC", "PROCESS_CPUTIME_ID", "THREAD_CPUTIME_ID",
	"MONOTONIC_RAW", "REALTIME_COARSE", "MONOTONIC_COARSE", "BOOTTIME",
	"REALTIME_ALARM", "BOOTTIME_ALARM", "SGI_CYCLE", "TAI"
};
static DEFINE_STRARRAY(clockid);

/* socket(2)/socketpair(2) address families (AF_*) */
static const char *socket_families[] = {
	"UNSPEC", "LOCAL", "INET", "AX25", "IPX", "APPLETALK", "NETROM",
	"BRIDGE", "ATMPVC", "X25", "INET6", "ROSE", "DECnet", "NETBEUI",
	"SECURITY", "KEY", "NETLINK", "PACKET", "ASH", "ECONET", "ATMSVC",
	"RDS", "SNA", "IRDA", "PPPOX", "WANPIPE", "LLC", "IB", "CAN", "TIPC",
	"BLUETOOTH", "IUCV", "RXRPC", "ISDN", "PHONET", "IEEE802154", "CAIF",
	"ALG", "NFC", "VSOCK",
};
static DEFINE_STRARRAY(socket_families);
510
511 static size_t syscall_arg__scnprintf_access_mode(char *bf, size_t size,
512                                                  struct syscall_arg *arg)
513 {
514         size_t printed = 0;
515         int mode = arg->val;
516
517         if (mode == F_OK) /* 0 */
518                 return scnprintf(bf, size, "F");
519 #define P_MODE(n) \
520         if (mode & n##_OK) { \
521                 printed += scnprintf(bf + printed, size - printed, "%s", #n); \
522                 mode &= ~n##_OK; \
523         }
524
525         P_MODE(R);
526         P_MODE(W);
527         P_MODE(X);
528 #undef P_MODE
529
530         if (mode)
531                 printed += scnprintf(bf + printed, size - printed, "|%#x", mode);
532
533         return printed;
534 }
535
536 #define SCA_ACCMODE syscall_arg__scnprintf_access_mode
537
538 static size_t syscall_arg__scnprintf_filename(char *bf, size_t size,
539                                               struct syscall_arg *arg);
540
541 #define SCA_FILENAME syscall_arg__scnprintf_filename
542
543 static size_t syscall_arg__scnprintf_pipe_flags(char *bf, size_t size,
544                                                 struct syscall_arg *arg)
545 {
546         int printed = 0, flags = arg->val;
547
548 #define P_FLAG(n) \
549         if (flags & O_##n) { \
550                 printed += scnprintf(bf + printed, size - printed, "%s%s", printed ? "|" : "", #n); \
551                 flags &= ~O_##n; \
552         }
553
554         P_FLAG(CLOEXEC);
555         P_FLAG(NONBLOCK);
556 #undef P_FLAG
557
558         if (flags)
559                 printed += scnprintf(bf + printed, size - printed, "%s%#x", printed ? "|" : "", flags);
560
561         return printed;
562 }
563
564 #define SCA_PIPE_FLAGS syscall_arg__scnprintf_pipe_flags
565
566 #ifndef GRND_NONBLOCK
567 #define GRND_NONBLOCK   0x0001
568 #endif
569 #ifndef GRND_RANDOM
570 #define GRND_RANDOM     0x0002
571 #endif
572
573 static size_t syscall_arg__scnprintf_getrandom_flags(char *bf, size_t size,
574                                                    struct syscall_arg *arg)
575 {
576         int printed = 0, flags = arg->val;
577
578 #define P_FLAG(n) \
579         if (flags & GRND_##n) { \
580                 printed += scnprintf(bf + printed, size - printed, "%s%s", printed ? "|" : "", #n); \
581                 flags &= ~GRND_##n; \
582         }
583
584         P_FLAG(RANDOM);
585         P_FLAG(NONBLOCK);
586 #undef P_FLAG
587
588         if (flags)
589                 printed += scnprintf(bf + printed, size - printed, "%s%#x", printed ? "|" : "", flags);
590
591         return printed;
592 }
593
594 #define SCA_GETRANDOM_FLAGS syscall_arg__scnprintf_getrandom_flags
595
/* Initializer for a syscall_arg_fmt entry backed by a strarray table. */
#define STRARRAY(name, array) \
	  { .scnprintf	= SCA_STRARRAY, \
	    .parm	= &strarray__##array, }
599
600 #include "trace/beauty/arch_errno_names.c"
601 #include "trace/beauty/eventfd.c"
602 #include "trace/beauty/futex_op.c"
603 #include "trace/beauty/futex_val3.c"
604 #include "trace/beauty/mmap.c"
605 #include "trace/beauty/mode_t.c"
606 #include "trace/beauty/msg_flags.c"
607 #include "trace/beauty/open_flags.c"
608 #include "trace/beauty/perf_event_open.c"
609 #include "trace/beauty/pid.c"
610 #include "trace/beauty/sched_policy.c"
611 #include "trace/beauty/seccomp.c"
612 #include "trace/beauty/signum.c"
613 #include "trace/beauty/socket_type.c"
614 #include "trace/beauty/waitid_options.c"
615
/*
 * How to pretty-print one syscall argument: the formatter callback,
 * its optional parameter (e.g. a strarray), the argument's name, and
 * whether a zero value should still be shown.
 */
struct syscall_arg_fmt {
	size_t	   (*scnprintf)(char *bf, size_t size, struct syscall_arg *arg);
	void	   *parm;
	const char *name;
	bool	   show_zero;
};
622
623 static struct syscall_fmt {
624         const char *name;
625         const char *alias;
626         struct syscall_arg_fmt arg[6];
627         u8         nr_args;
628         bool       errpid;
629         bool       timeout;
630         bool       hexret;
631 } syscall_fmts[] = {
632         { .name     = "access",
633           .arg = { [1] = { .scnprintf = SCA_ACCMODE,  /* mode */ }, }, },
634         { .name     = "bpf",
635           .arg = { [0] = STRARRAY(cmd, bpf_cmd), }, },
636         { .name     = "brk",        .hexret = true,
637           .arg = { [0] = { .scnprintf = SCA_HEX, /* brk */ }, }, },
638         { .name     = "clock_gettime",
639           .arg = { [0] = STRARRAY(clk_id, clockid), }, },
640         { .name     = "clone",      .errpid = true, .nr_args = 5,
641           .arg = { [0] = { .name = "flags",         .scnprintf = SCA_CLONE_FLAGS, },
642                    [1] = { .name = "child_stack",   .scnprintf = SCA_HEX, },
643                    [2] = { .name = "parent_tidptr", .scnprintf = SCA_HEX, },
644                    [3] = { .name = "child_tidptr",  .scnprintf = SCA_HEX, },
645                    [4] = { .name = "tls",           .scnprintf = SCA_HEX, }, }, },
646         { .name     = "close",
647           .arg = { [0] = { .scnprintf = SCA_CLOSE_FD, /* fd */ }, }, },
648         { .name     = "epoll_ctl",
649           .arg = { [1] = STRARRAY(op, epoll_ctl_ops), }, },
650         { .name     = "eventfd2",
651           .arg = { [1] = { .scnprintf = SCA_EFD_FLAGS, /* flags */ }, }, },
652         { .name     = "fchmodat",
653           .arg = { [0] = { .scnprintf = SCA_FDAT, /* fd */ }, }, },
654         { .name     = "fchownat",
655           .arg = { [0] = { .scnprintf = SCA_FDAT, /* fd */ }, }, },
656         { .name     = "fcntl",
657           .arg = { [1] = { .scnprintf = SCA_FCNTL_CMD, /* cmd */
658                            .parm      = &strarrays__fcntl_cmds_arrays,
659                            .show_zero = true, },
660                    [2] = { .scnprintf =  SCA_FCNTL_ARG, /* arg */ }, }, },
661         { .name     = "flock",
662           .arg = { [1] = { .scnprintf = SCA_FLOCK, /* cmd */ }, }, },
663         { .name     = "fstat", .alias = "newfstat", },
664         { .name     = "fstatat", .alias = "newfstatat", },
665         { .name     = "futex",
666           .arg = { [1] = { .scnprintf = SCA_FUTEX_OP, /* op */ },
667                    [5] = { .scnprintf = SCA_FUTEX_VAL3, /* val3 */ }, }, },
668         { .name     = "futimesat",
669           .arg = { [0] = { .scnprintf = SCA_FDAT, /* fd */ }, }, },
670         { .name     = "getitimer",
671           .arg = { [0] = STRARRAY(which, itimers), }, },
672         { .name     = "getpid",     .errpid = true, },
673         { .name     = "getpgid",    .errpid = true, },
674         { .name     = "getppid",    .errpid = true, },
675         { .name     = "getrandom",
676           .arg = { [2] = { .scnprintf = SCA_GETRANDOM_FLAGS, /* flags */ }, }, },
677         { .name     = "getrlimit",
678           .arg = { [0] = STRARRAY(resource, rlimit_resources), }, },
679         { .name     = "gettid",     .errpid = true, },
680         { .name     = "ioctl",
681           .arg = {
682 #if defined(__i386__) || defined(__x86_64__)
683 /*
684  * FIXME: Make this available to all arches.
685  */
686                    [1] = { .scnprintf = SCA_IOCTL_CMD, /* cmd */ },
687                    [2] = { .scnprintf = SCA_HEX, /* arg */ }, }, },
688 #else
689                    [2] = { .scnprintf = SCA_HEX, /* arg */ }, }, },
690 #endif
691         { .name     = "kcmp",       .nr_args = 5,
692           .arg = { [0] = { .name = "pid1",      .scnprintf = SCA_PID, },
693                    [1] = { .name = "pid2",      .scnprintf = SCA_PID, },
694                    [2] = { .name = "type",      .scnprintf = SCA_KCMP_TYPE, },
695                    [3] = { .name = "idx1",      .scnprintf = SCA_KCMP_IDX, },
696                    [4] = { .name = "idx2",      .scnprintf = SCA_KCMP_IDX, }, }, },
697         { .name     = "keyctl",
698           .arg = { [0] = STRARRAY(option, keyctl_options), }, },
699         { .name     = "kill",
700           .arg = { [1] = { .scnprintf = SCA_SIGNUM, /* sig */ }, }, },
701         { .name     = "linkat",
702           .arg = { [0] = { .scnprintf = SCA_FDAT, /* fd */ }, }, },
703         { .name     = "lseek",
704           .arg = { [2] = STRARRAY(whence, whences), }, },
705         { .name     = "lstat", .alias = "newlstat", },
706         { .name     = "madvise",
707           .arg = { [0] = { .scnprintf = SCA_HEX,      /* start */ },
708                    [2] = { .scnprintf = SCA_MADV_BHV, /* behavior */ }, }, },
709         { .name     = "mkdirat",
710           .arg = { [0] = { .scnprintf = SCA_FDAT, /* fd */ }, }, },
711         { .name     = "mknodat",
712           .arg = { [0] = { .scnprintf = SCA_FDAT, /* fd */ }, }, },
713         { .name     = "mlock",
714           .arg = { [0] = { .scnprintf = SCA_HEX, /* addr */ }, }, },
715         { .name     = "mlockall",
716           .arg = { [0] = { .scnprintf = SCA_HEX, /* addr */ }, }, },
717         { .name     = "mmap",       .hexret = true,
718 /* The standard mmap maps to old_mmap on s390x */
719 #if defined(__s390x__)
720         .alias = "old_mmap",
721 #endif
722           .arg = { [0] = { .scnprintf = SCA_HEX,        /* addr */ },
723                    [2] = { .scnprintf = SCA_MMAP_PROT,  /* prot */ },
724                    [3] = { .scnprintf = SCA_MMAP_FLAGS, /* flags */ }, }, },
725         { .name     = "mprotect",
726           .arg = { [0] = { .scnprintf = SCA_HEX,        /* start */ },
727                    [2] = { .scnprintf = SCA_MMAP_PROT,  /* prot */ }, }, },
728         { .name     = "mq_unlink",
729           .arg = { [0] = { .scnprintf = SCA_FILENAME, /* u_name */ }, }, },
730         { .name     = "mremap",     .hexret = true,
731           .arg = { [0] = { .scnprintf = SCA_HEX,          /* addr */ },
732                    [3] = { .scnprintf = SCA_MREMAP_FLAGS, /* flags */ },
733                    [4] = { .scnprintf = SCA_HEX,          /* new_addr */ }, }, },
734         { .name     = "munlock",
735           .arg = { [0] = { .scnprintf = SCA_HEX, /* addr */ }, }, },
736         { .name     = "munmap",
737           .arg = { [0] = { .scnprintf = SCA_HEX, /* addr */ }, }, },
738         { .name     = "name_to_handle_at",
739           .arg = { [0] = { .scnprintf = SCA_FDAT, /* dfd */ }, }, },
740         { .name     = "newfstatat",
741           .arg = { [0] = { .scnprintf = SCA_FDAT, /* dfd */ }, }, },
742         { .name     = "open",
743           .arg = { [1] = { .scnprintf = SCA_OPEN_FLAGS, /* flags */ }, }, },
744         { .name     = "open_by_handle_at",
745           .arg = { [0] = { .scnprintf = SCA_FDAT,       /* dfd */ },
746                    [2] = { .scnprintf = SCA_OPEN_FLAGS, /* flags */ }, }, },
747         { .name     = "openat",
748           .arg = { [0] = { .scnprintf = SCA_FDAT,       /* dfd */ },
749                    [2] = { .scnprintf = SCA_OPEN_FLAGS, /* flags */ }, }, },
750         { .name     = "perf_event_open",
751           .arg = { [2] = { .scnprintf = SCA_INT,        /* cpu */ },
752                    [3] = { .scnprintf = SCA_FD,         /* group_fd */ },
753                    [4] = { .scnprintf = SCA_PERF_FLAGS, /* flags */ }, }, },
754         { .name     = "pipe2",
755           .arg = { [1] = { .scnprintf = SCA_PIPE_FLAGS, /* flags */ }, }, },
756         { .name     = "pkey_alloc",
757           .arg = { [1] = { .scnprintf = SCA_PKEY_ALLOC_ACCESS_RIGHTS,   /* access_rights */ }, }, },
758         { .name     = "pkey_free",
759           .arg = { [0] = { .scnprintf = SCA_INT,        /* key */ }, }, },
760         { .name     = "pkey_mprotect",
761           .arg = { [0] = { .scnprintf = SCA_HEX,        /* start */ },
762                    [2] = { .scnprintf = SCA_MMAP_PROT,  /* prot */ },
763                    [3] = { .scnprintf = SCA_INT,        /* pkey */ }, }, },
764         { .name     = "poll", .timeout = true, },
765         { .name     = "ppoll", .timeout = true, },
766         { .name     = "prctl", .alias = "arch_prctl",
767           .arg = { [0] = { .scnprintf = SCA_PRCTL_OPTION, /* option */ },
768                    [1] = { .scnprintf = SCA_PRCTL_ARG2, /* arg2 */ },
769                    [2] = { .scnprintf = SCA_PRCTL_ARG3, /* arg3 */ }, }, },
770         { .name     = "pread", .alias = "pread64", },
771         { .name     = "preadv", .alias = "pread", },
772         { .name     = "prlimit64",
773           .arg = { [1] = STRARRAY(resource, rlimit_resources), }, },
774         { .name     = "pwrite", .alias = "pwrite64", },
775         { .name     = "readlinkat",
776           .arg = { [0] = { .scnprintf = SCA_FDAT, /* dfd */ }, }, },
777         { .name     = "recvfrom",
778           .arg = { [3] = { .scnprintf = SCA_MSG_FLAGS, /* flags */ }, }, },
779         { .name     = "recvmmsg",
780           .arg = { [3] = { .scnprintf = SCA_MSG_FLAGS, /* flags */ }, }, },
781         { .name     = "recvmsg",
782           .arg = { [2] = { .scnprintf = SCA_MSG_FLAGS, /* flags */ }, }, },
783         { .name     = "renameat",
784           .arg = { [0] = { .scnprintf = SCA_FDAT, /* dfd */ }, }, },
785         { .name     = "rt_sigaction",
786           .arg = { [0] = { .scnprintf = SCA_SIGNUM, /* sig */ }, }, },
787         { .name     = "rt_sigprocmask",
788           .arg = { [0] = STRARRAY(how, sighow), }, },
789         { .name     = "rt_sigqueueinfo",
790           .arg = { [1] = { .scnprintf = SCA_SIGNUM, /* sig */ }, }, },
791         { .name     = "rt_tgsigqueueinfo",
792           .arg = { [2] = { .scnprintf = SCA_SIGNUM, /* sig */ }, }, },
793         { .name     = "sched_setscheduler",
794           .arg = { [1] = { .scnprintf = SCA_SCHED_POLICY, /* policy */ }, }, },
795         { .name     = "seccomp",
796           .arg = { [0] = { .scnprintf = SCA_SECCOMP_OP,    /* op */ },
797                    [1] = { .scnprintf = SCA_SECCOMP_FLAGS, /* flags */ }, }, },
798         { .name     = "select", .timeout = true, },
799         { .name     = "sendmmsg",
800           .arg = { [3] = { .scnprintf = SCA_MSG_FLAGS, /* flags */ }, }, },
801         { .name     = "sendmsg",
802           .arg = { [2] = { .scnprintf = SCA_MSG_FLAGS, /* flags */ }, }, },
803         { .name     = "sendto",
804           .arg = { [3] = { .scnprintf = SCA_MSG_FLAGS, /* flags */ }, }, },
805         { .name     = "set_tid_address", .errpid = true, },
806         { .name     = "setitimer",
807           .arg = { [0] = STRARRAY(which, itimers), }, },
808         { .name     = "setrlimit",
809           .arg = { [0] = STRARRAY(resource, rlimit_resources), }, },
810         { .name     = "socket",
811           .arg = { [0] = STRARRAY(family, socket_families),
812                    [1] = { .scnprintf = SCA_SK_TYPE, /* type */ },
813                    [2] = { .scnprintf = SCA_SK_PROTO, /* protocol */ }, }, },
814         { .name     = "socketpair",
815           .arg = { [0] = STRARRAY(family, socket_families),
816                    [1] = { .scnprintf = SCA_SK_TYPE, /* type */ },
817                    [2] = { .scnprintf = SCA_SK_PROTO, /* protocol */ }, }, },
818         { .name     = "stat", .alias = "newstat", },
819         { .name     = "statx",
820           .arg = { [0] = { .scnprintf = SCA_FDAT,        /* fdat */ },
821                    [2] = { .scnprintf = SCA_STATX_FLAGS, /* flags */ } ,
822                    [3] = { .scnprintf = SCA_STATX_MASK,  /* mask */ }, }, },
823         { .name     = "swapoff",
824           .arg = { [0] = { .scnprintf = SCA_FILENAME, /* specialfile */ }, }, },
825         { .name     = "swapon",
826           .arg = { [0] = { .scnprintf = SCA_FILENAME, /* specialfile */ }, }, },
827         { .name     = "symlinkat",
828           .arg = { [0] = { .scnprintf = SCA_FDAT, /* dfd */ }, }, },
829         { .name     = "tgkill",
830           .arg = { [2] = { .scnprintf = SCA_SIGNUM, /* sig */ }, }, },
831         { .name     = "tkill",
832           .arg = { [1] = { .scnprintf = SCA_SIGNUM, /* sig */ }, }, },
833         { .name     = "uname", .alias = "newuname", },
834         { .name     = "unlinkat",
835           .arg = { [0] = { .scnprintf = SCA_FDAT, /* dfd */ }, }, },
836         { .name     = "utimensat",
837           .arg = { [0] = { .scnprintf = SCA_FDAT, /* dirfd */ }, }, },
838         { .name     = "wait4",      .errpid = true,
839           .arg = { [2] = { .scnprintf = SCA_WAITID_OPTIONS, /* options */ }, }, },
840         { .name     = "waitid",     .errpid = true,
841           .arg = { [3] = { .scnprintf = SCA_WAITID_OPTIONS, /* options */ }, }, },
842 };
843
844 static int syscall_fmt__cmp(const void *name, const void *fmtp)
845 {
846         const struct syscall_fmt *fmt = fmtp;
847         return strcmp(name, fmt->name);
848 }
849
850 static struct syscall_fmt *syscall_fmt__find(const char *name)
851 {
852         const int nmemb = ARRAY_SIZE(syscall_fmts);
853         return bsearch(name, syscall_fmts, nmemb, sizeof(struct syscall_fmt), syscall_fmt__cmp);
854 }
855
856 /*
857  * is_exit: is this "exit" or "exit_group"?
858  * is_open: is this "open" or "openat"? To associate the fd returned in sys_exit with the pathname in sys_enter.
859  */
struct syscall {
	struct event_format *tp_format;	/* "syscalls:sys_enter_<name>" tracepoint format */
	int                 nr_args;	/* arg count, minus the leading '__syscall_nr'/'nr' field when dropped */
	bool                is_exit;	/* "exit" or "exit_group" */
	bool                is_open;	/* "open" or "openat" */
	struct format_field *args;	/* first real argument field of tp_format */
	const char          *name;	/* canonical name from the syscall table */
	struct syscall_fmt  *fmt;	/* optional overrides from syscall_fmts[], may be NULL */
	struct syscall_arg_fmt *arg_fmt; /* per-argument formatters, nr_args entries */
};
870
871 /*
872  * We need to have this 'calculated' boolean because in some cases we really
873  * don't know what is the duration of a syscall, for instance, when we start
874  * a session and some threads are waiting for a syscall to finish, say 'poll',
875  * in which case all we can do is to print "( ? ) for duration and for the
876  * start timestamp.
877  */
878 static size_t fprintf_duration(unsigned long t, bool calculated, FILE *fp)
879 {
880         double duration = (double)t / NSEC_PER_MSEC;
881         size_t printed = fprintf(fp, "(");
882
883         if (!calculated)
884                 printed += fprintf(fp, "         ");
885         else if (duration >= 1.0)
886                 printed += color_fprintf(fp, PERF_COLOR_RED, "%6.3f ms", duration);
887         else if (duration >= 0.01)
888                 printed += color_fprintf(fp, PERF_COLOR_YELLOW, "%6.3f ms", duration);
889         else
890                 printed += color_fprintf(fp, PERF_COLOR_NORMAL, "%6.3f ms", duration);
891         return printed + fprintf(fp, "): ");
892 }
893
894 /**
895  * filename.ptr: The filename char pointer that will be vfs_getname'd
896  * filename.entry_str_pos: Where to insert the string translated from
897  *                         filename.ptr by the vfs_getname tracepoint/kprobe.
898  * ret_scnprintf: syscall args may set this to a different syscall return
899  *                formatter, for instance, fcntl may return fds, file flags, etc.
900  */
struct thread_trace {
	u64		  entry_time;	/* timestamp of the last sys_enter seen */
	bool		  entry_pending;	/* sys_enter formatted, awaiting its sys_exit */
	unsigned long	  nr_events;	/* events accounted to this thread */
	unsigned long	  pfmaj, pfmin;	/* fault counters — presumably major/minor page faults (TRACE_PFMAJ/TRACE_PFMIN) */
	char		  *entry_str;	/* buffer where the sys_enter line is built */
	double		  runtime_ms;
	size_t		  (*ret_scnprintf)(char *bf, size_t size, struct syscall_arg *arg); /* see comment above */
	struct {
		unsigned long ptr;	/* see comment above: pointer to be vfs_getname'd */
		short int     entry_str_pos; /* offset in entry_str where the name goes */
		bool	      pending_open;
		unsigned int  namelen;
		char	      *name;	/* last resolved filename */
	} filename;
	struct {
		int	  max;		/* highest fd slot allocated, -1 when table empty */
		char	  **table;	/* fd -> pathname cache */
	} paths;

	struct intlist *syscall_stats;	/* per syscall id, priv is a struct stats */
};
923
924 static struct thread_trace *thread_trace__new(void)
925 {
926         struct thread_trace *ttrace =  zalloc(sizeof(struct thread_trace));
927
928         if (ttrace)
929                 ttrace->paths.max = -1;
930
931         ttrace->syscall_stats = intlist__new(NULL);
932
933         return ttrace;
934 }
935
936 static struct thread_trace *thread__trace(struct thread *thread, FILE *fp)
937 {
938         struct thread_trace *ttrace;
939
940         if (thread == NULL)
941                 goto fail;
942
943         if (thread__priv(thread) == NULL)
944                 thread__set_priv(thread, thread_trace__new());
945
946         if (thread__priv(thread) == NULL)
947                 goto fail;
948
949         ttrace = thread__priv(thread);
950         ++ttrace->nr_events;
951
952         return ttrace;
953 fail:
954         color_fprintf(fp, PERF_COLOR_RED,
955                       "WARNING: not enough memory, dropping samples!\n");
956         return NULL;
957 }
958
959
960 void syscall_arg__set_ret_scnprintf(struct syscall_arg *arg,
961                                     size_t (*ret_scnprintf)(char *bf, size_t size, struct syscall_arg *arg))
962 {
963         struct thread_trace *ttrace = thread__priv(arg->thread);
964
965         ttrace->ret_scnprintf = ret_scnprintf;
966 }
967
/* Bit flags selecting fault flavors — presumably major/minor page faults; confirm at users. */
#define TRACE_PFMAJ		(1 << 0)
#define TRACE_PFMIN		(1 << 1)

/* Size of the per-thread buffer where the sys_enter line is formatted. */
static const size_t trace__entry_str_size = 2048;
972
973 static int trace__set_fd_pathname(struct thread *thread, int fd, const char *pathname)
974 {
975         struct thread_trace *ttrace = thread__priv(thread);
976
977         if (fd > ttrace->paths.max) {
978                 char **npath = realloc(ttrace->paths.table, (fd + 1) * sizeof(char *));
979
980                 if (npath == NULL)
981                         return -1;
982
983                 if (ttrace->paths.max != -1) {
984                         memset(npath + ttrace->paths.max + 1, 0,
985                                (fd - ttrace->paths.max) * sizeof(char *));
986                 } else {
987                         memset(npath, 0, (fd + 1) * sizeof(char *));
988                 }
989
990                 ttrace->paths.table = npath;
991                 ttrace->paths.max   = fd;
992         }
993
994         ttrace->paths.table[fd] = strdup(pathname);
995
996         return ttrace->paths.table[fd] != NULL ? 0 : -1;
997 }
998
999 static int thread__read_fd_path(struct thread *thread, int fd)
1000 {
1001         char linkname[PATH_MAX], pathname[PATH_MAX];
1002         struct stat st;
1003         int ret;
1004
1005         if (thread->pid_ == thread->tid) {
1006                 scnprintf(linkname, sizeof(linkname),
1007                           "/proc/%d/fd/%d", thread->pid_, fd);
1008         } else {
1009                 scnprintf(linkname, sizeof(linkname),
1010                           "/proc/%d/task/%d/fd/%d", thread->pid_, thread->tid, fd);
1011         }
1012
1013         if (lstat(linkname, &st) < 0 || st.st_size + 1 > (off_t)sizeof(pathname))
1014                 return -1;
1015
1016         ret = readlink(linkname, pathname, sizeof(pathname));
1017
1018         if (ret < 0 || ret > st.st_size)
1019                 return -1;
1020
1021         pathname[ret] = '\0';
1022         return trace__set_fd_pathname(thread, fd, pathname);
1023 }
1024
1025 static const char *thread__fd_path(struct thread *thread, int fd,
1026                                    struct trace *trace)
1027 {
1028         struct thread_trace *ttrace = thread__priv(thread);
1029
1030         if (ttrace == NULL)
1031                 return NULL;
1032
1033         if (fd < 0)
1034                 return NULL;
1035
1036         if ((fd > ttrace->paths.max || ttrace->paths.table[fd] == NULL)) {
1037                 if (!trace->live)
1038                         return NULL;
1039                 ++trace->stats.proc_getname;
1040                 if (thread__read_fd_path(thread, fd))
1041                         return NULL;
1042         }
1043
1044         return ttrace->paths.table[fd];
1045 }
1046
1047 size_t syscall_arg__scnprintf_fd(char *bf, size_t size, struct syscall_arg *arg)
1048 {
1049         int fd = arg->val;
1050         size_t printed = scnprintf(bf, size, "%d", fd);
1051         const char *path = thread__fd_path(arg->thread, fd, arg->trace);
1052
1053         if (path)
1054                 printed += scnprintf(bf + printed, size - printed, "<%s>", path);
1055
1056         return printed;
1057 }
1058
1059 size_t pid__scnprintf_fd(struct trace *trace, pid_t pid, int fd, char *bf, size_t size)
1060 {
1061         size_t printed = scnprintf(bf, size, "%d", fd);
1062         struct thread *thread = machine__find_thread(trace->host, pid, pid);
1063
1064         if (thread) {
1065                 const char *path = thread__fd_path(thread, fd, trace);
1066
1067                 if (path)
1068                         printed += scnprintf(bf + printed, size - printed, "<%s>", path);
1069
1070                 thread__put(thread);
1071         }
1072
1073         return printed;
1074 }
1075
1076 static size_t syscall_arg__scnprintf_close_fd(char *bf, size_t size,
1077                                               struct syscall_arg *arg)
1078 {
1079         int fd = arg->val;
1080         size_t printed = syscall_arg__scnprintf_fd(bf, size, arg);
1081         struct thread_trace *ttrace = thread__priv(arg->thread);
1082
1083         if (ttrace && fd >= 0 && fd <= ttrace->paths.max)
1084                 zfree(&ttrace->paths.table[fd]);
1085
1086         return printed;
1087 }
1088
1089 static void thread__set_filename_pos(struct thread *thread, const char *bf,
1090                                      unsigned long ptr)
1091 {
1092         struct thread_trace *ttrace = thread__priv(thread);
1093
1094         ttrace->filename.ptr = ptr;
1095         ttrace->filename.entry_str_pos = bf - ttrace->entry_str;
1096 }
1097
1098 static size_t syscall_arg__scnprintf_filename(char *bf, size_t size,
1099                                               struct syscall_arg *arg)
1100 {
1101         unsigned long ptr = arg->val;
1102
1103         if (!arg->trace->vfs_getname)
1104                 return scnprintf(bf, size, "%#x", ptr);
1105
1106         thread__set_filename_pos(arg->thread, bf, ptr);
1107         return 0;
1108 }
1109
1110 static bool trace__filter_duration(struct trace *trace, double t)
1111 {
1112         return t < (trace->duration_filter * NSEC_PER_MSEC);
1113 }
1114
1115 static size_t __trace__fprintf_tstamp(struct trace *trace, u64 tstamp, FILE *fp)
1116 {
1117         double ts = (double)(tstamp - trace->base_time) / NSEC_PER_MSEC;
1118
1119         return fprintf(fp, "%10.3f ", ts);
1120 }
1121
1122 /*
1123  * We're handling tstamp=0 as an undefined tstamp, i.e. like when we are
1124  * using ttrace->entry_time for a thread that receives a sys_exit without
1125  * first having received a sys_enter ("poll" issued before tracing session
1126  * starts, lost sys_enter exit due to ring buffer overflow).
1127  */
1128 static size_t trace__fprintf_tstamp(struct trace *trace, u64 tstamp, FILE *fp)
1129 {
1130         if (tstamp > 0)
1131                 return __trace__fprintf_tstamp(trace, tstamp, fp);
1132
1133         return fprintf(fp, "         ? ");
1134 }
1135
/*
 * Set from the signal handler and polled by the main loop. 'volatile
 * sig_atomic_t' is the only object type the C standard guarantees may be
 * written from a signal handler and observed coherently afterwards
 * (plain bool gives no such guarantee — CERT SIG31-C).
 */
static volatile sig_atomic_t done;
static volatile sig_atomic_t interrupted;

static void sig_handler(int sig)
{
	done = 1;
	/* Remember whether we stopped due to Ctrl-C specifically. */
	interrupted = sig == SIGINT;
}
1144
1145 static size_t trace__fprintf_entry_head(struct trace *trace, struct thread *thread,
1146                                         u64 duration, bool duration_calculated, u64 tstamp, FILE *fp)
1147 {
1148         size_t printed = trace__fprintf_tstamp(trace, tstamp, fp);
1149         printed += fprintf_duration(duration, duration_calculated, fp);
1150
1151         if (trace->multiple_threads) {
1152                 if (trace->show_comm)
1153                         printed += fprintf(fp, "%.14s/", thread__comm_str(thread));
1154                 printed += fprintf(fp, "%d ", thread->tid);
1155         }
1156
1157         return printed;
1158 }
1159
1160 static int trace__process_event(struct trace *trace, struct machine *machine,
1161                                 union perf_event *event, struct perf_sample *sample)
1162 {
1163         int ret = 0;
1164
1165         switch (event->header.type) {
1166         case PERF_RECORD_LOST:
1167                 color_fprintf(trace->output, PERF_COLOR_RED,
1168                               "LOST %" PRIu64 " events!\n", event->lost.lost);
1169                 ret = machine__process_lost_event(machine, event, sample);
1170                 break;
1171         default:
1172                 ret = machine__process_event(machine, event, sample);
1173                 break;
1174         }
1175
1176         return ret;
1177 }
1178
1179 static int trace__tool_process(struct perf_tool *tool,
1180                                union perf_event *event,
1181                                struct perf_sample *sample,
1182                                struct machine *machine)
1183 {
1184         struct trace *trace = container_of(tool, struct trace, tool);
1185         return trace__process_event(trace, machine, event, sample);
1186 }
1187
1188 static char *trace__machine__resolve_kernel_addr(void *vmachine, unsigned long long *addrp, char **modp)
1189 {
1190         struct machine *machine = vmachine;
1191
1192         if (machine->kptr_restrict_warned)
1193                 return NULL;
1194
1195         if (symbol_conf.kptr_restrict) {
1196                 pr_warning("Kernel address maps (/proc/{kallsyms,modules}) are restricted.\n\n"
1197                            "Check /proc/sys/kernel/kptr_restrict.\n\n"
1198                            "Kernel samples will not be resolved.\n");
1199                 machine->kptr_restrict_warned = true;
1200                 return NULL;
1201         }
1202
1203         return machine__resolve_kernel_addr(vmachine, addrp, modp);
1204 }
1205
/*
 * Bring up symbol resolution for live tracing: init the symbol subsystem,
 * create the host machine and pre-synthesize the existing threads we are
 * going to trace. Returns 0 on success, negative errno on failure.
 */
static int trace__symbols_init(struct trace *trace, struct perf_evlist *evlist)
{
	int err = symbol__init(NULL);

	if (err)
		return err;

	trace->host = machine__new_host();
	if (trace->host == NULL)
		return -ENOMEM;

	/* Use the resolver that warns (once) instead of failing under kptr_restrict. */
	err = trace_event__register_resolver(trace->host, trace__machine__resolve_kernel_addr);
	if (err < 0)
		goto out;

	err = __machine__synthesize_threads(trace->host, &trace->tool, &trace->opts.target,
					    evlist->threads, trace__tool_process, false,
					    trace->opts.proc_map_timeout, 1);
out:
	/* NOTE(review): trace->host looks leaked on the error paths here — confirm. */
	if (err)
		symbol__exit();

	return err;
}
1230
/* Tear down what trace__symbols_init() set up: host machine, then symbols. */
static void trace__symbols__exit(struct trace *trace)
{
	machine__exit(trace->host);
	trace->host = NULL;

	symbol__exit();
}
1238
1239 static int syscall__alloc_arg_fmts(struct syscall *sc, int nr_args)
1240 {
1241         int idx;
1242
1243         if (nr_args == 6 && sc->fmt && sc->fmt->nr_args != 0)
1244                 nr_args = sc->fmt->nr_args;
1245
1246         sc->arg_fmt = calloc(nr_args, sizeof(*sc->arg_fmt));
1247         if (sc->arg_fmt == NULL)
1248                 return -1;
1249
1250         for (idx = 0; idx < nr_args; ++idx) {
1251                 if (sc->fmt)
1252                         sc->arg_fmt[idx] = sc->fmt->arg[idx];
1253         }
1254
1255         sc->nr_args = nr_args;
1256         return 0;
1257 }
1258
/*
 * Pick a default pretty-printer for every tracepoint argument the
 * syscall_fmts[] entry didn't already cover, keying off the field's C type
 * and name as exported in the tracepoint /format file.
 */
static int syscall__set_arg_fmts(struct syscall *sc)
{
	struct format_field *field;
	int idx = 0, len;

	for (field = sc->args; field; field = field->next, ++idx) {
		/* An explicit formatter from the hand-written table wins. */
		if (sc->fmt && sc->fmt->arg[idx].scnprintf)
			continue;

		/* Filename-ish string pointers get the vfs_getname treatment. */
		if (strcmp(field->type, "const char *") == 0 &&
			 (strcmp(field->name, "filename") == 0 ||
			  strcmp(field->name, "path") == 0 ||
			  strcmp(field->name, "pathname") == 0))
			sc->arg_fmt[idx].scnprintf = SCA_FILENAME;
		else if (field->flags & FIELD_IS_POINTER)
			sc->arg_fmt[idx].scnprintf = syscall_arg__scnprintf_hex;
		else if (strcmp(field->type, "pid_t") == 0)
			sc->arg_fmt[idx].scnprintf = SCA_PID;
		else if (strcmp(field->type, "umode_t") == 0)
			sc->arg_fmt[idx].scnprintf = SCA_MODE_T;
		else if ((strcmp(field->type, "int") == 0 ||
			  strcmp(field->type, "unsigned int") == 0 ||
			  strcmp(field->type, "long") == 0) &&
			 (len = strlen(field->name)) >= 2 &&
			 strcmp(field->name + len - 2, "fd") == 0) {
			/*
			 * /sys/kernel/tracing/events/syscalls/sys_enter*
			 * egrep 'field:.*fd;' .../format|sed -r 's/.*field:([a-z ]+) [a-z_]*fd.+/\1/g'|sort|uniq -c
			 * 65 int
			 * 23 unsigned int
			 * 7 unsigned long
			 */
			sc->arg_fmt[idx].scnprintf = SCA_FD;
		}
	}

	return 0;
}
1297
/*
 * Lazily fill trace->syscalls.table[id]: resolve the syscall name, its
 * hand-written format overrides and its sys_enter tracepoint format,
 * growing the table as needed. Returns 0 on success, -1 when the id is
 * unknown or the tracepoint format can't be read.
 */
static int trace__read_syscall_info(struct trace *trace, int id)
{
	char tp_name[128];
	struct syscall *sc;
	const char *name = syscalltbl__name(trace->sctbl, id);

	if (name == NULL)
		return -1;

	/* Grow the table on demand, zero-filling the newly exposed slots. */
	if (id > trace->syscalls.max) {
		struct syscall *nsyscalls = realloc(trace->syscalls.table, (id + 1) * sizeof(*sc));

		if (nsyscalls == NULL)
			return -1;

		if (trace->syscalls.max != -1) {
			memset(nsyscalls + trace->syscalls.max + 1, 0,
			       (id - trace->syscalls.max) * sizeof(*sc));
		} else {
			/* First growth: table starts with max == -1, clear everything. */
			memset(nsyscalls, 0, (id + 1) * sizeof(*sc));
		}

		trace->syscalls.table = nsyscalls;
		trace->syscalls.max   = id;
	}

	sc = trace->syscalls.table + id;
	sc->name = name;

	sc->fmt  = syscall_fmt__find(sc->name);

	snprintf(tp_name, sizeof(tp_name), "sys_enter_%s", sc->name);
	sc->tp_format = trace_event__tp_format("syscalls", tp_name);

	/* Retry under the table's alias, e.g. "pwrite" -> "pwrite64". */
	if (IS_ERR(sc->tp_format) && sc->fmt && sc->fmt->alias) {
		snprintf(tp_name, sizeof(tp_name), "sys_enter_%s", sc->fmt->alias);
		sc->tp_format = trace_event__tp_format("syscalls", tp_name);
	}

	/* No tracepoint format: fall back to the generic 6-argument maximum. */
	if (syscall__alloc_arg_fmts(sc, IS_ERR(sc->tp_format) ? 6 : sc->tp_format->format.nr_fields))
		return -1;

	if (IS_ERR(sc->tp_format))
		return -1;

	sc->args = sc->tp_format->format.fields;
	/*
	 * We need to check and discard the first variable '__syscall_nr'
	 * or 'nr' that mean the syscall number. It is needless here.
	 * So drop '__syscall_nr' or 'nr' field but does not exist on older kernels.
	 */
	if (sc->args && (!strcmp(sc->args->name, "__syscall_nr") || !strcmp(sc->args->name, "nr"))) {
		sc->args = sc->args->next;
		--sc->nr_args;
	}

	sc->is_exit = !strcmp(name, "exit_group") || !strcmp(name, "exit");
	sc->is_open = !strcmp(name, "open") || !strcmp(name, "openat");

	return syscall__set_arg_fmts(sc);
}
1359
/*
 * Translate the user-supplied event-qualifier strlist of syscall names
 * (exact names or globs) into the array of syscall ids in
 * trace->ev_qualifier_ids. Returns 0 on success; on error prints every
 * offending name and returns -EINVAL/-ENOMEM with the ids array freed.
 */
static int trace__validate_ev_qualifier(struct trace *trace)
{
	int err = 0, i;
	size_t nr_allocated;
	struct str_node *pos;

	/* Start with one slot per name; glob matches may force growth below. */
	trace->ev_qualifier_ids.nr = strlist__nr_entries(trace->ev_qualifier);
	trace->ev_qualifier_ids.entries = malloc(trace->ev_qualifier_ids.nr *
						 sizeof(trace->ev_qualifier_ids.entries[0]));

	if (trace->ev_qualifier_ids.entries == NULL) {
		fputs("Error:\tNot enough memory for allocating events qualifier ids\n",
		       trace->output);
		err = -EINVAL;
		goto out;
	}

	nr_allocated = trace->ev_qualifier_ids.nr;
	i = 0;

	strlist__for_each_entry(pos, trace->ev_qualifier) {
		const char *sc = pos->s;
		int id = syscalltbl__id(trace->sctbl, sc), match_next = -1;

		if (id < 0) {
			/* Not an exact syscall name: try it as a glob. */
			id = syscalltbl__strglobmatch_first(trace->sctbl, sc, &match_next);
			if (id >= 0)
				goto matches;

			/* Accumulate every invalid name into one error line. */
			if (err == 0) {
				fputs("Error:\tInvalid syscall ", trace->output);
				err = -EINVAL;
			} else {
				fputs(", ", trace->output);
			}

			fputs(sc, trace->output);
		}
matches:
		trace->ev_qualifier_ids.entries[i++] = id;
		if (match_next == -1)
			continue;

		/* The glob matched: collect all further syscalls it matches too. */
		while (1) {
			id = syscalltbl__strglobmatch_next(trace->sctbl, sc, &match_next);
			if (id < 0)
				break;
			if (nr_allocated == trace->ev_qualifier_ids.nr) {
				void *entries;

				nr_allocated += 8;
				entries = realloc(trace->ev_qualifier_ids.entries,
						  nr_allocated * sizeof(trace->ev_qualifier_ids.entries[0]));
				if (entries == NULL) {
					err = -ENOMEM;
					fputs("\nError:\t Not enough memory for parsing\n", trace->output);
					goto out_free;
				}
				trace->ev_qualifier_ids.entries = entries;
			}
			trace->ev_qualifier_ids.nr++;
			trace->ev_qualifier_ids.entries[i++] = id;
		}
	}

	if (err < 0) {
		fputs("\nHint:\ttry 'perf list syscalls:sys_enter_*'"
		      "\nHint:\tand: 'man syscalls'\n", trace->output);
out_free:
		zfree(&trace->ev_qualifier_ids.entries);
		trace->ev_qualifier_ids.nr = 0;
	}
out:
	return err;
}
1435
1436 /*
1437  * args is to be interpreted as a series of longs but we need to handle
1438  * 8-byte unaligned accesses. args points to raw_data within the event
1439  * and raw_data is guaranteed to be 8-byte unaligned because it is
1440  * preceded by raw_size which is a u32. So we need to copy args to a temp
1441  * variable to read it. Most notably this avoids extended load instructions
1442  * on unaligned addresses
1443  */
1444 unsigned long syscall_arg__val(struct syscall_arg *arg, u8 idx)
1445 {
1446         unsigned long val;
1447         unsigned char *p = arg->args + sizeof(unsigned long) * idx;
1448
1449         memcpy(&val, p, sizeof(val));
1450         return val;
1451 }
1452
1453 static size_t syscall__scnprintf_name(struct syscall *sc, char *bf, size_t size,
1454                                       struct syscall_arg *arg)
1455 {
1456         if (sc->arg_fmt && sc->arg_fmt[arg->idx].name)
1457                 return scnprintf(bf, size, "%s: ", sc->arg_fmt[arg->idx].name);
1458
1459         return scnprintf(bf, size, "arg%d: ", arg->idx);
1460 }
1461
1462 static size_t syscall__scnprintf_val(struct syscall *sc, char *bf, size_t size,
1463                                      struct syscall_arg *arg, unsigned long val)
1464 {
1465         if (sc->arg_fmt && sc->arg_fmt[arg->idx].scnprintf) {
1466                 arg->val = val;
1467                 if (sc->arg_fmt[arg->idx].parm)
1468                         arg->parm = sc->arg_fmt[arg->idx].parm;
1469                 return sc->arg_fmt[arg->idx].scnprintf(bf, size, arg);
1470         }
1471         return scnprintf(bf, size, "%ld", val);
1472 }
1473
/*
 * Format all of a syscall's arguments from the raw 'args' payload into 'bf'.
 * Walks the tracepoint field list when it was readable, otherwise falls back
 * to printing sc->nr_args raw values.
 */
static size_t syscall__scnprintf_args(struct syscall *sc, char *bf, size_t size,
				      unsigned char *args, struct trace *trace,
				      struct thread *thread)
{
	size_t printed = 0;
	unsigned long val;
	u8 bit = 1;	/* tracks arg.idx as a bitmask to test against arg.mask */
	struct syscall_arg arg = {
		.args	= args,
		.idx	= 0,
		.mask	= 0,
		.trace  = trace,
		.thread = thread,
	};
	struct thread_trace *ttrace = thread__priv(thread);

	/*
	 * Things like fcntl will set this in its 'cmd' formatter to pick the
	 * right formatter for the return value (an fd? file flags?), which is
	 * not needed for syscalls that always return a given type, say an fd.
	 */
	ttrace->ret_scnprintf = NULL;

	if (sc->args != NULL) {
		struct format_field *field;

		for (field = sc->args; field;
		     field = field->next, ++arg.idx, bit <<= 1) {
			/* A formatter may consume several args and mask the extras. */
			if (arg.mask & bit)
				continue;

			val = syscall_arg__val(&arg, arg.idx);

			/*
			 * Suppress this argument if its value is zero and
			 * and we don't have a string associated in an
			 * strarray for it.
			 */
			if (val == 0 &&
			    !(sc->arg_fmt &&
			      (sc->arg_fmt[arg.idx].show_zero ||
			       sc->arg_fmt[arg.idx].scnprintf == SCA_STRARRAY ||
			       sc->arg_fmt[arg.idx].scnprintf == SCA_STRARRAYS) &&
			      sc->arg_fmt[arg.idx].parm))
				continue;

			printed += scnprintf(bf + printed, size - printed,
					     "%s%s: ", printed ? ", " : "", field->name);
			printed += syscall__scnprintf_val(sc, bf + printed, size - printed, &arg, val);
		}
	} else if (IS_ERR(sc->tp_format)) {
		/*
		 * If we managed to read the tracepoint /format file, then we
		 * may end up not having any args, like with gettid(), so only
		 * print the raw args when we didn't manage to read it.
		 */
		while (arg.idx < sc->nr_args) {
			if (arg.mask & bit)
				goto next_arg;
			val = syscall_arg__val(&arg, arg.idx);
			if (printed)
				printed += scnprintf(bf + printed, size - printed, ", ");
			printed += syscall__scnprintf_name(sc, bf + printed, size - printed, &arg);
			printed += syscall__scnprintf_val(sc, bf + printed, size - printed, &arg, val);
next_arg:
			++arg.idx;
			bit <<= 1;
		}
	}

	return printed;
}
1546
/* Signature of the per-tracepoint sample handlers (sys_enter, sys_exit, ...). */
typedef int (*tracepoint_handler)(struct trace *trace, struct perf_evsel *evsel,
				  union perf_event *event,
				  struct perf_sample *sample);
1550
/*
 * Return the syscall descriptor for 'id', reading its info into the table on
 * first use. Returns NULL for invalid ids or when the info can't be read,
 * with diagnostics gated behind increasing verbosity levels.
 */
static struct syscall *trace__syscall_info(struct trace *trace,
					   struct perf_evsel *evsel, int id)
{

	if (id < 0) {

		/*
		 * XXX: Noticed on x86_64, reproduced as far back as 3.0.36, haven't tried
		 * before that, leaving at a higher verbosity level till that is
		 * explained. Reproduced with plain ftrace with:
		 *
		 * echo 1 > /t/events/raw_syscalls/sys_exit/enable
		 * grep "NR -1 " /t/trace_pipe
		 *
		 * After generating some load on the machine.
		 */
		if (verbose > 1) {
			static u64 n;
			fprintf(trace->output, "Invalid syscall %d id, skipping (%s, %" PRIu64 ") ...\n",
				id, perf_evsel__name(evsel), ++n);
		}
		return NULL;
	}

	/* Slot empty or table too small: read this syscall's info now. */
	if ((id > trace->syscalls.max || trace->syscalls.table[id].name == NULL) &&
	    trace__read_syscall_info(trace, id))
		goto out_cant_read;

	/* Re-check: the read above may still have left the slot unfilled. */
	if ((id > trace->syscalls.max || trace->syscalls.table[id].name == NULL))
		goto out_cant_read;

	return &trace->syscalls.table[id];

out_cant_read:
	if (verbose > 0) {
		fprintf(trace->output, "Problems reading syscall %d", id);
		if (id <= trace->syscalls.max && trace->syscalls.table[id].name != NULL)
			fprintf(trace->output, "(%s)", trace->syscalls.table[id].name);
		fputs(" information\n", trace->output);
	}
	return NULL;
}
1593
1594 static void thread__update_stats(struct thread_trace *ttrace,
1595                                  int id, struct perf_sample *sample)
1596 {
1597         struct int_node *inode;
1598         struct stats *stats;
1599         u64 duration = 0;
1600
1601         inode = intlist__findnew(ttrace->syscall_stats, id);
1602         if (inode == NULL)
1603                 return;
1604
1605         stats = inode->priv;
1606         if (stats == NULL) {
1607                 stats = malloc(sizeof(struct stats));
1608                 if (stats == NULL)
1609                         return;
1610                 init_stats(stats);
1611                 inode->priv = stats;
1612         }
1613
1614         if (ttrace->entry_time && sample->time > ttrace->entry_time)
1615                 duration = sample->time - ttrace->entry_time;
1616
1617         update_stats(stats, duration);
1618 }
1619
1620 static int trace__printf_interrupted_entry(struct trace *trace)
1621 {
1622         struct thread_trace *ttrace;
1623         size_t printed;
1624
1625         if (trace->failure_only || trace->current == NULL)
1626                 return 0;
1627
1628         ttrace = thread__priv(trace->current);
1629
1630         if (!ttrace->entry_pending)
1631                 return 0;
1632
1633         printed  = trace__fprintf_entry_head(trace, trace->current, 0, false, ttrace->entry_time, trace->output);
1634         printed += fprintf(trace->output, "%-70s) ...\n", ttrace->entry_str);
1635         ttrace->entry_pending = false;
1636
1637         return printed;
1638 }
1639
1640 static int trace__fprintf_sample(struct trace *trace, struct perf_evsel *evsel,
1641                                  struct perf_sample *sample, struct thread *thread)
1642 {
1643         int printed = 0;
1644
1645         if (trace->print_sample) {
1646                 double ts = (double)sample->time / NSEC_PER_MSEC;
1647
1648                 printed += fprintf(trace->output, "%22s %10.3f %s %d/%d [%d]\n",
1649                                    perf_evsel__name(evsel), ts,
1650                                    thread__comm_str(thread),
1651                                    sample->pid, sample->tid, sample->cpu);
1652         }
1653
1654         return printed;
1655 }
1656
/*
 * raw_syscalls:sys_enter handler: format the syscall name and arguments
 * into the per-thread entry buffer.  For non-exit syscalls the line is
 * held pending until the matching sys_exit (or until interrupted by
 * another event); exit-like syscalls are printed immediately since no
 * sys_exit will follow.  Returns 0 on success, -1 on failure.
 */
static int trace__sys_enter(struct trace *trace, struct perf_evsel *evsel,
			    union perf_event *event __maybe_unused,
			    struct perf_sample *sample)
{
	char *msg;
	void *args;
	size_t printed = 0;
	struct thread *thread;
	int id = perf_evsel__sc_tp_uint(evsel, id, sample), err = -1;
	struct syscall *sc = trace__syscall_info(trace, evsel, id);
	struct thread_trace *ttrace;

	if (sc == NULL)
		return -1;

	thread = machine__findnew_thread(trace->host, sample->pid, sample->tid);
	ttrace = thread__trace(thread, trace->output);
	if (ttrace == NULL)
		goto out_put;

	trace__fprintf_sample(trace, evsel, sample, thread);

	/* Raw pointer to the tracepoint's args payload inside the sample. */
	args = perf_evsel__sc_tp_ptr(evsel, args, sample);

	/* Lazily allocate the per-thread formatting buffer. */
	if (ttrace->entry_str == NULL) {
		ttrace->entry_str = malloc(trace__entry_str_size);
		if (!ttrace->entry_str)
			goto out_put;
	}

	/* Flush any still-pending entry from another thread first. */
	if (!(trace->duration_filter || trace->summary_only || trace->min_stack))
		trace__printf_interrupted_entry(trace);

	ttrace->entry_time = sample->time;
	msg = ttrace->entry_str;
	printed += scnprintf(msg + printed, trace__entry_str_size - printed, "%s(", sc->name);

	printed += syscall__scnprintf_args(sc, msg + printed, trace__entry_str_size - printed,
					   args, trace, thread);

	if (sc->is_exit) {
		/* e.g. exit_group(): no sys_exit will arrive, print right away. */
		if (!(trace->duration_filter || trace->summary_only || trace->failure_only || trace->min_stack)) {
			trace__fprintf_entry_head(trace, thread, 0, false, ttrace->entry_time, trace->output);
			fprintf(trace->output, "%-70s)\n", ttrace->entry_str);
		}
	} else {
		ttrace->entry_pending = true;
		/* See trace__vfs_getname & trace__sys_exit */
		ttrace->filename.pending_open = false;
	}

	/* Remember the most recent thread for interrupted-entry flushing. */
	if (trace->current != thread) {
		thread__put(trace->current);
		trace->current = thread__get(thread);
	}
	err = 0;
out_put:
	thread__put(thread);
	return err;
}
1717
/*
 * Print a beautified argument list for a sys_enter-style sample directly
 * to trace->output (used for the augmented-syscalls/BPF path, where no
 * matching sys_exit completion is buffered).  Returns 0 on success, -1
 * when the syscall or thread state can't be resolved.
 */
static int trace__fprintf_sys_enter(struct trace *trace, struct perf_evsel *evsel,
				    struct perf_sample *sample)
{
	struct thread_trace *ttrace;
	struct thread *thread;
	int id = perf_evsel__sc_tp_uint(evsel, id, sample), err = -1;
	struct syscall *sc = trace__syscall_info(trace, evsel, id);
	char msg[1024];
	void *args;

	if (sc == NULL)
		return -1;

	thread = machine__findnew_thread(trace->host, sample->pid, sample->tid);
	ttrace = thread__trace(thread, trace->output);
	/*
	 * We need to get ttrace just to make sure it is there when syscall__scnprintf_args()
	 * and the rest of the beautifiers accessing it via struct syscall_arg touches it.
	 */
	if (ttrace == NULL)
		goto out_put;

	args = perf_evsel__sc_tp_ptr(evsel, args, sample);
	syscall__scnprintf_args(sc, msg, sizeof(msg), args, trace, thread);
	fprintf(trace->output, "%s", msg);
	err = 0;
out_put:
	thread__put(thread);
	return err;
}
1748
1749 static int trace__resolve_callchain(struct trace *trace, struct perf_evsel *evsel,
1750                                     struct perf_sample *sample,
1751                                     struct callchain_cursor *cursor)
1752 {
1753         struct addr_location al;
1754         int max_stack = evsel->attr.sample_max_stack ?
1755                         evsel->attr.sample_max_stack :
1756                         trace->max_stack;
1757
1758         if (machine__resolve(trace->host, &al, sample) < 0 ||
1759             thread__resolve_callchain(al.thread, cursor, evsel, sample, NULL, NULL, max_stack))
1760                 return -1;
1761
1762         return 0;
1763 }
1764
1765 static int trace__fprintf_callchain(struct trace *trace, struct perf_sample *sample)
1766 {
1767         /* TODO: user-configurable print_opts */
1768         const unsigned int print_opts = EVSEL__PRINT_SYM |
1769                                         EVSEL__PRINT_DSO |
1770                                         EVSEL__PRINT_UNKNOWN_AS_ADDR;
1771
1772         return sample__fprintf_callchain(sample, 38, print_opts, &callchain_cursor, trace->output);
1773 }
1774
/*
 * Translate an errno value into its symbolic name (e.g. "ENOENT") using
 * the errno table for the architecture the event was recorded on.
 */
static const char *errno_to_name(struct perf_evsel *evsel, int err)
{
	return arch_syscalls__strerrno(perf_env__arch(perf_evsel__env(evsel)), err);
}
1782
/*
 * raw_syscalls:sys_exit handler: complete the line buffered by
 * trace__sys_enter() (or print a "continued" marker if the entry was
 * already flushed), beautify the return value, update summary stats and
 * optionally print the callchain.  Returns 0 on success, -1 on failure.
 */
static int trace__sys_exit(struct trace *trace, struct perf_evsel *evsel,
			   union perf_event *event __maybe_unused,
			   struct perf_sample *sample)
{
	long ret;
	u64 duration = 0;
	bool duration_calculated = false;
	struct thread *thread;
	int id = perf_evsel__sc_tp_uint(evsel, id, sample), err = -1, callchain_ret = 0;
	struct syscall *sc = trace__syscall_info(trace, evsel, id);
	struct thread_trace *ttrace;

	if (sc == NULL)
		return -1;

	thread = machine__findnew_thread(trace->host, sample->pid, sample->tid);
	ttrace = thread__trace(thread, trace->output);
	if (ttrace == NULL)
		goto out_put;

	trace__fprintf_sample(trace, evsel, sample, thread);

	if (trace->summary)
		thread__update_stats(ttrace, id, sample);

	ret = perf_evsel__sc_tp_uint(evsel, ret, sample);

	/* Successful open-like syscall: bind the fd to the getname'd path. */
	if (sc->is_open && ret >= 0 && ttrace->filename.pending_open) {
		trace__set_fd_pathname(thread, ret, ttrace->filename.name);
		ttrace->filename.pending_open = false;
		++trace->stats.vfs_getname;
	}

	if (ttrace->entry_time) {
		duration = sample->time - ttrace->entry_time;
		if (trace__filter_duration(trace, duration))
			goto out;
		duration_calculated = true;
	} else if (trace->duration_filter)
		goto out; /* no entry seen, can't apply the duration filter */

	if (sample->callchain) {
		callchain_ret = trace__resolve_callchain(trace, evsel, sample, &callchain_cursor);
		if (callchain_ret == 0) {
			/* Skip samples whose stack is shallower than --min-stack. */
			if (callchain_cursor.nr < trace->min_stack)
				goto out;
			callchain_ret = 1;
		}
	}

	if (trace->summary_only || (ret >= 0 && trace->failure_only))
		goto out;

	trace__fprintf_entry_head(trace, thread, duration, duration_calculated, ttrace->entry_time, trace->output);

	if (ttrace->entry_pending) {
		fprintf(trace->output, "%-70s", ttrace->entry_str);
	} else {
		/* Entry was flushed earlier (interrupted); mark continuation. */
		fprintf(trace->output, " ... [");
		color_fprintf(trace->output, PERF_COLOR_YELLOW, "continued");
		fprintf(trace->output, "]: %s()", sc->name);
	}

	/*
	 * Return value beautification: no fmt -> plain signed; negative ->
	 * errno name + message; then per-syscall special cases.  Note the
	 * errno_print label is *inside* the else-if chain, entered via goto
	 * from the no-fmt branch above.
	 */
	if (sc->fmt == NULL) {
		if (ret < 0)
			goto errno_print;
signed_print:
		fprintf(trace->output, ") = %ld", ret);
	} else if (ret < 0) {
errno_print: {
		char bf[STRERR_BUFSIZE];
		const char *emsg = str_error_r(-ret, bf, sizeof(bf)),
			   *e = errno_to_name(evsel, -ret);

		fprintf(trace->output, ") = -1 %s %s", e, emsg);
	}
	} else if (ret == 0 && sc->fmt->timeout)
		fprintf(trace->output, ") = 0 Timeout");
	else if (ttrace->ret_scnprintf) {
		/* One-shot return formatter installed by an arg beautifier. */
		char bf[1024];
		struct syscall_arg arg = {
			.val	= ret,
			.thread	= thread,
			.trace	= trace,
		};
		ttrace->ret_scnprintf(bf, sizeof(bf), &arg);
		ttrace->ret_scnprintf = NULL;
		fprintf(trace->output, ") = %s", bf);
	} else if (sc->fmt->hexret)
		fprintf(trace->output, ") = %#lx", ret);
	else if (sc->fmt->errpid) {
		/* Return value is a pid (e.g. wait4/clone): show its comm. */
		struct thread *child = machine__find_thread(trace->host, ret, ret);

		if (child != NULL) {
			fprintf(trace->output, ") = %ld", ret);
			if (child->comm_set)
				fprintf(trace->output, " (%s)", thread__comm_str(child));
			thread__put(child);
		}
	} else
		goto signed_print;

	fputc('\n', trace->output);

	if (callchain_ret > 0)
		trace__fprintf_callchain(trace, sample);
	else if (callchain_ret < 0)
		pr_err("Problem processing %s callchain, skipping...\n", perf_evsel__name(evsel));
out:
	ttrace->entry_pending = false;
	err = 0;
out_put:
	thread__put(thread);
	return err;
}
1898
/*
 * probe:vfs_getname handler: capture the pathname the kernel resolved for
 * an in-flight open-like syscall.  The path is stored for later fd->path
 * bookkeeping and, if the sys_enter beautifier left a placeholder
 * (filename.ptr/entry_str_pos), spliced into the pending entry string.
 */
static int trace__vfs_getname(struct trace *trace, struct perf_evsel *evsel,
			      union perf_event *event __maybe_unused,
			      struct perf_sample *sample)
{
	struct thread *thread = machine__findnew_thread(trace->host, sample->pid, sample->tid);
	struct thread_trace *ttrace;
	size_t filename_len, entry_str_len, to_move;
	ssize_t remaining_space;
	char *pos;
	const char *filename = perf_evsel__rawptr(evsel, sample, "pathname");

	if (!thread)
		goto out;

	ttrace = thread__priv(thread);
	if (!ttrace)
		goto out_put;

	filename_len = strlen(filename);
	if (filename_len == 0)
		goto out_put;

	/* Grow the per-thread copy buffer if this path is the longest yet. */
	if (ttrace->filename.namelen < filename_len) {
		char *f = realloc(ttrace->filename.name, filename_len + 1);

		if (f == NULL)
			goto out_put;

		ttrace->filename.namelen = filename_len;
		ttrace->filename.name = f;
	}

	strcpy(ttrace->filename.name, filename);
	ttrace->filename.pending_open = true;

	/* No placeholder in the pending entry string: nothing to splice. */
	if (!ttrace->filename.ptr)
		goto out_put;

	entry_str_len = strlen(ttrace->entry_str);
	remaining_space = trace__entry_str_size - entry_str_len - 1; /* \0 */
	if (remaining_space <= 0)
		goto out_put;

	/* Too long to fit: keep the tail of the path (most informative part). */
	if (filename_len > (size_t)remaining_space) {
		filename += filename_len - remaining_space;
		filename_len = remaining_space;
	}

	/* Open a gap at the placeholder position and insert the pathname. */
	to_move = entry_str_len - ttrace->filename.entry_str_pos + 1; /* \0 */
	pos = ttrace->entry_str + ttrace->filename.entry_str_pos;
	memmove(pos + filename_len, pos, to_move);
	memcpy(pos, filename, filename_len);

	/* Placeholder consumed. */
	ttrace->filename.ptr = 0;
	ttrace->filename.entry_str_pos = 0;
out_put:
	thread__put(thread);
out:
	return 0;
}
1959
/*
 * sched:sched_stat_runtime handler: accumulate per-thread and global
 * on-CPU runtime (in ms) for the summary.  When per-thread state can't
 * be obtained, dump the raw tracepoint fields instead.
 */
static int trace__sched_stat_runtime(struct trace *trace, struct perf_evsel *evsel,
				     union perf_event *event __maybe_unused,
				     struct perf_sample *sample)
{
	u64 runtime = perf_evsel__intval(evsel, sample, "runtime");
	double runtime_ms = (double)runtime / NSEC_PER_MSEC;
	struct thread *thread = machine__findnew_thread(trace->host,
							sample->pid,
							sample->tid);
	struct thread_trace *ttrace = thread__trace(thread, trace->output);

	if (ttrace == NULL)
		goto out_dump;

	ttrace->runtime_ms += runtime_ms;
	trace->runtime_ms += runtime_ms;
out_put:
	thread__put(thread);
	return 0;

out_dump:
	/* NOTE(review): format string has a trailing ')' with no opening '(' —
	 * looks like a cosmetic leftover; left untouched to preserve output. */
	fprintf(trace->output, "%s: comm=%s,pid=%u,runtime=%" PRIu64 ",vruntime=%" PRIu64 ")\n",
	       evsel->name,
	       perf_evsel__strval(evsel, sample, "comm"),
	       (pid_t)perf_evsel__intval(evsel, sample, "pid"),
	       runtime,
	       perf_evsel__intval(evsel, sample, "vruntime"));
	goto out_put;
}
1989
1990 static int bpf_output__printer(enum binary_printer_ops op,
1991                                unsigned int val, void *extra __maybe_unused, FILE *fp)
1992 {
1993         unsigned char ch = (unsigned char)val;
1994
1995         switch (op) {
1996         case BINARY_PRINT_CHAR_DATA:
1997                 return fprintf(fp, "%c", isprint(ch) ? ch : '.');
1998         case BINARY_PRINT_DATA_BEGIN:
1999         case BINARY_PRINT_LINE_BEGIN:
2000         case BINARY_PRINT_ADDR:
2001         case BINARY_PRINT_NUM_DATA:
2002         case BINARY_PRINT_NUM_PAD:
2003         case BINARY_PRINT_SEP:
2004         case BINARY_PRINT_CHAR_PAD:
2005         case BINARY_PRINT_LINE_END:
2006         case BINARY_PRINT_DATA_END:
2007         default:
2008                 break;
2009         }
2010
2011         return 0;
2012 }
2013
/* Dump a BPF output event's raw payload, 8 bytes per line, through
 * bpf_output__printer() (printable chars only) to trace->output. */
static void bpf_output__fprintf(struct trace *trace,
				struct perf_sample *sample)
{
	binary__fprintf(sample->raw_data, sample->raw_size, 8,
			bpf_output__printer, NULL, trace->output);
}
2020
/*
 * Generic handler for non-syscall events (--event): print a timestamped
 * line, dispatching BPF-output payloads and tracepoints to the proper
 * formatter, with optional callchain printing.
 */
static int trace__event_handler(struct trace *trace, struct perf_evsel *evsel,
				union perf_event *event __maybe_unused,
				struct perf_sample *sample)
{
	int callchain_ret = 0;

	if (sample->callchain) {
		callchain_ret = trace__resolve_callchain(trace, evsel, sample, &callchain_cursor);
		if (callchain_ret == 0) {
			/* Honor --min-stack: drop samples with shallow stacks. */
			if (callchain_cursor.nr < trace->min_stack)
				goto out;
			callchain_ret = 1;
		}
	}

	/* Flush any half-printed syscall entry before our line. */
	trace__printf_interrupted_entry(trace);
	trace__fprintf_tstamp(trace, sample->time, trace->output);

	/* Column placeholder to align with the syscall duration field. */
	if (trace->trace_syscalls)
		fprintf(trace->output, "(         ): ");

	fprintf(trace->output, "%s:", evsel->name);

	if (perf_evsel__is_bpf_output(evsel)) {
		/* Augmented syscalls carry a sys_enter payload; others are raw dumps. */
		if (evsel == trace->syscalls.events.augmented)
			trace__fprintf_sys_enter(trace, evsel, sample);
		else
			bpf_output__fprintf(trace, sample);
	} else if (evsel->tp_format) {
		/*
		 * Try the syscall-args beautifier for sys_enter_* tracepoints;
		 * fall back to libtraceevent formatting when the name doesn't
		 * match or the beautifier fails.
		 */
		if (strncmp(evsel->tp_format->name, "sys_enter_", 10) ||
		    trace__fprintf_sys_enter(trace, evsel, sample)) {
			event_format__fprintf(evsel->tp_format, sample->cpu,
					      sample->raw_data, sample->raw_size,
					      trace->output);
		}
	}

	fprintf(trace->output, "\n");

	if (callchain_ret > 0)
		trace__fprintf_callchain(trace, sample);
	else if (callchain_ret < 0)
		pr_err("Problem processing %s callchain, skipping...\n", perf_evsel__name(evsel));
out:
	return 0;
}
2067
2068 static void print_location(FILE *f, struct perf_sample *sample,
2069                            struct addr_location *al,
2070                            bool print_dso, bool print_sym)
2071 {
2072
2073         if ((verbose > 0 || print_dso) && al->map)
2074                 fprintf(f, "%s@", al->map->dso->long_name);
2075
2076         if ((verbose > 0 || print_sym) && al->sym)
2077                 fprintf(f, "%s+0x%" PRIx64, al->sym->name,
2078                         al->addr - al->sym->start);
2079         else if (al->map)
2080                 fprintf(f, "0x%" PRIx64, al->addr);
2081         else
2082                 fprintf(f, "0x%" PRIx64, sample->addr);
2083 }
2084
/*
 * Page-fault software event handler: count per-thread major/minor faults
 * and, unless --summary-only, print "maj/minfault [ip] => (target)" with
 * both locations resolved, plus an optional callchain.
 * Returns 0 on success, -1 on failure.
 */
static int trace__pgfault(struct trace *trace,
			  struct perf_evsel *evsel,
			  union perf_event *event __maybe_unused,
			  struct perf_sample *sample)
{
	struct thread *thread;
	struct addr_location al;
	char map_type = 'd'; /* data map by default; 'x' executable, '?' unknown */
	struct thread_trace *ttrace;
	int err = -1;
	int callchain_ret = 0;

	thread = machine__findnew_thread(trace->host, sample->pid, sample->tid);

	if (sample->callchain) {
		callchain_ret = trace__resolve_callchain(trace, evsel, sample, &callchain_cursor);
		if (callchain_ret == 0) {
			/* Honor --min-stack. */
			if (callchain_cursor.nr < trace->min_stack)
				goto out_put;
			callchain_ret = 1;
		}
	}

	ttrace = thread__trace(thread, trace->output);
	if (ttrace == NULL)
		goto out_put;

	if (evsel->attr.config == PERF_COUNT_SW_PAGE_FAULTS_MAJ)
		ttrace->pfmaj++;
	else
		ttrace->pfmin++;

	if (trace->summary_only)
		goto out;

	/* Where the faulting instruction is. */
	thread__find_symbol(thread, sample->cpumode, sample->ip, &al);

	trace__fprintf_entry_head(trace, thread, 0, true, sample->time, trace->output);

	fprintf(trace->output, "%sfault [",
		evsel->attr.config == PERF_COUNT_SW_PAGE_FAULTS_MAJ ?
		"maj" : "min");

	print_location(trace->output, sample, &al, false, true);

	fprintf(trace->output, "] => ");

	/* Where the faulting data access landed. */
	thread__find_symbol(thread, sample->cpumode, sample->addr, &al);

	if (!al.map) {
		/*
		 * NOTE(review): this repeats the identical lookup above —
		 * presumably a leftover from when data and function maps were
		 * searched separately; confirm before simplifying.
		 */
		thread__find_symbol(thread, sample->cpumode, sample->addr, &al);

		if (al.map)
			map_type = 'x';
		else
			map_type = '?';
	}

	print_location(trace->output, sample, &al, true, false);

	fprintf(trace->output, " (%c%c)\n", map_type, al.level);

	if (callchain_ret > 0)
		trace__fprintf_callchain(trace, sample);
	else if (callchain_ret < 0)
		pr_err("Problem processing %s callchain, skipping...\n", perf_evsel__name(evsel));
out:
	err = 0;
out_put:
	thread__put(thread);
	return err;
}
2157
2158 static void trace__set_base_time(struct trace *trace,
2159                                  struct perf_evsel *evsel,
2160                                  struct perf_sample *sample)
2161 {
2162         /*
2163          * BPF events were not setting PERF_SAMPLE_TIME, so be more robust
2164          * and don't use sample->time unconditionally, we may end up having
2165          * some other event in the future without PERF_SAMPLE_TIME for good
2166          * reason, i.e. we may not be interested in its timestamps, just in
2167          * it taking place, picking some piece of information when it
2168          * appears in our event stream (vfs_getname comes to mind).
2169          */
2170         if (trace->base_time == 0 && !trace->full_time &&
2171             (evsel->attr.sample_type & PERF_SAMPLE_TIME))
2172                 trace->base_time = sample->time;
2173 }
2174
2175 static int trace__process_sample(struct perf_tool *tool,
2176                                  union perf_event *event,
2177                                  struct perf_sample *sample,
2178                                  struct perf_evsel *evsel,
2179                                  struct machine *machine __maybe_unused)
2180 {
2181         struct trace *trace = container_of(tool, struct trace, tool);
2182         struct thread *thread;
2183         int err = 0;
2184
2185         tracepoint_handler handler = evsel->handler;
2186
2187         thread = machine__findnew_thread(trace->host, sample->pid, sample->tid);
2188         if (thread && thread__is_filtered(thread))
2189                 goto out;
2190
2191         trace__set_base_time(trace, evsel, sample);
2192
2193         if (handler) {
2194                 ++trace->nr_events;
2195                 handler(trace, evsel, event, sample);
2196         }
2197 out:
2198         thread__put(thread);
2199         return err;
2200 }
2201
2202 static int trace__record(struct trace *trace, int argc, const char **argv)
2203 {
2204         unsigned int rec_argc, i, j;
2205         const char **rec_argv;
2206         const char * const record_args[] = {
2207                 "record",
2208                 "-R",
2209                 "-m", "1024",
2210                 "-c", "1",
2211         };
2212
2213         const char * const sc_args[] = { "-e", };
2214         unsigned int sc_args_nr = ARRAY_SIZE(sc_args);
2215         const char * const majpf_args[] = { "-e", "major-faults" };
2216         unsigned int majpf_args_nr = ARRAY_SIZE(majpf_args);
2217         const char * const minpf_args[] = { "-e", "minor-faults" };
2218         unsigned int minpf_args_nr = ARRAY_SIZE(minpf_args);
2219
2220         /* +1 is for the event string below */
2221         rec_argc = ARRAY_SIZE(record_args) + sc_args_nr + 1 +
2222                 majpf_args_nr + minpf_args_nr + argc;
2223         rec_argv = calloc(rec_argc + 1, sizeof(char *));
2224
2225         if (rec_argv == NULL)
2226                 return -ENOMEM;
2227
2228         j = 0;
2229         for (i = 0; i < ARRAY_SIZE(record_args); i++)
2230                 rec_argv[j++] = record_args[i];
2231
2232         if (trace->trace_syscalls) {
2233                 for (i = 0; i < sc_args_nr; i++)
2234                         rec_argv[j++] = sc_args[i];
2235
2236                 /* event string may be different for older kernels - e.g., RHEL6 */
2237                 if (is_valid_tracepoint("raw_syscalls:sys_enter"))
2238                         rec_argv[j++] = "raw_syscalls:sys_enter,raw_syscalls:sys_exit";
2239                 else if (is_valid_tracepoint("syscalls:sys_enter"))
2240                         rec_argv[j++] = "syscalls:sys_enter,syscalls:sys_exit";
2241                 else {
2242                         pr_err("Neither raw_syscalls nor syscalls events exist.\n");
2243                         free(rec_argv);
2244                         return -1;
2245                 }
2246         }
2247
2248         if (trace->trace_pgfaults & TRACE_PFMAJ)
2249                 for (i = 0; i < majpf_args_nr; i++)
2250                         rec_argv[j++] = majpf_args[i];
2251
2252         if (trace->trace_pgfaults & TRACE_PFMIN)
2253                 for (i = 0; i < minpf_args_nr; i++)
2254                         rec_argv[j++] = minpf_args[i];
2255
2256         for (i = 0; i < (unsigned int)argc; i++)
2257                 rec_argv[j++] = argv[i];
2258
2259         return cmd_record(j, rec_argv);
2260 }
2261
2262 static size_t trace__fprintf_thread_summary(struct trace *trace, FILE *fp);
2263
2264 static bool perf_evlist__add_vfs_getname(struct perf_evlist *evlist)
2265 {
2266         bool found = false;
2267         struct perf_evsel *evsel, *tmp;
2268         struct parse_events_error err = { .idx = 0, };
2269         int ret = parse_events(evlist, "probe:vfs_getname*", &err);
2270
2271         if (ret)
2272                 return false;
2273
2274         evlist__for_each_entry_safe(evlist, evsel, tmp) {
2275                 if (!strstarts(perf_evsel__name(evsel), "probe:vfs_getname"))
2276                         continue;
2277
2278                 if (perf_evsel__field(evsel, "pathname")) {
2279                         evsel->handler = trace__vfs_getname;
2280                         found = true;
2281                         continue;
2282                 }
2283
2284                 list_del_init(&evsel->node);
2285                 evsel->evlist = NULL;
2286                 perf_evsel__delete(evsel);
2287         }
2288
2289         return found;
2290 }
2291
2292 static struct perf_evsel *perf_evsel__new_pgfault(u64 config)
2293 {
2294         struct perf_evsel *evsel;
2295         struct perf_event_attr attr = {
2296                 .type = PERF_TYPE_SOFTWARE,
2297                 .mmap_data = 1,
2298         };
2299
2300         attr.config = config;
2301         attr.sample_period = 1;
2302
2303         event_attr_init(&attr);
2304
2305         evsel = perf_evsel__new(&attr);
2306         if (evsel)
2307                 evsel->handler = trace__pgfault;
2308
2309         return evsel;
2310 }
2311
2312 static void trace__handle_event(struct trace *trace, union perf_event *event, struct perf_sample *sample)
2313 {
2314         const u32 type = event->header.type;
2315         struct perf_evsel *evsel;
2316
2317         if (type != PERF_RECORD_SAMPLE) {
2318                 trace__process_event(trace, trace->host, event, sample);
2319                 return;
2320         }
2321
2322         evsel = perf_evlist__id2evsel(trace->evlist, sample->id);
2323         if (evsel == NULL) {
2324                 fprintf(trace->output, "Unknown tp ID %" PRIu64 ", skipping...\n", sample->id);
2325                 return;
2326         }
2327
2328         trace__set_base_time(trace, evsel, sample);
2329
2330         if (evsel->attr.type == PERF_TYPE_TRACEPOINT &&
2331             sample->raw_data == NULL) {
2332                 fprintf(trace->output, "%s sample with no payload for tid: %d, cpu %d, raw_size=%d, skipping...\n",
2333                        perf_evsel__name(evsel), sample->tid,
2334                        sample->cpu, sample->raw_size);
2335         } else {
2336                 tracepoint_handler handler = evsel->handler;
2337                 handler(trace, evsel, event, sample);
2338         }
2339 }
2340
/*
 * Create and add the raw_syscalls:sys_enter/sys_exit tracepoint evsels
 * to the evlist, wiring their handlers, tracepoint field accessors and
 * callchain configuration.  Returns 0 on success, -1 on failure (with
 * any partially-created evsels deleted).
 */
static int trace__add_syscall_newtp(struct trace *trace)
{
	int ret = -1;
	struct perf_evlist *evlist = trace->evlist;
	struct perf_evsel *sys_enter, *sys_exit;

	sys_enter = perf_evsel__raw_syscall_newtp("sys_enter", trace__sys_enter);
	if (sys_enter == NULL)
		goto out;

	/* Cache the offset of the "args" payload field for fast access. */
	if (perf_evsel__init_sc_tp_ptr_field(sys_enter, args))
		goto out_delete_sys_enter;

	sys_exit = perf_evsel__raw_syscall_newtp("sys_exit", trace__sys_exit);
	if (sys_exit == NULL)
		goto out_delete_sys_enter;

	/* Likewise for the "ret" field of sys_exit. */
	if (perf_evsel__init_sc_tp_uint_field(sys_exit, ret))
		goto out_delete_sys_exit;

	perf_evsel__config_callchain(sys_enter, &trace->opts, &callchain_param);
	perf_evsel__config_callchain(sys_exit, &trace->opts, &callchain_param);

	perf_evlist__add(evlist, sys_enter);
	perf_evlist__add(evlist, sys_exit);

	if (callchain_param.enabled && !trace->kernel_syscallchains) {
		/*
		 * We're interested only in the user space callchain
		 * leading to the syscall, allow overriding that for
		 * debugging reasons using --kernel_syscall_callchains
		 */
		sys_exit->attr.exclude_callchain_kernel = 1;
	}

	trace->syscalls.events.sys_enter = sys_enter;
	trace->syscalls.events.sys_exit  = sys_exit;

	ret = 0;
out:
	return ret;

out_delete_sys_exit:
	perf_evsel__delete_priv(sys_exit);
out_delete_sys_enter:
	perf_evsel__delete_priv(sys_enter);
	goto out;
}
2389
2390 static int trace__set_ev_qualifier_filter(struct trace *trace)
2391 {
2392         int err = -1;
2393         struct perf_evsel *sys_exit;
2394         char *filter = asprintf_expr_inout_ints("id", !trace->not_ev_qualifier,
2395                                                 trace->ev_qualifier_ids.nr,
2396                                                 trace->ev_qualifier_ids.entries);
2397
2398         if (filter == NULL)
2399                 goto out_enomem;
2400
2401         if (!perf_evsel__append_tp_filter(trace->syscalls.events.sys_enter,
2402                                           filter)) {
2403                 sys_exit = trace->syscalls.events.sys_exit;
2404                 err = perf_evsel__append_tp_filter(sys_exit, filter);
2405         }
2406
2407         free(filter);
2408 out:
2409         return err;
2410 out_enomem:
2411         errno = ENOMEM;
2412         goto out;
2413 }
2414
2415 static int trace__set_filter_loop_pids(struct trace *trace)
2416 {
2417         unsigned int nr = 1;
2418         pid_t pids[32] = {
2419                 getpid(),
2420         };
2421         struct thread *thread = machine__find_thread(trace->host, pids[0], pids[0]);
2422
2423         while (thread && nr < ARRAY_SIZE(pids)) {
2424                 struct thread *parent = machine__find_thread(trace->host, thread->ppid, thread->ppid);
2425
2426                 if (parent == NULL)
2427                         break;
2428
2429                 if (!strcmp(thread__comm_str(parent), "sshd")) {
2430                         pids[nr++] = parent->tid;
2431                         break;
2432                 }
2433                 thread = parent;
2434         }
2435
2436         return perf_evlist__set_filter_pids(trace->evlist, nr, pids);
2437 }
2438
/*
 * Live tracing mode: add the requested events (raw syscalls, vfs_getname,
 * page faults, sched_stat_runtime) to the evlist, create maps, open/mmap the
 * events, optionally fork+start a workload, then loop consuming the ring
 * buffers until interrupted or drained, printing a per-thread summary at the
 * end if requested.  Returns 0 or a negative error, writing a diagnostic to
 * trace->output on the error paths.
 */
static int trace__run(struct trace *trace, int argc, const char **argv)
{
	struct perf_evlist *evlist = trace->evlist;
	struct perf_evsel *evsel, *pgfault_maj = NULL, *pgfault_min = NULL;
	int err = -1, i;
	unsigned long before;
	const bool forks = argc > 0;	/* leftover argv == workload to run */
	bool draining = false;

	trace->live = true;

	if (trace->trace_syscalls && trace__add_syscall_newtp(trace))
		goto out_error_raw_syscalls;

	if (trace->trace_syscalls)
		trace->vfs_getname = perf_evlist__add_vfs_getname(evlist);

	if ((trace->trace_pgfaults & TRACE_PFMAJ)) {
		pgfault_maj = perf_evsel__new_pgfault(PERF_COUNT_SW_PAGE_FAULTS_MAJ);
		if (pgfault_maj == NULL)
			goto out_error_mem;
		perf_evsel__config_callchain(pgfault_maj, &trace->opts, &callchain_param);
		perf_evlist__add(evlist, pgfault_maj);
	}

	if ((trace->trace_pgfaults & TRACE_PFMIN)) {
		pgfault_min = perf_evsel__new_pgfault(PERF_COUNT_SW_PAGE_FAULTS_MIN);
		if (pgfault_min == NULL)
			goto out_error_mem;
		perf_evsel__config_callchain(pgfault_min, &trace->opts, &callchain_param);
		perf_evlist__add(evlist, pgfault_min);
	}

	if (trace->sched &&
	    perf_evlist__add_newtp(evlist, "sched", "sched_stat_runtime",
				   trace__sched_stat_runtime))
		goto out_error_sched_stat_runtime;

	/*
	 * If a global cgroup was set, apply it to all the events without an
	 * explicit cgroup. I.e.:
	 *
	 * 	trace -G A -e sched:*switch
	 *
	 * Will set all raw_syscalls:sys_{enter,exit}, pgfault, vfs_getname, etc
	 * _and_ sched:sched_switch to the 'A' cgroup, while:
	 *
	 * trace -e sched:*switch -G A
	 *
	 * will only set the sched:sched_switch event to the 'A' cgroup, all the
	 * other events (raw_syscalls:sys_{enter,exit}, etc are left "without"
	 * a cgroup (on the root cgroup, sys wide, etc).
	 *
	 * Multiple cgroups:
	 *
	 * trace -G A -e sched:*switch -G B
	 *
	 * the syscall ones go to the 'A' cgroup, the sched:sched_switch goes
	 * to the 'B' cgroup.
	 *
	 * evlist__set_default_cgroup() grabs a reference of the passed cgroup
	 * only for the evsels still without a cgroup, i.e. evsel->cgroup == NULL.
	 */
	if (trace->cgroup)
		evlist__set_default_cgroup(trace->evlist, trace->cgroup);

	err = perf_evlist__create_maps(evlist, &trace->opts.target);
	if (err < 0) {
		fprintf(trace->output, "Problems parsing the target to trace, check your options!\n");
		goto out_delete_evlist;
	}

	err = trace__symbols_init(trace, evlist);
	if (err < 0) {
		fprintf(trace->output, "Problems initializing symbol libraries!\n");
		goto out_delete_evlist;
	}

	perf_evlist__config(evlist, &trace->opts, &callchain_param);

	signal(SIGCHLD, sig_handler);
	signal(SIGINT, sig_handler);

	if (forks) {
		err = perf_evlist__prepare_workload(evlist, &trace->opts.target,
						    argv, false, NULL);
		if (err < 0) {
			fprintf(trace->output, "Couldn't run the workload!\n");
			goto out_delete_evlist;
		}
	}

	err = perf_evlist__open(evlist);
	if (err < 0)
		goto out_error_open;

	err = bpf__apply_obj_config();
	if (err) {
		char errbuf[BUFSIZ];

		bpf__strerror_apply_obj_config(err, errbuf, sizeof(errbuf));
		pr_err("ERROR: Apply config to BPF failed: %s\n",
			 errbuf);
		goto out_error_open;
	}

	/*
	 * Better not use !target__has_task() here because we need to cover the
	 * case where no threads were specified in the command line, but a
	 * workload was, and in that case we will fill in the thread_map when
	 * we fork the workload in perf_evlist__prepare_workload.
	 */
	if (trace->filter_pids.nr > 0)
		err = perf_evlist__set_filter_pids(evlist, trace->filter_pids.nr, trace->filter_pids.entries);
	else if (thread_map__pid(evlist->threads, 0) == -1)
		err = trace__set_filter_loop_pids(trace);

	if (err < 0)
		goto out_error_mem;

	/* -e syscall list: install the "id in {...}" tracepoint filters */
	if (trace->ev_qualifier_ids.nr > 0) {
		err = trace__set_ev_qualifier_filter(trace);
		if (err < 0)
			goto out_errno;

		pr_debug("event qualifier tracepoint filter: %s\n",
			 trace->syscalls.events.sys_exit->filter);
	}

	err = perf_evlist__apply_filters(evlist, &evsel);
	if (err < 0)
		goto out_error_apply_filters;

	err = perf_evlist__mmap(evlist, trace->opts.mmap_pages);
	if (err < 0)
		goto out_error_mmap;

	if (!target__none(&trace->opts.target) && !trace->opts.initial_delay)
		perf_evlist__enable(evlist);

	if (forks)
		perf_evlist__start_workload(evlist);

	/* --delay: let the workload start before enabling the events */
	if (trace->opts.initial_delay) {
		usleep(trace->opts.initial_delay * 1000);
		perf_evlist__enable(evlist);
	}

	trace->multiple_threads = thread_map__pid(evlist->threads, 0) == -1 ||
				  evlist->threads->nr > 1 ||
				  perf_evlist__first(evlist)->attr.inherit;

	/*
	 * Now that we already used evsel->attr to ask the kernel to setup the
	 * events, lets reuse evsel->attr.sample_max_stack as the limit in
	 * trace__resolve_callchain(), allowing per-event max-stack settings
	 * to override an explicitly set --max-stack global setting.
	 */
	evlist__for_each_entry(evlist, evsel) {
		if (evsel__has_callchain(evsel) &&
		    evsel->attr.sample_max_stack == 0)
			evsel->attr.sample_max_stack = trace->max_stack;
	}
again:
	before = trace->nr_events;

	/* Drain every per-cpu/thread ring buffer once. */
	for (i = 0; i < evlist->nr_mmaps; i++) {
		union perf_event *event;
		struct perf_mmap *md;

		md = &evlist->mmap[i];
		if (perf_mmap__read_init(md) < 0)
			continue;

		while ((event = perf_mmap__read_event(md)) != NULL) {
			struct perf_sample sample;

			++trace->nr_events;

			err = perf_evlist__parse_sample(evlist, event, &sample);
			if (err) {
				fprintf(trace->output, "Can't parse sample, err = %d, skipping...\n", err);
				goto next_event;
			}

			trace__handle_event(trace, event, &sample);
next_event:
			perf_mmap__consume(md);

			if (interrupted)
				goto out_disable;

			/* On SIGCHLD/SIGINT, disable events but keep draining buffers */
			if (done && !draining) {
				perf_evlist__disable(evlist);
				draining = true;
			}
		}
		perf_mmap__read_done(md);
	}

	/* Nothing new this pass: poll (bounded if done), otherwise go again. */
	if (trace->nr_events == before) {
		int timeout = done ? 100 : -1;

		if (!draining && perf_evlist__poll(evlist, timeout) > 0) {
			if (perf_evlist__filter_pollfd(evlist, POLLERR | POLLHUP) == 0)
				draining = true;

			goto again;
		}
	} else {
		goto again;
	}

out_disable:
	thread__zput(trace->current);

	perf_evlist__disable(evlist);

	if (!err) {
		if (trace->summary)
			trace__fprintf_thread_summary(trace, trace->output);

		if (trace->show_tool_stats) {
			fprintf(trace->output, "Stats:\n "
					       " vfs_getname : %" PRIu64 "\n"
					       " proc_getname: %" PRIu64 "\n",
				trace->stats.vfs_getname,
				trace->stats.proc_getname);
		}
	}

out_delete_evlist:
	trace__symbols__exit(trace);

	perf_evlist__delete(evlist);
	cgroup__put(trace->cgroup);
	trace->evlist = NULL;
	trace->live = false;
	return err;
/*
 * The unnamed block below only exists to give the error labels inside it a
 * shared errbuf[]; it is never entered by falling through the return above,
 * only via the goto out_error_* statements earlier in the function.
 */
{
	char errbuf[BUFSIZ];

out_error_sched_stat_runtime:
	tracing_path__strerror_open_tp(errno, errbuf, sizeof(errbuf), "sched", "sched_stat_runtime");
	goto out_error;

out_error_raw_syscalls:
	tracing_path__strerror_open_tp(errno, errbuf, sizeof(errbuf), "raw_syscalls", "sys_(enter|exit)");
	goto out_error;

out_error_mmap:
	perf_evlist__strerror_mmap(evlist, errno, errbuf, sizeof(errbuf));
	goto out_error;

out_error_open:
	perf_evlist__strerror_open(evlist, errno, errbuf, sizeof(errbuf));
	/* fallthrough: out_error prints whatever the helpers formatted */

out_error:
	fprintf(trace->output, "%s\n", errbuf);
	goto out_delete_evlist;

out_error_apply_filters:
	fprintf(trace->output,
		"Failed to set filter \"%s\" on event %s with %d (%s)\n",
		evsel->filter, perf_evsel__name(evsel), errno,
		str_error_r(errno, errbuf, sizeof(errbuf)));
	goto out_delete_evlist;
}
/* These two labels don't need errbuf, hence they live outside the block. */
out_error_mem:
	fprintf(trace->output, "Not enough memory to run!\n");
	goto out_delete_evlist;

out_errno:
	fprintf(trace->output, "errno=%d,%s\n", errno, strerror(errno));
	goto out_delete_evlist;
}
2715
/*
 * Replay mode (perf trace -i perf.data): open a recorded session and process
 * its events through the same sys_enter/sys_exit/pgfault handlers used in
 * live mode, printing a per-thread summary at the end if requested.
 * Returns 0 on success or a negative error.
 */
static int trace__replay(struct trace *trace)
{
	const struct perf_evsel_str_handler handlers[] = {
		{ "probe:vfs_getname",	     trace__vfs_getname, },
	};
	struct perf_data data = {
		.file      = {
			.path = input_name,
		},
		.mode      = PERF_DATA_MODE_READ,
		.force     = trace->force,
	};
	struct perf_session *session;
	struct perf_evsel *evsel;
	int err = -1;

	trace->tool.sample	  = trace__process_sample;
	trace->tool.mmap	  = perf_event__process_mmap;
	trace->tool.mmap2	  = perf_event__process_mmap2;
	trace->tool.comm	  = perf_event__process_comm;
	trace->tool.exit	  = perf_event__process_exit;
	trace->tool.fork	  = perf_event__process_fork;
	trace->tool.attr	  = perf_event__process_attr;
	trace->tool.tracing_data  = perf_event__process_tracing_data;
	trace->tool.build_id	  = perf_event__process_build_id;
	trace->tool.namespaces	  = perf_event__process_namespaces;

	trace->tool.ordered_events = true;
	trace->tool.ordering_requires_timestamps = true;

	/* add tid to output */
	trace->multiple_threads = true;

	session = perf_session__new(&data, false, &trace->tool);
	if (session == NULL)
		return -1;

	if (trace->opts.target.pid)
		symbol_conf.pid_list_str = strdup(trace->opts.target.pid);

	if (trace->opts.target.tid)
		symbol_conf.tid_list_str = strdup(trace->opts.target.tid);

	if (symbol__init(&session->header.env) < 0)
		goto out;

	trace->host = &session->machines.host;

	err = perf_session__set_tracepoints_handlers(session, handlers);
	if (err)
		goto out;

	evsel = perf_evlist__find_tracepoint_by_name(session->evlist,
						     "raw_syscalls:sys_enter");
	/* older kernels have syscalls tp versus raw_syscalls */
	if (evsel == NULL)
		evsel = perf_evlist__find_tracepoint_by_name(session->evlist,
							     "syscalls:sys_enter");

	if (evsel &&
	    (perf_evsel__init_raw_syscall_tp(evsel, trace__sys_enter) < 0 ||
	    perf_evsel__init_sc_tp_ptr_field(evsel, args))) {
		pr_err("Error during initialize raw_syscalls:sys_enter event\n");
		goto out;
	}

	evsel = perf_evlist__find_tracepoint_by_name(session->evlist,
						     "raw_syscalls:sys_exit");
	if (evsel == NULL)
		evsel = perf_evlist__find_tracepoint_by_name(session->evlist,
							     "syscalls:sys_exit");
	if (evsel &&
	    (perf_evsel__init_raw_syscall_tp(evsel, trace__sys_exit) < 0 ||
	    perf_evsel__init_sc_tp_uint_field(evsel, ret))) {
		pr_err("Error during initialize raw_syscalls:sys_exit event\n");
		goto out;
	}

	/* Route recorded software page fault events to the pgfault handler. */
	evlist__for_each_entry(session->evlist, evsel) {
		if (evsel->attr.type == PERF_TYPE_SOFTWARE &&
		    (evsel->attr.config == PERF_COUNT_SW_PAGE_FAULTS_MAJ ||
		     evsel->attr.config == PERF_COUNT_SW_PAGE_FAULTS_MIN ||
		     evsel->attr.config == PERF_COUNT_SW_PAGE_FAULTS))
			evsel->handler = trace__pgfault;
	}

	setup_pager();

	err = perf_session__process_events(session);
	if (err)
		pr_err("Failed to process events, error %d", err);

	else if (trace->summary)
		trace__fprintf_thread_summary(trace, trace->output);

out:
	perf_session__delete(session);

	return err;
}
2816
/* Print the summary section banner, returning the number of chars printed. */
static size_t trace__fprintf_threads_header(FILE *fp)
{
	return fprintf(fp, "\n Summary of events:\n\n");
}
2825
/*
 * Resort helper for a thread's per-syscall statistics: the source tree is an
 * intlist keyed by syscall id with a struct stats in the priv area; entries
 * are ordered by the msecs field (total time = nr calls * average).  The
 * block below is the DEFINE_RESORT_RB per-node fill callback.
 */
DEFINE_RESORT_RB(syscall_stats, a->msecs > b->msecs,
	struct stats	*stats;
	double		msecs;
	int		syscall;
)
{
	struct int_node *source = rb_entry(nd, struct int_node, rb_node);
	struct stats *stats = source->priv;

	entry->syscall = source->i;
	entry->stats   = stats;
	/* total msecs spent = number of calls * average duration per call */
	entry->msecs   = stats ? (u64)stats->n * (avg_stats(stats) / NSEC_PER_MSEC) : 0;
}
2839
/*
 * Print one thread's per-syscall statistics table (calls, total/min/avg/max
 * time in msec and relative stddev), ordered via the syscall_stats resort
 * tree defined above.  Returns the number of characters printed, 0 if the
 * thread has no syscall stats.
 */
static size_t thread__dump_stats(struct thread_trace *ttrace,
				 struct trace *trace, FILE *fp)
{
	size_t printed = 0;
	struct syscall *sc;
	struct rb_node *nd;
	/* Builds the local, sorted syscall_stats tree from ttrace->syscall_stats */
	DECLARE_RESORT_RB_INTLIST(syscall_stats, ttrace->syscall_stats);

	if (syscall_stats == NULL)
		return 0;

	printed += fprintf(fp, "\n");

	printed += fprintf(fp, "   syscall            calls    total       min       avg       max      stddev\n");
	printed += fprintf(fp, "                               (msec)    (msec)    (msec)    (msec)        (%%)\n");
	printed += fprintf(fp, "   --------------- -------- --------- --------- --------- ---------     ------\n");

	resort_rb__for_each_entry(nd, syscall_stats) {
		struct stats *stats = syscall_stats_entry->stats;
		if (stats) {
			/* min/max/avg are kept in nsecs; convert to msecs for display */
			double min = (double)(stats->min) / NSEC_PER_MSEC;
			double max = (double)(stats->max) / NSEC_PER_MSEC;
			double avg = avg_stats(stats);
			double pct;
			u64 n = (u64) stats->n;

			/* relative standard deviation, in percent of the average */
			pct = avg ? 100.0 * stddev_stats(stats)/avg : 0.0;
			avg /= NSEC_PER_MSEC;

			sc = &trace->syscalls.table[syscall_stats_entry->syscall];
			printed += fprintf(fp, "   %-15s", sc->name);
			printed += fprintf(fp, " %8" PRIu64 " %9.3f %9.3f %9.3f",
					   n, syscall_stats_entry->msecs, min, avg);
			printed += fprintf(fp, " %9.3f %9.2f%%\n", max, pct);
		}
	}

	resort_rb__delete(syscall_stats);
	printed += fprintf(fp, "\n\n");

	return printed;
}
2882
2883 static size_t trace__fprintf_thread(FILE *fp, struct thread *thread, struct trace *trace)
2884 {
2885         size_t printed = 0;
2886         struct thread_trace *ttrace = thread__priv(thread);
2887         double ratio;
2888
2889         if (ttrace == NULL)
2890                 return 0;
2891
2892         ratio = (double)ttrace->nr_events / trace->nr_events * 100.0;
2893
2894         printed += fprintf(fp, " %s (%d), ", thread__comm_str(thread), thread->tid);
2895         printed += fprintf(fp, "%lu events, ", ttrace->nr_events);
2896         printed += fprintf(fp, "%.1f%%", ratio);
2897         if (ttrace->pfmaj)
2898                 printed += fprintf(fp, ", %lu majfaults", ttrace->pfmaj);
2899         if (ttrace->pfmin)
2900                 printed += fprintf(fp, ", %lu minfaults", ttrace->pfmin);
2901         if (trace->sched)
2902                 printed += fprintf(fp, ", %.3f msec\n", ttrace->runtime_ms);
2903         else if (fputc('\n', fp) != EOF)
2904                 ++printed;
2905
2906         printed += thread__dump_stats(ttrace, trace, fp);
2907
2908         return printed;
2909 }
2910
2911 static unsigned long thread__nr_events(struct thread_trace *ttrace)
2912 {
2913         return ttrace ? ttrace->nr_events : 0;
2914 }
2915
/*
 * Resort helper ordering threads by their event count (via the thread_trace
 * area hanging off thread->priv); used by trace__fprintf_thread_summary().
 * The block below is the DEFINE_RESORT_RB per-node fill callback.
 */
DEFINE_RESORT_RB(threads, (thread__nr_events(a->thread->priv) < thread__nr_events(b->thread->priv)),
	struct thread *thread;
)
{
	entry->thread = rb_entry(nd, struct thread, rb_node);
}
2922
/*
 * Print the end-of-run summary: header plus one block per traced thread,
 * walking all buckets of the host machine's thread table, each bucket
 * resorted by event count via the threads resort tree defined above.
 * Returns characters printed, or 0 if a resort allocation failed.
 */
static size_t trace__fprintf_thread_summary(struct trace *trace, FILE *fp)
{
	size_t printed = trace__fprintf_threads_header(fp);
	struct rb_node *nd;
	int i;

	for (i = 0; i < THREADS__TABLE_SIZE; i++) {
		/* Declares and fills the sorted 'threads' tree for bucket i */
		DECLARE_RESORT_RB_MACHINE_THREADS(threads, trace->host, i);

		if (threads == NULL) {
			fprintf(fp, "%s", "Error sorting output by nr_events!\n");
			return 0;
		}

		resort_rb__for_each_entry(nd, threads)
			printed += trace__fprintf_thread(fp, threads_entry->thread, trace);

		resort_rb__delete(threads);
	}
	return printed;
}
2944
2945 static int trace__set_duration(const struct option *opt, const char *str,
2946                                int unset __maybe_unused)
2947 {
2948         struct trace *trace = opt->value;
2949
2950         trace->duration_filter = atof(str);
2951         return 0;
2952 }
2953
2954 static int trace__set_filter_pids(const struct option *opt, const char *str,
2955                                   int unset __maybe_unused)
2956 {
2957         int ret = -1;
2958         size_t i;
2959         struct trace *trace = opt->value;
2960         /*
2961          * FIXME: introduce a intarray class, plain parse csv and create a
2962          * { int nr, int entries[] } struct...
2963          */
2964         struct intlist *list = intlist__new(str);
2965
2966         if (list == NULL)
2967                 return -1;
2968
2969         i = trace->filter_pids.nr = intlist__nr_entries(list) + 1;
2970         trace->filter_pids.entries = calloc(i, sizeof(pid_t));
2971
2972         if (trace->filter_pids.entries == NULL)
2973                 goto out;
2974
2975         trace->filter_pids.entries[0] = getpid();
2976
2977         for (i = 1; i < trace->filter_pids.nr; ++i)
2978                 trace->filter_pids.entries[i] = intlist__entry(list, i - 1)->i;
2979
2980         intlist__delete(list);
2981         ret = 0;
2982 out:
2983         return ret;
2984 }
2985
2986 static int trace__open_output(struct trace *trace, const char *filename)
2987 {
2988         struct stat st;
2989
2990         if (!stat(filename, &st) && st.st_size) {
2991                 char oldname[PATH_MAX];
2992
2993                 scnprintf(oldname, sizeof(oldname), "%s.old", filename);
2994                 unlink(oldname);
2995                 rename(filename, oldname);
2996         }
2997
2998         trace->output = fopen(filename, "w");
2999
3000         return trace->output == NULL ? -errno : 0;
3001 }
3002
3003 static int parse_pagefaults(const struct option *opt, const char *str,
3004                             int unset __maybe_unused)
3005 {
3006         int *trace_pgfaults = opt->value;
3007
3008         if (strcmp(str, "all") == 0)
3009                 *trace_pgfaults |= TRACE_PFMAJ | TRACE_PFMIN;
3010         else if (strcmp(str, "maj") == 0)
3011                 *trace_pgfaults |= TRACE_PFMAJ;
3012         else if (strcmp(str, "min") == 0)
3013                 *trace_pgfaults |= TRACE_PFMIN;
3014         else
3015                 return -1;
3016
3017         return 0;
3018 }
3019
3020 static void evlist__set_evsel_handler(struct perf_evlist *evlist, void *handler)
3021 {
3022         struct perf_evsel *evsel;
3023
3024         evlist__for_each_entry(evlist, evsel)
3025                 evsel->handler = handler;
3026 }
3027
/*
 * For "syscalls:sys_{enter,exit}_*" tracepoint evsels added via -e/--event,
 * initialize the syscall_tp private area and set up the args (enter) or ret
 * (exit) field accessors right after the syscall id field (hence the
 * id.offset + sizeof(u64) offsets).  Returns 0 on success, -1 on the first
 * evsel that fails to initialize.
 */
static int evlist__set_syscall_tp_fields(struct perf_evlist *evlist)
{
	struct perf_evsel *evsel;

	evlist__for_each_entry(evlist, evsel) {
		/* skip evsels already set up, or without tracepoint format info */
		if (evsel->priv || !evsel->tp_format)
			continue;

		if (strcmp(evsel->tp_format->system, "syscalls"))
			continue;

		if (perf_evsel__init_syscall_tp(evsel))
			return -1;

		if (!strncmp(evsel->tp_format->name, "sys_enter_", 10)) {
			struct syscall_tp *sc = evsel->priv;

			if (__tp_field__init_ptr(&sc->args, sc->id.offset + sizeof(u64)))
				return -1;
		} else if (!strncmp(evsel->tp_format->name, "sys_exit_", 9)) {
			struct syscall_tp *sc = evsel->priv;

			if (__tp_field__init_uint(&sc->ret, sizeof(u64), sc->id.offset + sizeof(u64), evsel->needs_swap))
				return -1;
		}
	}

	return 0;
}
3057
3058 /*
3059  * XXX: Hackish, just splitting the combined -e+--event (syscalls
3060  * (raw_syscalls:{sys_{enter,exit}} + events (tracepoints, HW, SW, etc) to use
3061  * existing facilities unchanged (trace->ev_qualifier + parse_options()).
3062  *
3063  * It'd be better to introduce a parse_options() variant that would return a
3064  * list with the terms it didn't match to an event...
3065  */
3066 static int trace__parse_events_option(const struct option *opt, const char *str,
3067                                       int unset __maybe_unused)
3068 {
3069         struct trace *trace = (struct trace *)opt->value;
3070         const char *s = str;
3071         char *sep = NULL, *lists[2] = { NULL, NULL, };
3072         int len = strlen(str) + 1, err = -1, list, idx;
3073         char *strace_groups_dir = system_path(STRACE_GROUPS_DIR);
3074         char group_name[PATH_MAX];
3075
3076         if (strace_groups_dir == NULL)
3077                 return -1;
3078
3079         if (*s == '!') {
3080                 ++s;
3081                 trace->not_ev_qualifier = true;
3082         }
3083
3084         while (1) {
3085                 if ((sep = strchr(s, ',')) != NULL)
3086                         *sep = '\0';
3087
3088                 list = 0;
3089                 if (syscalltbl__id(trace->sctbl, s) >= 0 ||
3090                     syscalltbl__strglobmatch_first(trace->sctbl, s, &idx) >= 0) {
3091                         list = 1;
3092                 } else {
3093                         path__join(group_name, sizeof(group_name), strace_groups_dir, s);
3094                         if (access(group_name, R_OK) == 0)
3095                                 list = 1;
3096                 }
3097
3098                 if (lists[list]) {
3099                         sprintf(lists[list] + strlen(lists[list]), ",%s", s);
3100                 } else {
3101                         lists[list] = malloc(len);
3102                         if (lists[list] == NULL)
3103                                 goto out;
3104                         strcpy(lists[list], s);
3105                 }
3106
3107                 if (!sep)
3108                         break;
3109
3110                 *sep = ',';
3111                 s = sep + 1;
3112         }
3113
3114         if (lists[1] != NULL) {
3115                 struct strlist_config slist_config = {
3116                         .dirname = strace_groups_dir,
3117                 };
3118
3119                 trace->ev_qualifier = strlist__new(lists[1], &slist_config);
3120                 if (trace->ev_qualifier == NULL) {
3121                         fputs("Not enough memory to parse event qualifier", trace->output);
3122                         goto out;
3123                 }
3124
3125                 if (trace__validate_ev_qualifier(trace))
3126                         goto out;
3127                 trace->trace_syscalls = true;
3128         }
3129
3130         err = 0;
3131
3132         if (lists[0]) {
3133                 struct option o = OPT_CALLBACK('e', "event", &trace->evlist, "event",
3134                                                "event selector. use 'perf list' to list available events",
3135                                                parse_events_option);
3136                 err = parse_events_option(&o, lists[0], 0);
3137         }
3138 out:
3139         if (sep)
3140                 *sep = ',';
3141
3142         return err;
3143 }
3144
3145 static int trace__parse_cgroups(const struct option *opt, const char *str, int unset)
3146 {
3147         struct trace *trace = opt->value;
3148
3149         if (!list_empty(&trace->evlist->entries))
3150                 return parse_cgroups(opt, str, unset);
3151
3152         trace->cgroup = evlist__findnew_cgroup(trace->evlist, str);
3153
3154         return 0;
3155 }
3156
/*
 * Entry point for 'perf trace': parses options, sets up the evlist and
 * (optional) BPF augmentation, then either records, replays a perf.data
 * file (-i), or runs live tracing of a workload/PID/system-wide target.
 */
int cmd_trace(int argc, const char **argv)
{
        const char *trace_usage[] = {
                "perf trace [<options>] [<command>]",
                "perf trace [<options>] -- <command> [<options>]",
                "perf trace record [<options>] [<command>]",
                "perf trace record [<options>] -- <command> [<options>]",
                NULL
        };
        /*
         * Defaults: UINT_MAX/ULLONG_MAX act as "unset" sentinels so that we
         * can tell below whether the user supplied a value (see the
         * mmap_pages/max_stack checks after option parsing).
         */
        struct trace trace = {
                .syscalls = {
                        . max = -1,
                },
                .opts = {
                        .target = {
                                .uid       = UINT_MAX,
                                .uses_mmap = true,
                        },
                        .user_freq     = UINT_MAX,
                        .user_interval = ULLONG_MAX,
                        .no_buffering  = true, /* strace-like immediate output */
                        .mmap_pages    = UINT_MAX,
                        .proc_map_timeout  = 500, /* ms, see --proc-map-timeout */
                },
                .output = stderr, /* may be replaced by -o below */
                .show_comm = true,
                .trace_syscalls = false, /* turned on below if no -e/--pf given */
                .kernel_syscallchains = false,
                .max_stack = UINT_MAX, /* "unset" sentinel */
        };
        const char *output_name = NULL;
        const struct option trace_options[] = {
        OPT_CALLBACK('e', "event", &trace, "event",
                     "event/syscall selector. use 'perf list' to list available events",
                     trace__parse_events_option),
        OPT_BOOLEAN(0, "comm", &trace.show_comm,
                    "show the thread COMM next to its id"),
        OPT_BOOLEAN(0, "tool_stats", &trace.show_tool_stats, "show tool stats"),
        OPT_CALLBACK(0, "expr", &trace, "expr", "list of syscalls/events to trace",
                     trace__parse_events_option),
        OPT_STRING('o', "output", &output_name, "file", "output file name"),
        OPT_STRING('i', "input", &input_name, "file", "Analyze events in file"),
        OPT_STRING('p', "pid", &trace.opts.target.pid, "pid",
                    "trace events on existing process id"),
        OPT_STRING('t', "tid", &trace.opts.target.tid, "tid",
                    "trace events on existing thread id"),
        OPT_CALLBACK(0, "filter-pids", &trace, "CSV list of pids",
                     "pids to filter (by the kernel)", trace__set_filter_pids),
        OPT_BOOLEAN('a', "all-cpus", &trace.opts.target.system_wide,
                    "system-wide collection from all CPUs"),
        OPT_STRING('C', "cpu", &trace.opts.target.cpu_list, "cpu",
                    "list of cpus to monitor"),
        OPT_BOOLEAN(0, "no-inherit", &trace.opts.no_inherit,
                    "child tasks do not inherit counters"),
        OPT_CALLBACK('m', "mmap-pages", &trace.opts.mmap_pages, "pages",
                     "number of mmap data pages",
                     perf_evlist__parse_mmap_pages),
        OPT_STRING('u', "uid", &trace.opts.target.uid_str, "user",
                   "user to profile"),
        OPT_CALLBACK(0, "duration", &trace, "float",
                     "show only events with duration > N.M ms",
                     trace__set_duration),
        OPT_BOOLEAN(0, "sched", &trace.sched, "show blocking scheduler events"),
        OPT_INCR('v', "verbose", &verbose, "be more verbose"),
        OPT_BOOLEAN('T', "time", &trace.full_time,
                    "Show full timestamp, not time relative to first start"),
        OPT_BOOLEAN(0, "failure", &trace.failure_only,
                    "Show only syscalls that failed"),
        OPT_BOOLEAN('s', "summary", &trace.summary_only,
                    "Show only syscall summary with statistics"),
        OPT_BOOLEAN('S', "with-summary", &trace.summary,
                    "Show all syscalls and summary with statistics"),
        OPT_CALLBACK_DEFAULT('F', "pf", &trace.trace_pgfaults, "all|maj|min",
                     "Trace pagefaults", parse_pagefaults, "maj"),
        OPT_BOOLEAN(0, "syscalls", &trace.trace_syscalls, "Trace syscalls"),
        OPT_BOOLEAN('f', "force", &trace.force, "don't complain, do it"),
        OPT_CALLBACK(0, "call-graph", &trace.opts,
                     "record_mode[,record_size]", record_callchain_help,
                     &record_parse_callchain_opt),
        OPT_BOOLEAN(0, "kernel-syscall-graph", &trace.kernel_syscallchains,
                    "Show the kernel callchains on the syscall exit path"),
        OPT_UINTEGER(0, "min-stack", &trace.min_stack,
                     "Set the minimum stack depth when parsing the callchain, "
                     "anything below the specified depth will be ignored."),
        OPT_UINTEGER(0, "max-stack", &trace.max_stack,
                     "Set the maximum stack depth when parsing the callchain, "
                     "anything beyond the specified depth will be ignored. "
                     "Default: kernel.perf_event_max_stack or " __stringify(PERF_MAX_STACK_DEPTH)),
        OPT_BOOLEAN(0, "print-sample", &trace.print_sample,
                        "print the PERF_RECORD_SAMPLE PERF_SAMPLE_ info, for debugging"),
        OPT_UINTEGER(0, "proc-map-timeout", &trace.opts.proc_map_timeout,
                        "per thread proc mmap processing timeout in ms"),
        OPT_CALLBACK('G', "cgroup", &trace, "name", "monitor event in cgroup name only",
                     trace__parse_cgroups),
        OPT_UINTEGER('D', "delay", &trace.opts.initial_delay,
                     "ms to wait before starting measurement after program "
                     "start"),
        OPT_END()
        };
        /* Assume user-set until the "unset" sentinels are detected below. */
        bool __maybe_unused max_stack_user_set = true;
        bool mmap_pages_user_set = true;
        struct perf_evsel *evsel;
        const char * const trace_subcommands[] = { "record", NULL };
        int err = -1;
        char bf[BUFSIZ]; /* scratch buffer for strerror-style messages */

        /* Dump a backtrace instead of dying silently on internal faults. */
        signal(SIGSEGV, sighandler_dump_stack);
        signal(SIGFPE, sighandler_dump_stack);

        trace.evlist = perf_evlist__new();
        trace.sctbl = syscalltbl__new();

        if (trace.evlist == NULL || trace.sctbl == NULL) {
                pr_err("Not enough memory to run!\n");
                err = -ENOMEM;
                goto out;
        }

        argc = parse_options_subcommand(argc, argv, trace_options, trace_subcommands,
                                 trace_usage, PARSE_OPT_STOP_AT_NON_OPTION);

        /* Cgroup filtering needs system-wide mode (-a) to make sense. */
        if ((nr_cgroups || trace.cgroup) && !trace.opts.target.system_wide) {
                usage_with_options_msg(trace_usage, trace_options,
                                       "cgroup monitoring only available in system-wide mode");
        }

        /*
         * Try to wire up the BPF __augmented_syscalls__ output event used to
         * get syscall arg payloads (e.g. pathnames) from BPF programs.
         * A NULL evsel (no such BPF event) is fine; only IS_ERR is fatal.
         */
        evsel = bpf__setup_output_event(trace.evlist, "__augmented_syscalls__");
        if (IS_ERR(evsel)) {
                bpf__strerror_setup_output_event(trace.evlist, PTR_ERR(evsel), bf, sizeof(bf));
                pr_err("ERROR: Setup trace syscalls enter failed: %s\n", bf);
                goto out;
        }

        if (evsel) {
                if (perf_evsel__init_augmented_syscall_tp(evsel) ||
                    perf_evsel__init_augmented_syscall_tp_args(evsel))
                        goto out;
                trace.syscalls.events.augmented = evsel;
        }

        err = bpf__setup_stdout(trace.evlist);
        if (err) {
                bpf__strerror_setup_stdout(trace.evlist, err, bf, sizeof(bf));
                pr_err("ERROR: Setup BPF stdout failed: %s\n", bf);
                goto out;
        }

        err = -1; /* generic failure from here until set otherwise */

        if (trace.trace_pgfaults) {
                /* Page fault samples need the faulting address and a timestamp. */
                trace.opts.sample_address = true;
                trace.opts.sample_time = true;
        }

        /* UINT_MAX sentinel still present => user didn't pass -m/--mmap-pages. */
        if (trace.opts.mmap_pages == UINT_MAX)
                mmap_pages_user_set = false;

        if (trace.max_stack == UINT_MAX) {
                /* Replaying a file can't query the live sysctl meaningfully. */
                trace.max_stack = input_name ? PERF_MAX_STACK_DEPTH : sysctl__max_stack();
                max_stack_user_set = false;
        }

#ifdef HAVE_DWARF_UNWIND_SUPPORT
        /* Stack-depth options imply callchains; default to DWARF unwinding. */
        if ((trace.min_stack || max_stack_user_set) && !callchain_param.enabled) {
                record_opts__parse_callchain(&trace.opts, &callchain_param, "dwarf", false);
        }
#endif

        if (callchain_param.enabled) {
                /*
                 * Callchains need bigger ring buffers; as root, bump the
                 * default to 4x the mlock limit worth of pages.
                 */
                if (!mmap_pages_user_set && geteuid() == 0)
                        trace.opts.mmap_pages = perf_event_mlock_kb_in_pages() * 4;

                symbol_conf.use_callchain = true;
        }

        /* Events came in via -e/--expr: install handlers for them. */
        if (trace.evlist->nr_entries > 0) {
                evlist__set_evsel_handler(trace.evlist, trace__event_handler);
                if (evlist__set_syscall_tp_fields(trace.evlist)) {
                        perror("failed to set syscalls:* tracepoint fields");
                        goto out;
                }
        }

        /* 'perf trace record ...' delegates to perf record. */
        if ((argc >= 1) && (strcmp(argv[0], "record") == 0))
                return trace__record(&trace, argc-1, &argv[1]);

        /* summary_only implies summary option, but don't overwrite summary if set */
        if (trace.summary_only)
                trace.summary = trace.summary_only;

        /* With nothing else requested, default to strace-like syscall tracing. */
        if (!trace.trace_syscalls && !trace.trace_pgfaults &&
            trace.evlist->nr_entries == 0 /* Was --events used? */) {
                trace.trace_syscalls = true;
        }

        if (output_name != NULL) {
                err = trace__open_output(&trace, output_name);
                if (err < 0) {
                        perror("failed to create output file");
                        goto out;
                }
        }

        err = target__validate(&trace.opts.target);
        if (err) {
                target__strerror(&trace.opts.target, err, bf, sizeof(bf));
                fprintf(trace.output, "%s", bf);
                goto out_close;
        }

        err = target__parse_uid(&trace.opts.target);
        if (err) {
                target__strerror(&trace.opts.target, err, bf, sizeof(bf));
                fprintf(trace.output, "%s", bf);
                goto out_close;
        }

        /* No workload and no pid/tid/cpu/uid filter: trace the whole system. */
        if (!argc && target__none(&trace.opts.target))
                trace.opts.target.system_wide = true;

        if (input_name)
                err = trace__replay(&trace); /* -i: process a perf.data file */
        else
                err = trace__run(&trace, argc, argv); /* live tracing */

out_close:
        if (output_name != NULL)
                fclose(trace.output);
out:
        return err;
}