GNU Linux-libre 4.14.303-gnu1
[releases.git] / tools / perf / builtin-trace.c
1 /*
2  * builtin-trace.c
3  *
4  * Builtin 'trace' command:
5  *
6  * Display a continuously updated trace of any workload, CPU, specific PID,
7  * system wide, etc.  Default format is loosely strace like, but any other
8  * event may be specified using --event.
9  *
10  * Copyright (C) 2012, 2013, 2014, 2015 Red Hat Inc, Arnaldo Carvalho de Melo <acme@redhat.com>
11  *
12  * Initially based on the 'trace' prototype by Thomas Gleixner:
13  *
14  * http://lwn.net/Articles/415728/ ("Announcing a new utility: 'trace'")
15  *
16  * Released under the GPL v2. (and only v2, not any later version)
17  */
18
19 #include <traceevent/event-parse.h>
20 #include <api/fs/tracing_path.h>
21 #include "builtin.h"
22 #include "util/color.h"
23 #include "util/debug.h"
24 #include "util/event.h"
25 #include "util/evlist.h"
26 #include <subcmd/exec-cmd.h>
27 #include "util/machine.h"
28 #include "util/path.h"
29 #include "util/session.h"
30 #include "util/thread.h"
31 #include <subcmd/parse-options.h>
32 #include "util/strlist.h"
33 #include "util/intlist.h"
34 #include "util/thread_map.h"
35 #include "util/stat.h"
36 #include "trace/beauty/beauty.h"
37 #include "trace-event.h"
38 #include "util/parse-events.h"
39 #include "util/bpf-loader.h"
40 #include "callchain.h"
41 #include "print_binary.h"
42 #include "string2.h"
43 #include "syscalltbl.h"
44 #include "rb_resort.h"
45
46 #include <errno.h>
47 #include <inttypes.h>
48 #include <libaudit.h> /* FIXME: Still needed for audit_errno_to_name */
49 #include <poll.h>
50 #include <signal.h>
51 #include <stdlib.h>
52 #include <string.h>
53 #include <linux/err.h>
54 #include <linux/filter.h>
55 #include <linux/audit.h>
56 #include <linux/kernel.h>
57 #include <linux/random.h>
58 #include <linux/stringify.h>
59 #include <linux/time64.h>
60
61 #include "sane_ctype.h"
62
63 #ifndef O_CLOEXEC
64 # define O_CLOEXEC              02000000
65 #endif
66
67 #ifndef F_LINUX_SPECIFIC_BASE
68 # define F_LINUX_SPECIFIC_BASE  1024
69 #endif
70
/*
 * Per-session state for 'perf trace': the perf_tool callbacks, the syscall
 * table, the evsels being monitored and the command line knobs.
 */
struct trace {
	struct perf_tool	tool;
	struct syscalltbl	*sctbl;		/* syscall id <-> name table */
	struct {
		int		max;		/* presumably highest syscall id in 'table' — confirm at init site */
		struct syscall	*table;		/* indexed by syscall id — TODO confirm */
		struct {
			struct perf_evsel *sys_enter,	/* raw_syscalls:sys_enter (or syscalls: on older kernels) */
					  *sys_exit;	/* raw_syscalls:sys_exit */
		}		events;
	} syscalls;
	struct record_opts	opts;
	struct perf_evlist	*evlist;
	struct machine		*host;
	struct thread		*current;	/* thread of the most recent event — confirm */
	u64			base_time;
	FILE			*output;	/* where formatted trace lines go */
	unsigned long		nr_events;
	struct strlist		*ev_qualifier;	/* user-supplied syscall name list */
	struct {
		size_t		nr;
		int		*entries;	/* syscall ids resolved from ev_qualifier */
	}			ev_qualifier_ids;
	struct {
		size_t		nr;
		pid_t		*entries;	/* PIDs to filter from the trace */
	}			filter_pids;
	double			duration_filter;
	double			runtime_ms;
	struct {
		u64		vfs_getname,	/* tool stats counters */
				proc_getname;
	} stats;
	unsigned int		max_stack;
	unsigned int		min_stack;
	bool			not_ev_qualifier;	/* ev_qualifier is an exclusion list */
	bool			live;
	bool			full_time;
	bool			sched;
	bool			multiple_threads;
	bool			summary;
	bool			summary_only;
	bool			show_comm;
	bool			show_tool_stats;
	bool			trace_syscalls;
	bool			kernel_syscallchains;
	bool			force;
	bool			vfs_getname;
	int			trace_pgfaults;
	int			open_id;
};
122
/*
 * Accessor for one tracepoint field: its byte offset into the sample's
 * raw_data plus a getter reading it either as an integer or as a pointer.
 * Only one of the union members is set, per tp_field__init_{uint,ptr}().
 */
struct tp_field {
	int offset;
	union {
		u64 (*integer)(struct tp_field *field, struct perf_sample *sample);
		void *(*pointer)(struct tp_field *field, struct perf_sample *sample);
	};
};
130
/*
 * Generate tp_field__u{8,16,32,64}(): read a host-endian unsigned integer
 * of the given width from the sample payload.  memcpy() is used because
 * the field may not be naturally aligned inside raw_data.
 */
#define TP_UINT_FIELD(bits) \
static u64 tp_field__u##bits(struct tp_field *field, struct perf_sample *sample) \
{ \
	u##bits value; \
	memcpy(&value, sample->raw_data + field->offset, sizeof(value)); \
	return value;  \
}

TP_UINT_FIELD(8);
TP_UINT_FIELD(16);
TP_UINT_FIELD(32);
TP_UINT_FIELD(64);

/*
 * Byte-swapping variants, chosen when the data file was recorded on a
 * machine of the opposite endianness (evsel->needs_swap).
 */
#define TP_UINT_FIELD__SWAPPED(bits) \
static u64 tp_field__swapped_u##bits(struct tp_field *field, struct perf_sample *sample) \
{ \
	u##bits value; \
	memcpy(&value, sample->raw_data + field->offset, sizeof(value)); \
	return bswap_##bits(value);\
}

TP_UINT_FIELD__SWAPPED(16);
TP_UINT_FIELD__SWAPPED(32);
TP_UINT_FIELD__SWAPPED(64);
155
156 static int tp_field__init_uint(struct tp_field *field,
157                                struct format_field *format_field,
158                                bool needs_swap)
159 {
160         field->offset = format_field->offset;
161
162         switch (format_field->size) {
163         case 1:
164                 field->integer = tp_field__u8;
165                 break;
166         case 2:
167                 field->integer = needs_swap ? tp_field__swapped_u16 : tp_field__u16;
168                 break;
169         case 4:
170                 field->integer = needs_swap ? tp_field__swapped_u32 : tp_field__u32;
171                 break;
172         case 8:
173                 field->integer = needs_swap ? tp_field__swapped_u64 : tp_field__u64;
174                 break;
175         default:
176                 return -1;
177         }
178
179         return 0;
180 }
181
182 static void *tp_field__ptr(struct tp_field *field, struct perf_sample *sample)
183 {
184         return sample->raw_data + field->offset;
185 }
186
187 static int tp_field__init_ptr(struct tp_field *field, struct format_field *format_field)
188 {
189         field->offset = format_field->offset;
190         field->pointer = tp_field__ptr;
191         return 0;
192 }
193
/*
 * Private state hung off the sys_enter/sys_exit evsels: the syscall 'id'
 * field plus either the enter-side 'args' or the exit-side 'ret' field
 * (never both, hence the union).
 */
struct syscall_tp {
	struct tp_field id;
	union {
		struct tp_field args, ret;
	};
};
200
201 static int perf_evsel__init_tp_uint_field(struct perf_evsel *evsel,
202                                           struct tp_field *field,
203                                           const char *name)
204 {
205         struct format_field *format_field = perf_evsel__field(evsel, name);
206
207         if (format_field == NULL)
208                 return -1;
209
210         return tp_field__init_uint(field, format_field, evsel->needs_swap);
211 }
212
/*
 * Initialize the tp_field member 'name' of the evsel's private syscall_tp
 * from the tracepoint field of the same name (stringified).
 */
#define perf_evsel__init_sc_tp_uint_field(evsel, name) \
	({ struct syscall_tp *sc = evsel->priv;\
	   perf_evsel__init_tp_uint_field(evsel, &sc->name, #name); })
216
/*
 * Look up tracepoint field 'name' on the evsel and bind the pointer
 * getter for it.  Returns 0 on success, -1 if the field doesn't exist.
 */
static int perf_evsel__init_tp_ptr_field(struct perf_evsel *evsel,
					 struct tp_field *field,
					 const char *name)
{
	struct format_field *fmt = perf_evsel__field(evsel, name);

	return fmt ? tp_field__init_ptr(field, fmt) : -1;
}
228
/* Pointer-field counterpart of perf_evsel__init_sc_tp_uint_field(). */
#define perf_evsel__init_sc_tp_ptr_field(evsel, name) \
	({ struct syscall_tp *sc = evsel->priv;\
	   perf_evsel__init_tp_ptr_field(evsel, &sc->name, #name); })
232
/*
 * Release an evsel together with its private syscall_tp state.  The priv
 * pointer is freed (and NULLed) before the evsel itself is deleted.
 */
static void perf_evsel__delete_priv(struct perf_evsel *evsel)
{
	zfree(&evsel->priv);
	perf_evsel__delete(evsel);
}
238
239 static int perf_evsel__init_syscall_tp(struct perf_evsel *evsel, void *handler)
240 {
241         evsel->priv = malloc(sizeof(struct syscall_tp));
242         if (evsel->priv != NULL) {
243                 if (perf_evsel__init_sc_tp_uint_field(evsel, id))
244                         goto out_delete;
245
246                 evsel->handler = handler;
247                 return 0;
248         }
249
250         return -ENOMEM;
251
252 out_delete:
253         zfree(&evsel->priv);
254         return -ENOENT;
255 }
256
257 static struct perf_evsel *perf_evsel__syscall_newtp(const char *direction, void *handler)
258 {
259         struct perf_evsel *evsel = perf_evsel__newtp("raw_syscalls", direction);
260
261         /* older kernel (e.g., RHEL6) use syscalls:{enter,exit} */
262         if (IS_ERR(evsel))
263                 evsel = perf_evsel__newtp("syscalls", direction);
264
265         if (IS_ERR(evsel))
266                 return NULL;
267
268         if (perf_evsel__init_syscall_tp(evsel, handler))
269                 goto out_delete;
270
271         return evsel;
272
273 out_delete:
274         perf_evsel__delete_priv(evsel);
275         return NULL;
276 }
277
/*
 * Read tracepoint field 'name' from a sample as an integer / pointer,
 * via the accessors bound in perf_evsel__init_syscall_tp().
 */
#define perf_evsel__sc_tp_uint(evsel, name, sample) \
	({ struct syscall_tp *fields = evsel->priv; \
	   fields->name.integer(&fields->name, sample); })

#define perf_evsel__sc_tp_ptr(evsel, name, sample) \
	({ struct syscall_tp *fields = evsel->priv; \
	   fields->name.pointer(&fields->name, sample); })
285
286 size_t strarray__scnprintf(struct strarray *sa, char *bf, size_t size, const char *intfmt, int val)
287 {
288         int idx = val - sa->offset;
289
290         if (idx < 0 || idx >= sa->nr_entries)
291                 return scnprintf(bf, size, intfmt, val);
292
293         return scnprintf(bf, size, "%s", sa->entries[idx]);
294 }
295
/* Format arg->val via the strarray passed in arg->parm, 'intfmt' as fallback. */
static size_t __syscall_arg__scnprintf_strarray(char *bf, size_t size,
						const char *intfmt,
						struct syscall_arg *arg)
{
	return strarray__scnprintf(arg->parm, bf, size, intfmt, arg->val);
}

/* Default strarray formatter: out-of-range values are printed in decimal. */
static size_t syscall_arg__scnprintf_strarray(char *bf, size_t size,
					      struct syscall_arg *arg)
{
	return __syscall_arg__scnprintf_strarray(bf, size, "%d", arg);
}

#define SCA_STRARRAY syscall_arg__scnprintf_strarray
310
/*
 * Several strarrays tried in order for one argument, e.g. fcntl's base
 * commands plus the F_LINUX_SPECIFIC_BASE-offset ones.
 */
struct strarrays {
	int		nr_entries;
	struct strarray **entries;
};

#define DEFINE_STRARRAYS(array) struct strarrays strarrays__##array = { \
	.nr_entries = ARRAY_SIZE(array), \
	.entries = array, \
}
320
321 size_t syscall_arg__scnprintf_strarrays(char *bf, size_t size,
322                                         struct syscall_arg *arg)
323 {
324         struct strarrays *sas = arg->parm;
325         int i;
326
327         for (i = 0; i < sas->nr_entries; ++i) {
328                 struct strarray *sa = sas->entries[i];
329                 int idx = arg->val - sa->offset;
330
331                 if (idx >= 0 && idx < sa->nr_entries) {
332                         if (sa->entries[idx] == NULL)
333                                 break;
334                         return scnprintf(bf, size, "%s", sa->entries[idx]);
335                 }
336         }
337
338         return scnprintf(bf, size, "%d", arg->val);
339 }
340
341 #ifndef AT_FDCWD
342 #define AT_FDCWD        -100
343 #endif
344
345 static size_t syscall_arg__scnprintf_fd_at(char *bf, size_t size,
346                                            struct syscall_arg *arg)
347 {
348         int fd = arg->val;
349
350         if (fd == AT_FDCWD)
351                 return scnprintf(bf, size, "CWD");
352
353         return syscall_arg__scnprintf_fd(bf, size, arg);
354 }
355
356 #define SCA_FDAT syscall_arg__scnprintf_fd_at
357
358 static size_t syscall_arg__scnprintf_close_fd(char *bf, size_t size,
359                                               struct syscall_arg *arg);
360
361 #define SCA_CLOSE_FD syscall_arg__scnprintf_close_fd
362
/* Generic argument printers: hexadecimal, signed int and signed long. */
size_t syscall_arg__scnprintf_hex(char *bf, size_t size, struct syscall_arg *arg)
{
	return scnprintf(bf, size, "%#lx", arg->val);
}

size_t syscall_arg__scnprintf_int(char *bf, size_t size, struct syscall_arg *arg)
{
	return scnprintf(bf, size, "%d", arg->val);
}

size_t syscall_arg__scnprintf_long(char *bf, size_t size, struct syscall_arg *arg)
{
	return scnprintf(bf, size, "%ld", arg->val);
}
377
/*
 * String tables for pretty-printing enum-like syscall arguments.  Each is
 * wrapped in a 'struct strarray' by DEFINE_STRARRAY{,_OFFSET}() (macro
 * defined elsewhere); the _OFFSET variant records the value of the first
 * entry so lookups subtract it (see strarray__scnprintf()).
 */

/* bpf(2) cmd */
static const char *bpf_cmd[] = {
	"MAP_CREATE", "MAP_LOOKUP_ELEM", "MAP_UPDATE_ELEM", "MAP_DELETE_ELEM",
	"MAP_GET_NEXT_KEY", "PROG_LOAD",
};
static DEFINE_STRARRAY(bpf_cmd);

/* epoll_ctl(2) op: EPOLL_CTL_ADD is 1, hence the offset */
static const char *epoll_ctl_ops[] = { "ADD", "DEL", "MOD", };
static DEFINE_STRARRAY_OFFSET(epoll_ctl_ops, 1);

/* {get,set}itimer(2) which */
static const char *itimers[] = { "REAL", "VIRTUAL", "PROF", };
static DEFINE_STRARRAY(itimers);

/* keyctl(2) option */
static const char *keyctl_options[] = {
	"GET_KEYRING_ID", "JOIN_SESSION_KEYRING", "UPDATE", "REVOKE", "CHOWN",
	"SETPERM", "DESCRIBE", "CLEAR", "LINK", "UNLINK", "SEARCH", "READ",
	"INSTANTIATE", "NEGATE", "SET_REQKEY_KEYRING", "SET_TIMEOUT",
	"ASSUME_AUTHORITY", "GET_SECURITY", "SESSION_TO_PARENT", "REJECT",
	"INSTANTIATE_IOV", "INVALIDATE", "GET_PERSISTENT",
};
static DEFINE_STRARRAY(keyctl_options);

/* lseek(2) whence; DATA/HOLE only when the libc headers define them */
static const char *whences[] = { "SET", "CUR", "END",
#ifdef SEEK_DATA
"DATA",
#endif
#ifdef SEEK_HOLE
"HOLE",
#endif
};
static DEFINE_STRARRAY(whences);

/* fcntl(2) cmd, base range */
static const char *fcntl_cmds[] = {
	"DUPFD", "GETFD", "SETFD", "GETFL", "SETFL", "GETLK", "SETLK",
	"SETLKW", "SETOWN", "GETOWN", "SETSIG", "GETSIG", "GETLK64",
	"SETLK64", "SETLKW64", "SETOWN_EX", "GETOWN_EX",
	"GETOWNER_UIDS",
};
static DEFINE_STRARRAY(fcntl_cmds);

/* fcntl(2) cmd, F_LINUX_SPECIFIC_BASE range; [5] skips a hole in the numbering */
static const char *fcntl_linux_specific_cmds[] = {
	"SETLEASE", "GETLEASE", "NOTIFY", [5] =	"CANCELLK", "DUPFD_CLOEXEC",
	"SETPIPE_SZ", "GETPIPE_SZ", "ADD_SEALS", "GET_SEALS",
	"GET_RW_HINT", "SET_RW_HINT", "GET_FILE_RW_HINT", "SET_FILE_RW_HINT",
};

static DEFINE_STRARRAY_OFFSET(fcntl_linux_specific_cmds, F_LINUX_SPECIFIC_BASE);

/* Both fcntl ranges, tried in order by syscall_arg__scnprintf_strarrays() */
static struct strarray *fcntl_cmds_arrays[] = {
	&strarray__fcntl_cmds,
	&strarray__fcntl_linux_specific_cmds,
};

static DEFINE_STRARRAYS(fcntl_cmds_arrays);

/* {get,set}rlimit(2)/prlimit64(2) resource */
static const char *rlimit_resources[] = {
	"CPU", "FSIZE", "DATA", "STACK", "CORE", "RSS", "NPROC", "NOFILE",
	"MEMLOCK", "AS", "LOCKS", "SIGPENDING", "MSGQUEUE", "NICE", "RTPRIO",
	"RTTIME",
};
static DEFINE_STRARRAY(rlimit_resources);

/* rt_sigprocmask(2) how */
static const char *sighow[] = { "BLOCK", "UNBLOCK", "SETMASK", };
static DEFINE_STRARRAY(sighow);

/* clock_gettime(2) clk_id */
static const char *clockid[] = {
	"REALTIME", "MONOTONIC", "PROCESS_CPUTIME_ID", "THREAD_CPUTIME_ID",
	"MONOTONIC_RAW", "REALTIME_COARSE", "MONOTONIC_COARSE", "BOOTTIME",
	"REALTIME_ALARM", "BOOTTIME_ALARM", "SGI_CYCLE", "TAI"
};
static DEFINE_STRARRAY(clockid);

/* socket(2)/socketpair(2) family */
static const char *socket_families[] = {
	"UNSPEC", "LOCAL", "INET", "AX25", "IPX", "APPLETALK", "NETROM",
	"BRIDGE", "ATMPVC", "X25", "INET6", "ROSE", "DECnet", "NETBEUI",
	"SECURITY", "KEY", "NETLINK", "PACKET", "ASH", "ECONET", "ATMSVC",
	"RDS", "SNA", "IRDA", "PPPOX", "WANPIPE", "LLC", "IB", "CAN", "TIPC",
	"BLUETOOTH", "IUCV", "RXRPC", "ISDN", "PHONET", "IEEE802154", "CAIF",
	"ALG", "NFC", "VSOCK",
};
static DEFINE_STRARRAY(socket_families);
458
459 static size_t syscall_arg__scnprintf_access_mode(char *bf, size_t size,
460                                                  struct syscall_arg *arg)
461 {
462         size_t printed = 0;
463         int mode = arg->val;
464
465         if (mode == F_OK) /* 0 */
466                 return scnprintf(bf, size, "F");
467 #define P_MODE(n) \
468         if (mode & n##_OK) { \
469                 printed += scnprintf(bf + printed, size - printed, "%s", #n); \
470                 mode &= ~n##_OK; \
471         }
472
473         P_MODE(R);
474         P_MODE(W);
475         P_MODE(X);
476 #undef P_MODE
477
478         if (mode)
479                 printed += scnprintf(bf + printed, size - printed, "|%#x", mode);
480
481         return printed;
482 }
483
484 #define SCA_ACCMODE syscall_arg__scnprintf_access_mode
485
486 static size_t syscall_arg__scnprintf_filename(char *bf, size_t size,
487                                               struct syscall_arg *arg);
488
489 #define SCA_FILENAME syscall_arg__scnprintf_filename
490
491 static size_t syscall_arg__scnprintf_pipe_flags(char *bf, size_t size,
492                                                 struct syscall_arg *arg)
493 {
494         int printed = 0, flags = arg->val;
495
496 #define P_FLAG(n) \
497         if (flags & O_##n) { \
498                 printed += scnprintf(bf + printed, size - printed, "%s%s", printed ? "|" : "", #n); \
499                 flags &= ~O_##n; \
500         }
501
502         P_FLAG(CLOEXEC);
503         P_FLAG(NONBLOCK);
504 #undef P_FLAG
505
506         if (flags)
507                 printed += scnprintf(bf + printed, size - printed, "%s%#x", printed ? "|" : "", flags);
508
509         return printed;
510 }
511
512 #define SCA_PIPE_FLAGS syscall_arg__scnprintf_pipe_flags
513
514 #ifndef GRND_NONBLOCK
515 #define GRND_NONBLOCK   0x0001
516 #endif
517 #ifndef GRND_RANDOM
518 #define GRND_RANDOM     0x0002
519 #endif
520
521 static size_t syscall_arg__scnprintf_getrandom_flags(char *bf, size_t size,
522                                                    struct syscall_arg *arg)
523 {
524         int printed = 0, flags = arg->val;
525
526 #define P_FLAG(n) \
527         if (flags & GRND_##n) { \
528                 printed += scnprintf(bf + printed, size - printed, "%s%s", printed ? "|" : "", #n); \
529                 flags &= ~GRND_##n; \
530         }
531
532         P_FLAG(RANDOM);
533         P_FLAG(NONBLOCK);
534 #undef P_FLAG
535
536         if (flags)
537                 printed += scnprintf(bf + printed, size - printed, "%s%#x", printed ? "|" : "", flags);
538
539         return printed;
540 }
541
542 #define SCA_GETRANDOM_FLAGS syscall_arg__scnprintf_getrandom_flags
543
/*
 * Shorthand syscall_arg_fmt initializer that prints via the given
 * strarray.  The 'name' parameter is documentation only — it is not
 * expanded in the macro body.
 */
#define STRARRAY(name, array) \
	  { .scnprintf	= SCA_STRARRAY, \
	    .parm	= &strarray__##array, }
547
548 #include "trace/beauty/eventfd.c"
549 #include "trace/beauty/flock.c"
550 #include "trace/beauty/futex_op.c"
551 #include "trace/beauty/mmap.c"
552 #include "trace/beauty/mode_t.c"
553 #include "trace/beauty/msg_flags.c"
554 #include "trace/beauty/open_flags.c"
555 #include "trace/beauty/perf_event_open.c"
556 #include "trace/beauty/pid.c"
557 #include "trace/beauty/sched_policy.c"
558 #include "trace/beauty/seccomp.c"
559 #include "trace/beauty/signum.c"
560 #include "trace/beauty/socket_type.c"
561 #include "trace/beauty/waitid_options.c"
562
/*
 * How to format one syscall argument: a scnprintf-style callback, an
 * optional parameter for it (e.g. a strarray), an override for the
 * argument name, and whether a zero value should still be printed.
 */
struct syscall_arg_fmt {
	size_t	   (*scnprintf)(char *bf, size_t size, struct syscall_arg *arg);
	void	   *parm;
	const char *name;
	bool	   show_zero;
};
569
570 static struct syscall_fmt {
571         const char *name;
572         const char *alias;
573         struct syscall_arg_fmt arg[6];
574         u8         nr_args;
575         bool       errpid;
576         bool       timeout;
577         bool       hexret;
578 } syscall_fmts[] = {
579         { .name     = "access",
580           .arg = { [1] = { .scnprintf = SCA_ACCMODE,  /* mode */ }, }, },
581         { .name     = "arch_prctl", .alias = "prctl", },
582         { .name     = "bpf",
583           .arg = { [0] = STRARRAY(cmd, bpf_cmd), }, },
584         { .name     = "brk",        .hexret = true,
585           .arg = { [0] = { .scnprintf = SCA_HEX, /* brk */ }, }, },
586         { .name     = "clock_gettime",
587           .arg = { [0] = STRARRAY(clk_id, clockid), }, },
588         { .name     = "clone",      .errpid = true, .nr_args = 5,
589           .arg = { [0] = { .name = "flags",         .scnprintf = SCA_CLONE_FLAGS, },
590                    [1] = { .name = "child_stack",   .scnprintf = SCA_HEX, },
591                    [2] = { .name = "parent_tidptr", .scnprintf = SCA_HEX, },
592                    [3] = { .name = "child_tidptr",  .scnprintf = SCA_HEX, },
593                    [4] = { .name = "tls",           .scnprintf = SCA_HEX, }, }, },
594         { .name     = "close",
595           .arg = { [0] = { .scnprintf = SCA_CLOSE_FD, /* fd */ }, }, },
596         { .name     = "epoll_ctl",
597           .arg = { [1] = STRARRAY(op, epoll_ctl_ops), }, },
598         { .name     = "eventfd2",
599           .arg = { [1] = { .scnprintf = SCA_EFD_FLAGS, /* flags */ }, }, },
600         { .name     = "fchmodat",
601           .arg = { [0] = { .scnprintf = SCA_FDAT, /* fd */ }, }, },
602         { .name     = "fchownat",
603           .arg = { [0] = { .scnprintf = SCA_FDAT, /* fd */ }, }, },
604         { .name     = "fcntl",
605           .arg = { [1] = { .scnprintf = SCA_FCNTL_CMD, /* cmd */
606                            .parm      = &strarrays__fcntl_cmds_arrays,
607                            .show_zero = true, },
608                    [2] = { .scnprintf =  SCA_FCNTL_ARG, /* arg */ }, }, },
609         { .name     = "flock",
610           .arg = { [1] = { .scnprintf = SCA_FLOCK, /* cmd */ }, }, },
611         { .name     = "fstat", .alias = "newfstat", },
612         { .name     = "fstatat", .alias = "newfstatat", },
613         { .name     = "futex",
614           .arg = { [1] = { .scnprintf = SCA_FUTEX_OP, /* op */ }, }, },
615         { .name     = "futimesat",
616           .arg = { [0] = { .scnprintf = SCA_FDAT, /* fd */ }, }, },
617         { .name     = "getitimer",
618           .arg = { [0] = STRARRAY(which, itimers), }, },
619         { .name     = "getpid",     .errpid = true, },
620         { .name     = "getpgid",    .errpid = true, },
621         { .name     = "getppid",    .errpid = true, },
622         { .name     = "getrandom",
623           .arg = { [2] = { .scnprintf = SCA_GETRANDOM_FLAGS, /* flags */ }, }, },
624         { .name     = "getrlimit",
625           .arg = { [0] = STRARRAY(resource, rlimit_resources), }, },
626         { .name     = "ioctl",
627           .arg = {
628 #if defined(__i386__) || defined(__x86_64__)
629 /*
630  * FIXME: Make this available to all arches.
631  */
632                    [1] = { .scnprintf = SCA_IOCTL_CMD, /* cmd */ },
633                    [2] = { .scnprintf = SCA_HEX, /* arg */ }, }, },
634 #else
635                    [2] = { .scnprintf = SCA_HEX, /* arg */ }, }, },
636 #endif
637         { .name     = "keyctl",
638           .arg = { [0] = STRARRAY(option, keyctl_options), }, },
639         { .name     = "kill",
640           .arg = { [1] = { .scnprintf = SCA_SIGNUM, /* sig */ }, }, },
641         { .name     = "linkat",
642           .arg = { [0] = { .scnprintf = SCA_FDAT, /* fd */ }, }, },
643         { .name     = "lseek",
644           .arg = { [2] = STRARRAY(whence, whences), }, },
645         { .name     = "lstat", .alias = "newlstat", },
646         { .name     = "madvise",
647           .arg = { [0] = { .scnprintf = SCA_HEX,      /* start */ },
648                    [2] = { .scnprintf = SCA_MADV_BHV, /* behavior */ }, }, },
649         { .name     = "mkdirat",
650           .arg = { [0] = { .scnprintf = SCA_FDAT, /* fd */ }, }, },
651         { .name     = "mknodat",
652           .arg = { [0] = { .scnprintf = SCA_FDAT, /* fd */ }, }, },
653         { .name     = "mlock",
654           .arg = { [0] = { .scnprintf = SCA_HEX, /* addr */ }, }, },
655         { .name     = "mlockall",
656           .arg = { [0] = { .scnprintf = SCA_HEX, /* addr */ }, }, },
657         { .name     = "mmap",       .hexret = true,
658 /* The standard mmap maps to old_mmap on s390x */
659 #if defined(__s390x__)
660         .alias = "old_mmap",
661 #endif
662           .arg = { [0] = { .scnprintf = SCA_HEX,        /* addr */ },
663                    [2] = { .scnprintf = SCA_MMAP_PROT,  /* prot */ },
664                    [3] = { .scnprintf = SCA_MMAP_FLAGS, /* flags */ }, }, },
665         { .name     = "mprotect",
666           .arg = { [0] = { .scnprintf = SCA_HEX,        /* start */ },
667                    [2] = { .scnprintf = SCA_MMAP_PROT,  /* prot */ }, }, },
668         { .name     = "mq_unlink",
669           .arg = { [0] = { .scnprintf = SCA_FILENAME, /* u_name */ }, }, },
670         { .name     = "mremap",     .hexret = true,
671           .arg = { [0] = { .scnprintf = SCA_HEX,          /* addr */ },
672                    [3] = { .scnprintf = SCA_MREMAP_FLAGS, /* flags */ },
673                    [4] = { .scnprintf = SCA_HEX,          /* new_addr */ }, }, },
674         { .name     = "munlock",
675           .arg = { [0] = { .scnprintf = SCA_HEX, /* addr */ }, }, },
676         { .name     = "munmap",
677           .arg = { [0] = { .scnprintf = SCA_HEX, /* addr */ }, }, },
678         { .name     = "name_to_handle_at",
679           .arg = { [0] = { .scnprintf = SCA_FDAT, /* dfd */ }, }, },
680         { .name     = "newfstatat",
681           .arg = { [0] = { .scnprintf = SCA_FDAT, /* dfd */ }, }, },
682         { .name     = "open",
683           .arg = { [1] = { .scnprintf = SCA_OPEN_FLAGS, /* flags */ }, }, },
684         { .name     = "open_by_handle_at",
685           .arg = { [0] = { .scnprintf = SCA_FDAT,       /* dfd */ },
686                    [2] = { .scnprintf = SCA_OPEN_FLAGS, /* flags */ }, }, },
687         { .name     = "openat",
688           .arg = { [0] = { .scnprintf = SCA_FDAT,       /* dfd */ },
689                    [2] = { .scnprintf = SCA_OPEN_FLAGS, /* flags */ }, }, },
690         { .name     = "perf_event_open",
691           .arg = { [2] = { .scnprintf = SCA_INT,        /* cpu */ },
692                    [3] = { .scnprintf = SCA_FD,         /* group_fd */ },
693                    [4] = { .scnprintf = SCA_PERF_FLAGS, /* flags */ }, }, },
694         { .name     = "pipe2",
695           .arg = { [1] = { .scnprintf = SCA_PIPE_FLAGS, /* flags */ }, }, },
696         { .name     = "pkey_alloc",
697           .arg = { [1] = { .scnprintf = SCA_PKEY_ALLOC_ACCESS_RIGHTS,   /* access_rights */ }, }, },
698         { .name     = "pkey_free",
699           .arg = { [0] = { .scnprintf = SCA_INT,        /* key */ }, }, },
700         { .name     = "pkey_mprotect",
701           .arg = { [0] = { .scnprintf = SCA_HEX,        /* start */ },
702                    [2] = { .scnprintf = SCA_MMAP_PROT,  /* prot */ },
703                    [3] = { .scnprintf = SCA_INT,        /* pkey */ }, }, },
704         { .name     = "poll", .timeout = true, },
705         { .name     = "ppoll", .timeout = true, },
706         { .name     = "pread", .alias = "pread64", },
707         { .name     = "preadv", .alias = "pread", },
708         { .name     = "prlimit64",
709           .arg = { [1] = STRARRAY(resource, rlimit_resources), }, },
710         { .name     = "pwrite", .alias = "pwrite64", },
711         { .name     = "readlinkat",
712           .arg = { [0] = { .scnprintf = SCA_FDAT, /* dfd */ }, }, },
713         { .name     = "recvfrom",
714           .arg = { [3] = { .scnprintf = SCA_MSG_FLAGS, /* flags */ }, }, },
715         { .name     = "recvmmsg",
716           .arg = { [3] = { .scnprintf = SCA_MSG_FLAGS, /* flags */ }, }, },
717         { .name     = "recvmsg",
718           .arg = { [2] = { .scnprintf = SCA_MSG_FLAGS, /* flags */ }, }, },
719         { .name     = "renameat",
720           .arg = { [0] = { .scnprintf = SCA_FDAT, /* dfd */ }, }, },
721         { .name     = "rt_sigaction",
722           .arg = { [0] = { .scnprintf = SCA_SIGNUM, /* sig */ }, }, },
723         { .name     = "rt_sigprocmask",
724           .arg = { [0] = STRARRAY(how, sighow), }, },
725         { .name     = "rt_sigqueueinfo",
726           .arg = { [1] = { .scnprintf = SCA_SIGNUM, /* sig */ }, }, },
727         { .name     = "rt_tgsigqueueinfo",
728           .arg = { [2] = { .scnprintf = SCA_SIGNUM, /* sig */ }, }, },
729         { .name     = "sched_setscheduler",
730           .arg = { [1] = { .scnprintf = SCA_SCHED_POLICY, /* policy */ }, }, },
731         { .name     = "seccomp",
732           .arg = { [0] = { .scnprintf = SCA_SECCOMP_OP,    /* op */ },
733                    [1] = { .scnprintf = SCA_SECCOMP_FLAGS, /* flags */ }, }, },
734         { .name     = "select", .timeout = true, },
735         { .name     = "sendmmsg",
736           .arg = { [3] = { .scnprintf = SCA_MSG_FLAGS, /* flags */ }, }, },
737         { .name     = "sendmsg",
738           .arg = { [2] = { .scnprintf = SCA_MSG_FLAGS, /* flags */ }, }, },
739         { .name     = "sendto",
740           .arg = { [3] = { .scnprintf = SCA_MSG_FLAGS, /* flags */ }, }, },
741         { .name     = "set_tid_address", .errpid = true, },
742         { .name     = "setitimer",
743           .arg = { [0] = STRARRAY(which, itimers), }, },
744         { .name     = "setrlimit",
745           .arg = { [0] = STRARRAY(resource, rlimit_resources), }, },
746         { .name     = "socket",
747           .arg = { [0] = STRARRAY(family, socket_families),
748                    [1] = { .scnprintf = SCA_SK_TYPE, /* type */ }, }, },
749         { .name     = "socketpair",
750           .arg = { [0] = STRARRAY(family, socket_families),
751                    [1] = { .scnprintf = SCA_SK_TYPE, /* type */ }, }, },
752         { .name     = "stat", .alias = "newstat", },
753         { .name     = "statx",
754           .arg = { [0] = { .scnprintf = SCA_FDAT,        /* fdat */ },
755                    [2] = { .scnprintf = SCA_STATX_FLAGS, /* flags */ } ,
756                    [3] = { .scnprintf = SCA_STATX_MASK,  /* mask */ }, }, },
757         { .name     = "swapoff",
758           .arg = { [0] = { .scnprintf = SCA_FILENAME, /* specialfile */ }, }, },
759         { .name     = "swapon",
760           .arg = { [0] = { .scnprintf = SCA_FILENAME, /* specialfile */ }, }, },
761         { .name     = "symlinkat",
762           .arg = { [0] = { .scnprintf = SCA_FDAT, /* dfd */ }, }, },
763         { .name     = "tgkill",
764           .arg = { [2] = { .scnprintf = SCA_SIGNUM, /* sig */ }, }, },
765         { .name     = "tkill",
766           .arg = { [1] = { .scnprintf = SCA_SIGNUM, /* sig */ }, }, },
767         { .name     = "uname", .alias = "newuname", },
768         { .name     = "unlinkat",
769           .arg = { [0] = { .scnprintf = SCA_FDAT, /* dfd */ }, }, },
770         { .name     = "utimensat",
771           .arg = { [0] = { .scnprintf = SCA_FDAT, /* dirfd */ }, }, },
772         { .name     = "wait4",      .errpid = true,
773           .arg = { [2] = { .scnprintf = SCA_WAITID_OPTIONS, /* options */ }, }, },
774         { .name     = "waitid",     .errpid = true,
775           .arg = { [3] = { .scnprintf = SCA_WAITID_OPTIONS, /* options */ }, }, },
776 };
777
/*
 * bsearch() comparator: 'name' is the search key (a C string), 'fmtp'
 * points at one syscall_fmts[] entry.
 */
static int syscall_fmt__cmp(const void *name, const void *fmtp)
{
	const struct syscall_fmt *fmt = fmtp;
	return strcmp(name, fmt->name);
}
783
784 static struct syscall_fmt *syscall_fmt__find(const char *name)
785 {
786         const int nmemb = ARRAY_SIZE(syscall_fmts);
787         return bsearch(name, syscall_fmts, nmemb, sizeof(struct syscall_fmt), syscall_fmt__cmp);
788 }
789
/*
 * Everything known about one syscall: its tracepoint event format, its
 * argument list, and the optional pretty-printing hints looked up in
 * syscall_fmts[].
 */
struct syscall {
	struct event_format *tp_format;
	int		    nr_args;
	struct format_field *args;	/* tracepoint argument fields — presumably a list, confirm */
	const char	    *name;
	bool		    is_exit;	/* NOTE(review): looks like "never returns" (exit etc.) — confirm at use */
	struct syscall_fmt  *fmt;	/* matching syscall_fmts[] entry, if any */
	struct syscall_arg_fmt *arg_fmt;
};
799
800 /*
801  * We need to have this 'calculated' boolean because in some cases we really
802  * don't know what is the duration of a syscall, for instance, when we start
803  * a session and some threads are waiting for a syscall to finish, say 'poll',
804  * in which case all we can do is to print "( ? ) for duration and for the
805  * start timestamp.
806  */
807 static size_t fprintf_duration(unsigned long t, bool calculated, FILE *fp)
808 {
809         double duration = (double)t / NSEC_PER_MSEC;
810         size_t printed = fprintf(fp, "(");
811
812         if (!calculated)
813                 printed += fprintf(fp, "     ?   ");
814         else if (duration >= 1.0)
815                 printed += color_fprintf(fp, PERF_COLOR_RED, "%6.3f ms", duration);
816         else if (duration >= 0.01)
817                 printed += color_fprintf(fp, PERF_COLOR_YELLOW, "%6.3f ms", duration);
818         else
819                 printed += color_fprintf(fp, PERF_COLOR_NORMAL, "%6.3f ms", duration);
820         return printed + fprintf(fp, "): ");
821 }
822
823 /**
824  * filename.ptr: The filename char pointer that will be vfs_getname'd
825  * filename.entry_str_pos: Where to insert the string translated from
826  *                         filename.ptr by the vfs_getname tracepoint/kprobe.
827  * ret_scnprintf: syscall args may set this to a different syscall return
828  *                formatter, for instance, fcntl may return fds, file flags, etc.
829  */
/* Per-thread tracing state, hung off struct thread's private pointer. */
struct thread_trace {
	u64		  entry_time;	/* timestamp of the last sys_enter */
	bool		  entry_pending;	/* sys_enter printed, waiting for sys_exit */
	unsigned long	  nr_events;	/* total samples seen for this thread */
	unsigned long	  pfmaj, pfmin;	/* major/minor page fault counts */
	char		  *entry_str;	/* formatted sys_enter line, trace__entry_str_size bytes */
	double		  runtime_ms;	/* accumulated sched:sched_stat_runtime */
	/* Set by some args' formatters (e.g. fcntl's 'cmd') to pick the return formatter */
	size_t		  (*ret_scnprintf)(char *bf, size_t size, struct syscall_arg *arg);
	struct {
		unsigned long ptr;		/* user pointer being vfs_getname'd */
		short int     entry_str_pos;	/* where in entry_str to splice the name */
		bool	      pending_open;
		unsigned int  namelen;
		char	      *name;
	} filename;
	struct {
		int	  max;		/* highest fd cached, -1 when table is empty */
		char	  **table;	/* fd -> pathname cache, lazily grown */
	} paths;

	struct intlist *syscall_stats;	/* per-syscall-id struct stats */
};
852
853 static struct thread_trace *thread_trace__new(void)
854 {
855         struct thread_trace *ttrace =  zalloc(sizeof(struct thread_trace));
856
857         if (ttrace)
858                 ttrace->paths.max = -1;
859
860         ttrace->syscall_stats = intlist__new(NULL);
861
862         return ttrace;
863 }
864
865 static struct thread_trace *thread__trace(struct thread *thread, FILE *fp)
866 {
867         struct thread_trace *ttrace;
868
869         if (thread == NULL)
870                 goto fail;
871
872         if (thread__priv(thread) == NULL)
873                 thread__set_priv(thread, thread_trace__new());
874
875         if (thread__priv(thread) == NULL)
876                 goto fail;
877
878         ttrace = thread__priv(thread);
879         ++ttrace->nr_events;
880
881         return ttrace;
882 fail:
883         color_fprintf(fp, PERF_COLOR_RED,
884                       "WARNING: not enough memory, dropping samples!\n");
885         return NULL;
886 }
887
888
889 void syscall_arg__set_ret_scnprintf(struct syscall_arg *arg,
890                                     size_t (*ret_scnprintf)(char *bf, size_t size, struct syscall_arg *arg))
891 {
892         struct thread_trace *ttrace = thread__priv(arg->thread);
893
894         ttrace->ret_scnprintf = ret_scnprintf;
895 }
896
897 #define TRACE_PFMAJ             (1 << 0)
898 #define TRACE_PFMIN             (1 << 1)
899
900 static const size_t trace__entry_str_size = 2048;
901
902 static int trace__set_fd_pathname(struct thread *thread, int fd, const char *pathname)
903 {
904         struct thread_trace *ttrace = thread__priv(thread);
905
906         if (fd > ttrace->paths.max) {
907                 char **npath = realloc(ttrace->paths.table, (fd + 1) * sizeof(char *));
908
909                 if (npath == NULL)
910                         return -1;
911
912                 if (ttrace->paths.max != -1) {
913                         memset(npath + ttrace->paths.max + 1, 0,
914                                (fd - ttrace->paths.max) * sizeof(char *));
915                 } else {
916                         memset(npath, 0, (fd + 1) * sizeof(char *));
917                 }
918
919                 ttrace->paths.table = npath;
920                 ttrace->paths.max   = fd;
921         }
922
923         ttrace->paths.table[fd] = strdup(pathname);
924
925         return ttrace->paths.table[fd] != NULL ? 0 : -1;
926 }
927
928 static int thread__read_fd_path(struct thread *thread, int fd)
929 {
930         char linkname[PATH_MAX], pathname[PATH_MAX];
931         struct stat st;
932         int ret;
933
934         if (thread->pid_ == thread->tid) {
935                 scnprintf(linkname, sizeof(linkname),
936                           "/proc/%d/fd/%d", thread->pid_, fd);
937         } else {
938                 scnprintf(linkname, sizeof(linkname),
939                           "/proc/%d/task/%d/fd/%d", thread->pid_, thread->tid, fd);
940         }
941
942         if (lstat(linkname, &st) < 0 || st.st_size + 1 > (off_t)sizeof(pathname))
943                 return -1;
944
945         ret = readlink(linkname, pathname, sizeof(pathname));
946
947         if (ret < 0 || ret > st.st_size)
948                 return -1;
949
950         pathname[ret] = '\0';
951         return trace__set_fd_pathname(thread, fd, pathname);
952 }
953
954 static const char *thread__fd_path(struct thread *thread, int fd,
955                                    struct trace *trace)
956 {
957         struct thread_trace *ttrace = thread__priv(thread);
958
959         if (ttrace == NULL)
960                 return NULL;
961
962         if (fd < 0)
963                 return NULL;
964
965         if ((fd > ttrace->paths.max || ttrace->paths.table[fd] == NULL)) {
966                 if (!trace->live)
967                         return NULL;
968                 ++trace->stats.proc_getname;
969                 if (thread__read_fd_path(thread, fd))
970                         return NULL;
971         }
972
973         return ttrace->paths.table[fd];
974 }
975
976 size_t syscall_arg__scnprintf_fd(char *bf, size_t size, struct syscall_arg *arg)
977 {
978         int fd = arg->val;
979         size_t printed = scnprintf(bf, size, "%d", fd);
980         const char *path = thread__fd_path(arg->thread, fd, arg->trace);
981
982         if (path)
983                 printed += scnprintf(bf + printed, size - printed, "<%s>", path);
984
985         return printed;
986 }
987
988 static size_t syscall_arg__scnprintf_close_fd(char *bf, size_t size,
989                                               struct syscall_arg *arg)
990 {
991         int fd = arg->val;
992         size_t printed = syscall_arg__scnprintf_fd(bf, size, arg);
993         struct thread_trace *ttrace = thread__priv(arg->thread);
994
995         if (ttrace && fd >= 0 && fd <= ttrace->paths.max)
996                 zfree(&ttrace->paths.table[fd]);
997
998         return printed;
999 }
1000
1001 static void thread__set_filename_pos(struct thread *thread, const char *bf,
1002                                      unsigned long ptr)
1003 {
1004         struct thread_trace *ttrace = thread__priv(thread);
1005
1006         ttrace->filename.ptr = ptr;
1007         ttrace->filename.entry_str_pos = bf - ttrace->entry_str;
1008 }
1009
1010 static size_t syscall_arg__scnprintf_filename(char *bf, size_t size,
1011                                               struct syscall_arg *arg)
1012 {
1013         unsigned long ptr = arg->val;
1014
1015         if (!arg->trace->vfs_getname)
1016                 return scnprintf(bf, size, "%#x", ptr);
1017
1018         thread__set_filename_pos(arg->thread, bf, ptr);
1019         return 0;
1020 }
1021
1022 static bool trace__filter_duration(struct trace *trace, double t)
1023 {
1024         return t < (trace->duration_filter * NSEC_PER_MSEC);
1025 }
1026
1027 static size_t __trace__fprintf_tstamp(struct trace *trace, u64 tstamp, FILE *fp)
1028 {
1029         double ts = (double)(tstamp - trace->base_time) / NSEC_PER_MSEC;
1030
1031         return fprintf(fp, "%10.3f ", ts);
1032 }
1033
1034 /*
1035  * We're handling tstamp=0 as an undefined tstamp, i.e. like when we are
1036  * using ttrace->entry_time for a thread that receives a sys_exit without
1037  * first having received a sys_enter ("poll" issued before tracing session
1038  * starts, lost sys_enter exit due to ring buffer overflow).
1039  */
1040 static size_t trace__fprintf_tstamp(struct trace *trace, u64 tstamp, FILE *fp)
1041 {
1042         if (tstamp > 0)
1043                 return __trace__fprintf_tstamp(trace, tstamp, fp);
1044
1045         return fprintf(fp, "         ? ");
1046 }
1047
/*
 * Async-signal flags: written from sig_handler(), polled from the main
 * loop.  volatile sig_atomic_t is the type the C standard guarantees to be
 * safe for this pattern (CERT SIG31-C); the previous plain bools were a
 * (benign in practice) data race and could legally be cached by the
 * compiler across the polling loop.
 */
static volatile sig_atomic_t done;
static volatile sig_atomic_t interrupted;

static void sig_handler(int sig)
{
	done = true;
	interrupted = sig == SIGINT;	/* SIGINT aborts, SIGCHLD just stops */
}
1056
1057 static size_t trace__fprintf_entry_head(struct trace *trace, struct thread *thread,
1058                                         u64 duration, bool duration_calculated, u64 tstamp, FILE *fp)
1059 {
1060         size_t printed = trace__fprintf_tstamp(trace, tstamp, fp);
1061         printed += fprintf_duration(duration, duration_calculated, fp);
1062
1063         if (trace->multiple_threads) {
1064                 if (trace->show_comm)
1065                         printed += fprintf(fp, "%.14s/", thread__comm_str(thread));
1066                 printed += fprintf(fp, "%d ", thread->tid);
1067         }
1068
1069         return printed;
1070 }
1071
1072 static int trace__process_event(struct trace *trace, struct machine *machine,
1073                                 union perf_event *event, struct perf_sample *sample)
1074 {
1075         int ret = 0;
1076
1077         switch (event->header.type) {
1078         case PERF_RECORD_LOST:
1079                 color_fprintf(trace->output, PERF_COLOR_RED,
1080                               "LOST %" PRIu64 " events!\n", event->lost.lost);
1081                 ret = machine__process_lost_event(machine, event, sample);
1082                 break;
1083         default:
1084                 ret = machine__process_event(machine, event, sample);
1085                 break;
1086         }
1087
1088         return ret;
1089 }
1090
1091 static int trace__tool_process(struct perf_tool *tool,
1092                                union perf_event *event,
1093                                struct perf_sample *sample,
1094                                struct machine *machine)
1095 {
1096         struct trace *trace = container_of(tool, struct trace, tool);
1097         return trace__process_event(trace, machine, event, sample);
1098 }
1099
1100 static char *trace__machine__resolve_kernel_addr(void *vmachine, unsigned long long *addrp, char **modp)
1101 {
1102         struct machine *machine = vmachine;
1103
1104         if (machine->kptr_restrict_warned)
1105                 return NULL;
1106
1107         if (symbol_conf.kptr_restrict) {
1108                 pr_warning("Kernel address maps (/proc/{kallsyms,modules}) are restricted.\n\n"
1109                            "Check /proc/sys/kernel/kptr_restrict.\n\n"
1110                            "Kernel samples will not be resolved.\n");
1111                 machine->kptr_restrict_warned = true;
1112                 return NULL;
1113         }
1114
1115         return machine__resolve_kernel_addr(vmachine, addrp, modp);
1116 }
1117
/*
 * Set up symbol resolution and a host machine object, then synthesize
 * PERF_RECORD_{COMM,MMAP,...} events for already-running threads so their
 * maps are known.  Returns 0 on success, a negative errno-style value on
 * failure.
 *
 * NOTE(review): on the -errno and synthesize-failure paths trace->host is
 * not torn down here -- presumably trace__symbols__exit() handles that on
 * the caller's error path; verify against callers.
 */
static int trace__symbols_init(struct trace *trace, struct perf_evlist *evlist)
{
	int err = symbol__init(NULL);

	if (err)
		return err;

	trace->host = machine__new_host();
	if (trace->host == NULL)
		return -ENOMEM;

	/* Route kernel address resolution through the kptr_restrict-aware wrapper */
	if (trace_event__register_resolver(trace->host, trace__machine__resolve_kernel_addr) < 0)
		return -errno;

	err = __machine__synthesize_threads(trace->host, &trace->tool, &trace->opts.target,
					    evlist->threads, trace__tool_process, false,
					    trace->opts.proc_map_timeout);
	if (err)
		symbol__exit();

	return err;
}
1140
1141 static void trace__symbols__exit(struct trace *trace)
1142 {
1143         machine__exit(trace->host);
1144         trace->host = NULL;
1145
1146         symbol__exit();
1147 }
1148
1149 static int syscall__alloc_arg_fmts(struct syscall *sc, int nr_args)
1150 {
1151         int idx;
1152
1153         if (nr_args == 6 && sc->fmt && sc->fmt->nr_args != 0)
1154                 nr_args = sc->fmt->nr_args;
1155
1156         sc->arg_fmt = calloc(nr_args, sizeof(*sc->arg_fmt));
1157         if (sc->arg_fmt == NULL)
1158                 return -1;
1159
1160         for (idx = 0; idx < nr_args; ++idx) {
1161                 if (sc->fmt)
1162                         sc->arg_fmt[idx] = sc->fmt->arg[idx];
1163         }
1164
1165         sc->nr_args = nr_args;
1166         return 0;
1167 }
1168
1169 static int syscall__set_arg_fmts(struct syscall *sc)
1170 {
1171         struct format_field *field;
1172         int idx = 0, len;
1173
1174         for (field = sc->args; field; field = field->next, ++idx) {
1175                 if (sc->fmt && sc->fmt->arg[idx].scnprintf)
1176                         continue;
1177
1178                 if (strcmp(field->type, "const char *") == 0 &&
1179                          (strcmp(field->name, "filename") == 0 ||
1180                           strcmp(field->name, "path") == 0 ||
1181                           strcmp(field->name, "pathname") == 0))
1182                         sc->arg_fmt[idx].scnprintf = SCA_FILENAME;
1183                 else if (field->flags & FIELD_IS_POINTER)
1184                         sc->arg_fmt[idx].scnprintf = syscall_arg__scnprintf_hex;
1185                 else if (strcmp(field->type, "pid_t") == 0)
1186                         sc->arg_fmt[idx].scnprintf = SCA_PID;
1187                 else if (strcmp(field->type, "umode_t") == 0)
1188                         sc->arg_fmt[idx].scnprintf = SCA_MODE_T;
1189                 else if ((strcmp(field->type, "int") == 0 ||
1190                           strcmp(field->type, "unsigned int") == 0 ||
1191                           strcmp(field->type, "long") == 0) &&
1192                          (len = strlen(field->name)) >= 2 &&
1193                          strcmp(field->name + len - 2, "fd") == 0) {
1194                         /*
1195                          * /sys/kernel/tracing/events/syscalls/sys_enter*
1196                          * egrep 'field:.*fd;' .../format|sed -r 's/.*field:([a-z ]+) [a-z_]*fd.+/\1/g'|sort|uniq -c
1197                          * 65 int
1198                          * 23 unsigned int
1199                          * 7 unsigned long
1200                          */
1201                         sc->arg_fmt[idx].scnprintf = SCA_FD;
1202                 }
1203         }
1204
1205         return 0;
1206 }
1207
/*
 * Lazily populate trace->syscalls.table[id]: grow the table if needed,
 * resolve the syscall name, find its pretty-printing overrides and read
 * the syscalls:sys_enter_<name> tracepoint format to derive per-argument
 * formatters.  Returns 0 on success, -1 on failure.
 */
static int trace__read_syscall_info(struct trace *trace, int id)
{
	char tp_name[128];
	struct syscall *sc;
	const char *name = syscalltbl__name(trace->sctbl, id);

	if (name == NULL)
		return -1;

	/* Grow the table up to 'id', zeroing the newly added slots */
	if (id > trace->syscalls.max) {
		struct syscall *nsyscalls = realloc(trace->syscalls.table, (id + 1) * sizeof(*sc));

		if (nsyscalls == NULL)
			return -1;

		if (trace->syscalls.max != -1) {
			memset(nsyscalls + trace->syscalls.max + 1, 0,
			       (id - trace->syscalls.max) * sizeof(*sc));
		} else {
			/* first allocation: the whole table is new */
			memset(nsyscalls, 0, (id + 1) * sizeof(*sc));
		}

		trace->syscalls.table = nsyscalls;
		trace->syscalls.max   = id;
	}

	sc = trace->syscalls.table + id;
	sc->name = name;

	sc->fmt  = syscall_fmt__find(sc->name);

	snprintf(tp_name, sizeof(tp_name), "sys_enter_%s", sc->name);
	sc->tp_format = trace_event__tp_format("syscalls", tp_name);

	/* Some syscalls' tracepoints use the alias, e.g. stat -> newstat */
	if (IS_ERR(sc->tp_format) && sc->fmt && sc->fmt->alias) {
		snprintf(tp_name, sizeof(tp_name), "sys_enter_%s", sc->fmt->alias);
		sc->tp_format = trace_event__tp_format("syscalls", tp_name);
	}

	/*
	 * Allocate arg_fmt[] even when the format couldn't be read (6 is the
	 * generic fallback arg count), then bail if it really wasn't there.
	 */
	if (syscall__alloc_arg_fmts(sc, IS_ERR(sc->tp_format) ? 6 : sc->tp_format->format.nr_fields))
		return -1;

	if (IS_ERR(sc->tp_format))
		return -1;

	sc->args = sc->tp_format->format.fields;
	/*
	 * We need to check and discard the first variable '__syscall_nr'
	 * or 'nr' that mean the syscall number. It is needless here.
	 * So drop '__syscall_nr' or 'nr' field but does not exist on older kernels.
	 */
	if (sc->args && (!strcmp(sc->args->name, "__syscall_nr") || !strcmp(sc->args->name, "nr"))) {
		sc->args = sc->args->next;
		--sc->nr_args;
	}

	sc->is_exit = !strcmp(name, "exit_group") || !strcmp(name, "exit");

	return syscall__set_arg_fmts(sc);
}
1268
/*
 * Translate the -e/--expr strlist of syscall names (globs allowed) into
 * trace->ev_qualifier_ids.entries[], reporting every invalid name.
 * Returns 0 on success, -EINVAL on invalid names, -ENOMEM on allocation
 * failure; on error the ids array is freed and nr reset to 0.
 */
static int trace__validate_ev_qualifier(struct trace *trace)
{
	int err = 0, i;
	size_t nr_allocated;
	struct str_node *pos;

	/* Initial guess: one id per name; globs may force a realloc below */
	trace->ev_qualifier_ids.nr = strlist__nr_entries(trace->ev_qualifier);
	trace->ev_qualifier_ids.entries = malloc(trace->ev_qualifier_ids.nr *
						 sizeof(trace->ev_qualifier_ids.entries[0]));

	if (trace->ev_qualifier_ids.entries == NULL) {
		fputs("Error:\tNot enough memory for allocating events qualifier ids\n",
		       trace->output);
		err = -EINVAL;
		goto out;
	}

	nr_allocated = trace->ev_qualifier_ids.nr;
	i = 0;

	strlist__for_each_entry(pos, trace->ev_qualifier) {
		const char *sc = pos->s;
		int id = syscalltbl__id(trace->sctbl, sc), match_next = -1;

		if (id < 0) {
			/* not an exact name: try it as a glob */
			id = syscalltbl__strglobmatch_first(trace->sctbl, sc, &match_next);
			if (id >= 0)
				goto matches;

			/* keep scanning so all bad names end up in one message */
			if (err == 0) {
				fputs("Error:\tInvalid syscall ", trace->output);
				err = -EINVAL;
			} else {
				fputs(", ", trace->output);
			}

			fputs(sc, trace->output);
		}
matches:
		trace->ev_qualifier_ids.entries[i++] = id;
		if (match_next == -1)
			continue;

		/* A glob may match further syscalls; collect them all */
		while (1) {
			id = syscalltbl__strglobmatch_next(trace->sctbl, sc, &match_next);
			if (id < 0)
				break;
			if (nr_allocated == trace->ev_qualifier_ids.nr) {
				void *entries;

				nr_allocated += 8;
				entries = realloc(trace->ev_qualifier_ids.entries,
						  nr_allocated * sizeof(trace->ev_qualifier_ids.entries[0]));
				if (entries == NULL) {
					err = -ENOMEM;
					fputs("\nError:\t Not enough memory for parsing\n", trace->output);
					goto out_free;
				}
				trace->ev_qualifier_ids.entries = entries;
			}
			trace->ev_qualifier_ids.nr++;
			trace->ev_qualifier_ids.entries[i++] = id;
		}
	}

	if (err < 0) {
		fputs("\nHint:\ttry 'perf list syscalls:sys_enter_*'"
		      "\nHint:\tand: 'man syscalls'\n", trace->output);
out_free:
		zfree(&trace->ev_qualifier_ids.entries);
		trace->ev_qualifier_ids.nr = 0;
	}
out:
	return err;
}
1344
1345 /*
1346  * args is to be interpreted as a series of longs but we need to handle
1347  * 8-byte unaligned accesses. args points to raw_data within the event
1348  * and raw_data is guaranteed to be 8-byte unaligned because it is
1349  * preceded by raw_size which is a u32. So we need to copy args to a temp
1350  * variable to read it. Most notably this avoids extended load instructions
1351  * on unaligned addresses
1352  */
1353 unsigned long syscall_arg__val(struct syscall_arg *arg, u8 idx)
1354 {
1355         unsigned long val;
1356         unsigned char *p = arg->args + sizeof(unsigned long) * idx;
1357
1358         memcpy(&val, p, sizeof(val));
1359         return val;
1360 }
1361
1362 static size_t syscall__scnprintf_name(struct syscall *sc, char *bf, size_t size,
1363                                       struct syscall_arg *arg)
1364 {
1365         if (sc->arg_fmt && sc->arg_fmt[arg->idx].name)
1366                 return scnprintf(bf, size, "%s: ", sc->arg_fmt[arg->idx].name);
1367
1368         return scnprintf(bf, size, "arg%d: ", arg->idx);
1369 }
1370
1371 static size_t syscall__scnprintf_val(struct syscall *sc, char *bf, size_t size,
1372                                      struct syscall_arg *arg, unsigned long val)
1373 {
1374         if (sc->arg_fmt && sc->arg_fmt[arg->idx].scnprintf) {
1375                 arg->val = val;
1376                 if (sc->arg_fmt[arg->idx].parm)
1377                         arg->parm = sc->arg_fmt[arg->idx].parm;
1378                 return sc->arg_fmt[arg->idx].scnprintf(bf, size, arg);
1379         }
1380         return scnprintf(bf, size, "%ld", val);
1381 }
1382
/*
 * Format all of a syscall's arguments into 'bf' as "name: value, ...".
 * Arguments already consumed by another formatter (arg.mask bits) and
 * zero-valued arguments without a strarray mapping are suppressed.
 * Returns the number of characters written.
 */
static size_t syscall__scnprintf_args(struct syscall *sc, char *bf, size_t size,
				      unsigned char *args, struct trace *trace,
				      struct thread *thread)
{
	size_t printed = 0;
	unsigned long val;
	u8 bit = 1;	/* bit for the current arg in arg.mask */
	struct syscall_arg arg = {
		.args	= args,
		.idx	= 0,
		.mask	= 0,
		.trace	= trace,
		.thread = thread,
	};
	struct thread_trace *ttrace = thread__priv(thread);

	/*
	 * Things like fcntl will set this in its 'cmd' formatter to pick the
	 * right formatter for the return value (an fd? file flags?), which is
	 * not needed for syscalls that always return a given type, say an fd.
	 */
	ttrace->ret_scnprintf = NULL;

	if (sc->args != NULL) {
		/* tracepoint format available: walk its fields for the names */
		struct format_field *field;

		for (field = sc->args; field;
		     field = field->next, ++arg.idx, bit <<= 1) {
			if (arg.mask & bit)
				continue;

			val = syscall_arg__val(&arg, arg.idx);

			/*
			 * Suppress this argument if its value is zero and
			 * and we don't have a string associated in an
			 * strarray for it.
			 */
			if (val == 0 &&
			    !(sc->arg_fmt &&
			      (sc->arg_fmt[arg.idx].show_zero ||
			       sc->arg_fmt[arg.idx].scnprintf == SCA_STRARRAY ||
			       sc->arg_fmt[arg.idx].scnprintf == SCA_STRARRAYS) &&
			      sc->arg_fmt[arg.idx].parm))
				continue;

			printed += scnprintf(bf + printed, size - printed,
					     "%s%s: ", printed ? ", " : "", field->name);
			printed += syscall__scnprintf_val(sc, bf + printed, size - printed, &arg, val);
		}
	} else if (IS_ERR(sc->tp_format)) {
		/*
		 * If we managed to read the tracepoint /format file, then we
		 * may end up not having any args, like with gettid(), so only
		 * print the raw args when we didn't manage to read it.
		 */
		while (arg.idx < sc->nr_args) {
			if (arg.mask & bit)
				goto next_arg;
			val = syscall_arg__val(&arg, arg.idx);
			if (printed)
				printed += scnprintf(bf + printed, size - printed, ", ");
			printed += syscall__scnprintf_name(sc, bf + printed, size - printed, &arg);
			printed += syscall__scnprintf_val(sc, bf + printed, size - printed, &arg, val);
next_arg:
			++arg.idx;
			bit <<= 1;
		}
	}

	return printed;
}
1455
/* Per-tracepoint sample handler, e.g. trace__sys_enter()/trace__sys_exit(). */
typedef int (*tracepoint_handler)(struct trace *trace, struct perf_evsel *evsel,
				  union perf_event *event,
				  struct perf_sample *sample);
1459
/*
 * Return the (lazily filled) syscall table entry for 'id', or NULL when
 * the id is invalid or its info can't be read (a diagnostic is printed,
 * depending on the verbosity level).
 */
static struct syscall *trace__syscall_info(struct trace *trace,
					   struct perf_evsel *evsel, int id)
{

	if (id < 0) {

		/*
		 * XXX: Noticed on x86_64, reproduced as far back as 3.0.36, haven't tried
		 * before that, leaving at a higher verbosity level till that is
		 * explained. Reproduced with plain ftrace with:
		 *
		 * echo 1 > /t/events/raw_syscalls/sys_exit/enable
		 * grep "NR -1 " /t/trace_pipe
		 *
		 * After generating some load on the machine.
		 */
		if (verbose > 1) {
			static u64 n;
			fprintf(trace->output, "Invalid syscall %d id, skipping (%s, %" PRIu64 ") ...\n",
				id, perf_evsel__name(evsel), ++n);
		}
		return NULL;
	}

	/* Not seen before (or table too small): read its info now */
	if ((id > trace->syscalls.max || trace->syscalls.table[id].name == NULL) &&
	    trace__read_syscall_info(trace, id))
		goto out_cant_read;

	/* Re-check: trace__read_syscall_info() may have failed partway */
	if ((id > trace->syscalls.max || trace->syscalls.table[id].name == NULL))
		goto out_cant_read;

	return &trace->syscalls.table[id];

out_cant_read:
	if (verbose > 0) {
		fprintf(trace->output, "Problems reading syscall %d", id);
		if (id <= trace->syscalls.max && trace->syscalls.table[id].name != NULL)
			fprintf(trace->output, "(%s)", trace->syscalls.table[id].name);
		fputs(" information\n", trace->output);
	}
	return NULL;
}
1502
1503 static void thread__update_stats(struct thread_trace *ttrace,
1504                                  int id, struct perf_sample *sample)
1505 {
1506         struct int_node *inode;
1507         struct stats *stats;
1508         u64 duration = 0;
1509
1510         inode = intlist__findnew(ttrace->syscall_stats, id);
1511         if (inode == NULL)
1512                 return;
1513
1514         stats = inode->priv;
1515         if (stats == NULL) {
1516                 stats = malloc(sizeof(struct stats));
1517                 if (stats == NULL)
1518                         return;
1519                 init_stats(stats);
1520                 inode->priv = stats;
1521         }
1522
1523         if (ttrace->entry_time && sample->time > ttrace->entry_time)
1524                 duration = sample->time - ttrace->entry_time;
1525
1526         update_stats(stats, duration);
1527 }
1528
1529 static int trace__printf_interrupted_entry(struct trace *trace, struct perf_sample *sample)
1530 {
1531         struct thread_trace *ttrace;
1532         u64 duration;
1533         size_t printed;
1534
1535         if (trace->current == NULL)
1536                 return 0;
1537
1538         ttrace = thread__priv(trace->current);
1539
1540         if (!ttrace->entry_pending)
1541                 return 0;
1542
1543         duration = sample->time - ttrace->entry_time;
1544
1545         printed  = trace__fprintf_entry_head(trace, trace->current, duration, true, ttrace->entry_time, trace->output);
1546         printed += fprintf(trace->output, "%-70s) ...\n", ttrace->entry_str);
1547         ttrace->entry_pending = false;
1548
1549         return printed;
1550 }
1551
/*
 * raw_syscalls:sys_enter handler: format the syscall name and arguments
 * into the thread's entry_str.  For exit()/exit_group() (no sys_exit will
 * follow) the line is printed immediately; otherwise it is left pending
 * for trace__sys_exit() to complete with the return value.
 * Returns 0 on success, -1 on failure.
 */
static int trace__sys_enter(struct trace *trace, struct perf_evsel *evsel,
			    union perf_event *event __maybe_unused,
			    struct perf_sample *sample)
{
	char *msg;
	void *args;
	size_t printed = 0;
	struct thread *thread;
	int id = perf_evsel__sc_tp_uint(evsel, id, sample), err = -1;
	struct syscall *sc = trace__syscall_info(trace, evsel, id);
	struct thread_trace *ttrace;

	if (sc == NULL)
		return -1;

	thread = machine__findnew_thread(trace->host, sample->pid, sample->tid);
	ttrace = thread__trace(thread, trace->output);
	if (ttrace == NULL)
		goto out_put;

	args = perf_evsel__sc_tp_ptr(evsel, args, sample);

	/* Lazily allocate the buffer the formatted entry accumulates into */
	if (ttrace->entry_str == NULL) {
		ttrace->entry_str = malloc(trace__entry_str_size);
		if (!ttrace->entry_str)
			goto out_put;
	}

	/* Close any still-open line from another thread first */
	if (!(trace->duration_filter || trace->summary_only || trace->min_stack))
		trace__printf_interrupted_entry(trace, sample);

	ttrace->entry_time = sample->time;
	msg = ttrace->entry_str;
	printed += scnprintf(msg + printed, trace__entry_str_size - printed, "%s(", sc->name);

	printed += syscall__scnprintf_args(sc, msg + printed, trace__entry_str_size - printed,
					   args, trace, thread);

	if (sc->is_exit) {
		/* No sys_exit will come: print the completed line right away */
		if (!(trace->duration_filter || trace->summary_only || trace->min_stack)) {
			trace__fprintf_entry_head(trace, thread, 0, false, ttrace->entry_time, trace->output);
			fprintf(trace->output, "%-70s)\n", ttrace->entry_str);
		}
	} else {
		ttrace->entry_pending = true;
		/* See trace__vfs_getname & trace__sys_exit */
		ttrace->filename.pending_open = false;
	}

	/* Track the current thread so interrupted entries can be detected */
	if (trace->current != thread) {
		thread__put(trace->current);
		trace->current = thread__get(thread);
	}
	err = 0;
out_put:
	thread__put(thread);
	return err;
}
1610
1611 static int trace__resolve_callchain(struct trace *trace, struct perf_evsel *evsel,
1612                                     struct perf_sample *sample,
1613                                     struct callchain_cursor *cursor)
1614 {
1615         struct addr_location al;
1616
1617         if (machine__resolve(trace->host, &al, sample) < 0 ||
1618             thread__resolve_callchain(al.thread, cursor, evsel, sample, NULL, NULL, trace->max_stack))
1619                 return -1;
1620
1621         return 0;
1622 }
1623
1624 static int trace__fprintf_callchain(struct trace *trace, struct perf_sample *sample)
1625 {
1626         /* TODO: user-configurable print_opts */
1627         const unsigned int print_opts = EVSEL__PRINT_SYM |
1628                                         EVSEL__PRINT_DSO |
1629                                         EVSEL__PRINT_UNKNOWN_AS_ADDR;
1630
1631         return sample__fprintf_callchain(sample, 38, print_opts, &callchain_cursor, trace->output);
1632 }
1633
/*
 * raw_syscalls:sys_exit handler: complete the line started by
 * trace__sys_enter with the duration and a formatted return value,
 * then print it.  If the entry half was never seen (we attached while
 * the syscall was in flight), a "... [continued]" marker is printed
 * instead.  Returns 0 on success, -1 on failure.
 */
static int trace__sys_exit(struct trace *trace, struct perf_evsel *evsel,
			   union perf_event *event __maybe_unused,
			   struct perf_sample *sample)
{
	long ret;
	u64 duration = 0;
	bool duration_calculated = false;
	struct thread *thread;
	int id = perf_evsel__sc_tp_uint(evsel, id, sample), err = -1, callchain_ret = 0;
	struct syscall *sc = trace__syscall_info(trace, evsel, id);
	struct thread_trace *ttrace;

	if (sc == NULL)
		return -1;

	thread = machine__findnew_thread(trace->host, sample->pid, sample->tid);
	ttrace = thread__trace(thread, trace->output);
	if (ttrace == NULL)
		goto out_put;

	if (trace->summary)
		thread__update_stats(ttrace, id, sample);

	ret = perf_evsel__sc_tp_uint(evsel, ret, sample);

	/*
	 * Successful open with a pathname captured by probe:vfs_getname:
	 * record the fd -> pathname mapping for later fd beautifying.
	 */
	if (id == trace->open_id && ret >= 0 && ttrace->filename.pending_open) {
		trace__set_fd_pathname(thread, ret, ttrace->filename.name);
		ttrace->filename.pending_open = false;
		++trace->stats.vfs_getname;
	}

	if (ttrace->entry_time) {
		duration = sample->time - ttrace->entry_time;
		if (trace__filter_duration(trace, duration))
			goto out;
		duration_calculated = true;
	} else if (trace->duration_filter)
		goto out; /* no entry timestamp, duration filter can't be applied */

	if (sample->callchain) {
		callchain_ret = trace__resolve_callchain(trace, evsel, sample, &callchain_cursor);
		if (callchain_ret == 0) {
			/* Shallower than --min-stack: skip this event entirely. */
			if (callchain_cursor.nr < trace->min_stack)
				goto out;
			callchain_ret = 1;
		}
	}

	if (trace->summary_only)
		goto out;

	trace__fprintf_entry_head(trace, thread, duration, duration_calculated, ttrace->entry_time, trace->output);

	if (ttrace->entry_pending) {
		fprintf(trace->output, "%-70s", ttrace->entry_str);
	} else {
		/* Entry half never recorded: we attached mid-syscall. */
		fprintf(trace->output, " ... [");
		color_fprintf(trace->output, PERF_COLOR_YELLOW, "continued");
		fprintf(trace->output, "]: %s()", sc->name);
	}

	/*
	 * Return value formatting.  Note the cross-branch gotos:
	 * signed_print doubles as the fallback for syscalls that have a
	 * format descriptor but no special return handling, and
	 * errno_print is shared by the no-descriptor case when ret < 0.
	 */
	if (sc->fmt == NULL) {
		if (ret < 0)
			goto errno_print;
signed_print:
		fprintf(trace->output, ") = %ld", ret);
	} else if (ret < 0) {
errno_print: {
		char bf[STRERR_BUFSIZE];
		const char *emsg = str_error_r(-ret, bf, sizeof(bf)),
			   *e = audit_errno_to_name(-ret);

		fprintf(trace->output, ") = -1 %s %s", e, emsg);
	}
	} else if (ret == 0 && sc->fmt->timeout)
		fprintf(trace->output, ") = 0 Timeout");
	else if (ttrace->ret_scnprintf) {
		/* One-shot return beautifier installed during argument formatting. */
		char bf[1024];
		struct syscall_arg arg = {
			.val	= ret,
			.thread	= thread,
			.trace	= trace,
		};
		ttrace->ret_scnprintf(bf, sizeof(bf), &arg);
		ttrace->ret_scnprintf = NULL;
		fprintf(trace->output, ") = %s", bf);
	} else if (sc->fmt->hexret)
		fprintf(trace->output, ") = %#lx", ret);
	else if (sc->fmt->errpid) {
		/* Syscalls returning a pid: append the child's comm when known. */
		struct thread *child = machine__find_thread(trace->host, ret, ret);

		if (child != NULL) {
			fprintf(trace->output, ") = %ld", ret);
			if (child->comm_set)
				fprintf(trace->output, " (%s)", thread__comm_str(child));
			thread__put(child);
		}
	} else
		goto signed_print;

	fputc('\n', trace->output);

	if (callchain_ret > 0)
		trace__fprintf_callchain(trace, sample);
	else if (callchain_ret < 0)
		pr_err("Problem processing %s callchain, skipping...\n", perf_evsel__name(evsel));
out:
	ttrace->entry_pending = false;
	err = 0;
out_put:
	thread__put(thread);
	return err;
}
1747
/*
 * probe:vfs_getname handler: capture the pathname being resolved and
 * splice it into the pending entry string at the position where the
 * syscall argument beautifier left a raw pointer, so the user sees the
 * string rather than an address.  A copy is also kept in
 * ttrace->filename.name for the fd -> pathname bookkeeping done in
 * trace__sys_exit.  Always returns 0.
 */
static int trace__vfs_getname(struct trace *trace, struct perf_evsel *evsel,
			      union perf_event *event __maybe_unused,
			      struct perf_sample *sample)
{
	struct thread *thread = machine__findnew_thread(trace->host, sample->pid, sample->tid);
	struct thread_trace *ttrace;
	size_t filename_len, entry_str_len, to_move;
	ssize_t remaining_space;
	char *pos;
	const char *filename = perf_evsel__rawptr(evsel, sample, "pathname");

	if (!thread)
		goto out;

	ttrace = thread__priv(thread);
	if (!ttrace)
		goto out_put;

	filename_len = strlen(filename);
	if (filename_len == 0)
		goto out_put;

	/* Grow the per-thread copy of the pathname when needed. */
	if (ttrace->filename.namelen < filename_len) {
		char *f = realloc(ttrace->filename.name, filename_len + 1);

		if (f == NULL)
			goto out_put;

		ttrace->filename.namelen = filename_len;
		ttrace->filename.name = f;
	}

	strcpy(ttrace->filename.name, filename);
	ttrace->filename.pending_open = true;

	/* No pointer argument is waiting to be beautified in entry_str. */
	if (!ttrace->filename.ptr)
		goto out_put;

	/*
	 * NOTE(review): assumes entry_str is non-NULL whenever filename.ptr
	 * is set (i.e. the beautifier only records a pending pointer while
	 * formatting into entry_str) - confirm against the arg beautifier.
	 */
	entry_str_len = strlen(ttrace->entry_str);
	remaining_space = trace__entry_str_size - entry_str_len - 1; /* \0 */
	if (remaining_space <= 0)
		goto out_put;

	/* Keep the tail of an overlong pathname - the most specific part. */
	if (filename_len > (size_t)remaining_space) {
		filename += filename_len - remaining_space;
		filename_len = remaining_space;
	}

	/* Open a gap at the recorded position and insert the pathname. */
	to_move = entry_str_len - ttrace->filename.entry_str_pos + 1; /* \0 */
	pos = ttrace->entry_str + ttrace->filename.entry_str_pos;
	memmove(pos + filename_len, pos, to_move);
	memcpy(pos, filename, filename_len);

	ttrace->filename.ptr = 0;
	ttrace->filename.entry_str_pos = 0;
out_put:
	thread__put(thread);
out:
	return 0;
}
1808
1809 static int trace__sched_stat_runtime(struct trace *trace, struct perf_evsel *evsel,
1810                                      union perf_event *event __maybe_unused,
1811                                      struct perf_sample *sample)
1812 {
1813         u64 runtime = perf_evsel__intval(evsel, sample, "runtime");
1814         double runtime_ms = (double)runtime / NSEC_PER_MSEC;
1815         struct thread *thread = machine__findnew_thread(trace->host,
1816                                                         sample->pid,
1817                                                         sample->tid);
1818         struct thread_trace *ttrace = thread__trace(thread, trace->output);
1819
1820         if (ttrace == NULL)
1821                 goto out_dump;
1822
1823         ttrace->runtime_ms += runtime_ms;
1824         trace->runtime_ms += runtime_ms;
1825 out_put:
1826         thread__put(thread);
1827         return 0;
1828
1829 out_dump:
1830         fprintf(trace->output, "%s: comm=%s,pid=%u,runtime=%" PRIu64 ",vruntime=%" PRIu64 ")\n",
1831                evsel->name,
1832                perf_evsel__strval(evsel, sample, "comm"),
1833                (pid_t)perf_evsel__intval(evsel, sample, "pid"),
1834                runtime,
1835                perf_evsel__intval(evsel, sample, "vruntime"));
1836         goto out_put;
1837 }
1838
1839 static void bpf_output__printer(enum binary_printer_ops op,
1840                                 unsigned int val, void *extra)
1841 {
1842         FILE *output = extra;
1843         unsigned char ch = (unsigned char)val;
1844
1845         switch (op) {
1846         case BINARY_PRINT_CHAR_DATA:
1847                 fprintf(output, "%c", isprint(ch) ? ch : '.');
1848                 break;
1849         case BINARY_PRINT_DATA_BEGIN:
1850         case BINARY_PRINT_LINE_BEGIN:
1851         case BINARY_PRINT_ADDR:
1852         case BINARY_PRINT_NUM_DATA:
1853         case BINARY_PRINT_NUM_PAD:
1854         case BINARY_PRINT_SEP:
1855         case BINARY_PRINT_CHAR_PAD:
1856         case BINARY_PRINT_LINE_END:
1857         case BINARY_PRINT_DATA_END:
1858         default:
1859                 break;
1860         }
1861 }
1862
1863 static void bpf_output__fprintf(struct trace *trace,
1864                                 struct perf_sample *sample)
1865 {
1866         print_binary(sample->raw_data, sample->raw_size, 8,
1867                      bpf_output__printer, trace->output);
1868 }
1869
1870 static int trace__event_handler(struct trace *trace, struct perf_evsel *evsel,
1871                                 union perf_event *event __maybe_unused,
1872                                 struct perf_sample *sample)
1873 {
1874         int callchain_ret = 0;
1875
1876         if (sample->callchain) {
1877                 callchain_ret = trace__resolve_callchain(trace, evsel, sample, &callchain_cursor);
1878                 if (callchain_ret == 0) {
1879                         if (callchain_cursor.nr < trace->min_stack)
1880                                 goto out;
1881                         callchain_ret = 1;
1882                 }
1883         }
1884
1885         trace__printf_interrupted_entry(trace, sample);
1886         trace__fprintf_tstamp(trace, sample->time, trace->output);
1887
1888         if (trace->trace_syscalls)
1889                 fprintf(trace->output, "(         ): ");
1890
1891         fprintf(trace->output, "%s:", evsel->name);
1892
1893         if (perf_evsel__is_bpf_output(evsel)) {
1894                 bpf_output__fprintf(trace, sample);
1895         } else if (evsel->tp_format) {
1896                 event_format__fprintf(evsel->tp_format, sample->cpu,
1897                                       sample->raw_data, sample->raw_size,
1898                                       trace->output);
1899         }
1900
1901         fprintf(trace->output, ")\n");
1902
1903         if (callchain_ret > 0)
1904                 trace__fprintf_callchain(trace, sample);
1905         else if (callchain_ret < 0)
1906                 pr_err("Problem processing %s callchain, skipping...\n", perf_evsel__name(evsel));
1907 out:
1908         return 0;
1909 }
1910
1911 static void print_location(FILE *f, struct perf_sample *sample,
1912                            struct addr_location *al,
1913                            bool print_dso, bool print_sym)
1914 {
1915
1916         if ((verbose > 0 || print_dso) && al->map)
1917                 fprintf(f, "%s@", al->map->dso->long_name);
1918
1919         if ((verbose > 0 || print_sym) && al->sym)
1920                 fprintf(f, "%s+0x%" PRIx64, al->sym->name,
1921                         al->addr - al->sym->start);
1922         else if (al->map)
1923                 fprintf(f, "0x%" PRIx64, al->addr);
1924         else
1925                 fprintf(f, "0x%" PRIx64, sample->addr);
1926 }
1927
/*
 * Software page-fault sample handler (major and minor): count the
 * fault per thread and, unless --summary-only, print a line with the
 * faulting code location and the target data address.  Returns 0 on
 * success, -1 when per-thread state can't be set up.
 */
static int trace__pgfault(struct trace *trace,
			  struct perf_evsel *evsel,
			  union perf_event *event __maybe_unused,
			  struct perf_sample *sample)
{
	struct thread *thread;
	struct addr_location al;
	char map_type = 'd';	/* 'd'ata, 'x' code, '?' unresolved */
	struct thread_trace *ttrace;
	int err = -1;
	int callchain_ret = 0;

	thread = machine__findnew_thread(trace->host, sample->pid, sample->tid);

	if (sample->callchain) {
		callchain_ret = trace__resolve_callchain(trace, evsel, sample, &callchain_cursor);
		if (callchain_ret == 0) {
			/* Shallower than --min-stack: skip this sample. */
			if (callchain_cursor.nr < trace->min_stack)
				goto out_put;
			callchain_ret = 1;
		}
	}

	ttrace = thread__trace(thread, trace->output);
	if (ttrace == NULL)
		goto out_put;

	if (evsel->attr.config == PERF_COUNT_SW_PAGE_FAULTS_MAJ)
		ttrace->pfmaj++;
	else
		ttrace->pfmin++;

	if (trace->summary_only)
		goto out;

	/* Resolve the faulting instruction address... */
	thread__find_addr_location(thread, sample->cpumode, MAP__FUNCTION,
			      sample->ip, &al);

	trace__fprintf_entry_head(trace, thread, 0, true, sample->time, trace->output);

	fprintf(trace->output, "%sfault [",
		evsel->attr.config == PERF_COUNT_SW_PAGE_FAULTS_MAJ ?
		"maj" : "min");

	print_location(trace->output, sample, &al, false, true);

	fprintf(trace->output, "] => ");

	/* ...then the data address that was being accessed. */
	thread__find_addr_location(thread, sample->cpumode, MAP__VARIABLE,
				   sample->addr, &al);

	if (!al.map) {
		/* Not in a variable map: retry against the function maps. */
		thread__find_addr_location(thread, sample->cpumode,
					   MAP__FUNCTION, sample->addr, &al);

		if (al.map)
			map_type = 'x';
		else
			map_type = '?';
	}

	print_location(trace->output, sample, &al, true, false);

	fprintf(trace->output, " (%c%c)\n", map_type, al.level);

	if (callchain_ret > 0)
		trace__fprintf_callchain(trace, sample);
	else if (callchain_ret < 0)
		pr_err("Problem processing %s callchain, skipping...\n", perf_evsel__name(evsel));
out:
	err = 0;
out_put:
	thread__put(thread);
	return err;
}
2003
2004 static void trace__set_base_time(struct trace *trace,
2005                                  struct perf_evsel *evsel,
2006                                  struct perf_sample *sample)
2007 {
2008         /*
2009          * BPF events were not setting PERF_SAMPLE_TIME, so be more robust
2010          * and don't use sample->time unconditionally, we may end up having
2011          * some other event in the future without PERF_SAMPLE_TIME for good
2012          * reason, i.e. we may not be interested in its timestamps, just in
2013          * it taking place, picking some piece of information when it
2014          * appears in our event stream (vfs_getname comes to mind).
2015          */
2016         if (trace->base_time == 0 && !trace->full_time &&
2017             (evsel->attr.sample_type & PERF_SAMPLE_TIME))
2018                 trace->base_time = sample->time;
2019 }
2020
2021 static int trace__process_sample(struct perf_tool *tool,
2022                                  union perf_event *event,
2023                                  struct perf_sample *sample,
2024                                  struct perf_evsel *evsel,
2025                                  struct machine *machine __maybe_unused)
2026 {
2027         struct trace *trace = container_of(tool, struct trace, tool);
2028         struct thread *thread;
2029         int err = 0;
2030
2031         tracepoint_handler handler = evsel->handler;
2032
2033         thread = machine__findnew_thread(trace->host, sample->pid, sample->tid);
2034         if (thread && thread__is_filtered(thread))
2035                 goto out;
2036
2037         trace__set_base_time(trace, evsel, sample);
2038
2039         if (handler) {
2040                 ++trace->nr_events;
2041                 handler(trace, evsel, event, sample);
2042         }
2043 out:
2044         thread__put(thread);
2045         return err;
2046 }
2047
2048 static int trace__record(struct trace *trace, int argc, const char **argv)
2049 {
2050         unsigned int rec_argc, i, j;
2051         const char **rec_argv;
2052         const char * const record_args[] = {
2053                 "record",
2054                 "-R",
2055                 "-m", "1024",
2056                 "-c", "1",
2057         };
2058
2059         const char * const sc_args[] = { "-e", };
2060         unsigned int sc_args_nr = ARRAY_SIZE(sc_args);
2061         const char * const majpf_args[] = { "-e", "major-faults" };
2062         unsigned int majpf_args_nr = ARRAY_SIZE(majpf_args);
2063         const char * const minpf_args[] = { "-e", "minor-faults" };
2064         unsigned int minpf_args_nr = ARRAY_SIZE(minpf_args);
2065
2066         /* +1 is for the event string below */
2067         rec_argc = ARRAY_SIZE(record_args) + sc_args_nr + 1 +
2068                 majpf_args_nr + minpf_args_nr + argc;
2069         rec_argv = calloc(rec_argc + 1, sizeof(char *));
2070
2071         if (rec_argv == NULL)
2072                 return -ENOMEM;
2073
2074         j = 0;
2075         for (i = 0; i < ARRAY_SIZE(record_args); i++)
2076                 rec_argv[j++] = record_args[i];
2077
2078         if (trace->trace_syscalls) {
2079                 for (i = 0; i < sc_args_nr; i++)
2080                         rec_argv[j++] = sc_args[i];
2081
2082                 /* event string may be different for older kernels - e.g., RHEL6 */
2083                 if (is_valid_tracepoint("raw_syscalls:sys_enter"))
2084                         rec_argv[j++] = "raw_syscalls:sys_enter,raw_syscalls:sys_exit";
2085                 else if (is_valid_tracepoint("syscalls:sys_enter"))
2086                         rec_argv[j++] = "syscalls:sys_enter,syscalls:sys_exit";
2087                 else {
2088                         pr_err("Neither raw_syscalls nor syscalls events exist.\n");
2089                         free(rec_argv);
2090                         return -1;
2091                 }
2092         }
2093
2094         if (trace->trace_pgfaults & TRACE_PFMAJ)
2095                 for (i = 0; i < majpf_args_nr; i++)
2096                         rec_argv[j++] = majpf_args[i];
2097
2098         if (trace->trace_pgfaults & TRACE_PFMIN)
2099                 for (i = 0; i < minpf_args_nr; i++)
2100                         rec_argv[j++] = minpf_args[i];
2101
2102         for (i = 0; i < (unsigned int)argc; i++)
2103                 rec_argv[j++] = argv[i];
2104
2105         return cmd_record(j, rec_argv);
2106 }
2107
2108 static size_t trace__fprintf_thread_summary(struct trace *trace, FILE *fp);
2109
2110 static bool perf_evlist__add_vfs_getname(struct perf_evlist *evlist)
2111 {
2112         bool found = false;
2113         struct perf_evsel *evsel, *tmp;
2114         struct parse_events_error err = { .idx = 0, };
2115         int ret = parse_events(evlist, "probe:vfs_getname*", &err);
2116
2117         if (ret)
2118                 return false;
2119
2120         evlist__for_each_entry_safe(evlist, evsel, tmp) {
2121                 if (!strstarts(perf_evsel__name(evsel), "probe:vfs_getname"))
2122                         continue;
2123
2124                 if (perf_evsel__field(evsel, "pathname")) {
2125                         evsel->handler = trace__vfs_getname;
2126                         found = true;
2127                         continue;
2128                 }
2129
2130                 list_del_init(&evsel->node);
2131                 evsel->evlist = NULL;
2132                 perf_evsel__delete(evsel);
2133         }
2134
2135         return found;
2136 }
2137
2138 static struct perf_evsel *perf_evsel__new_pgfault(u64 config)
2139 {
2140         struct perf_evsel *evsel;
2141         struct perf_event_attr attr = {
2142                 .type = PERF_TYPE_SOFTWARE,
2143                 .mmap_data = 1,
2144         };
2145
2146         attr.config = config;
2147         attr.sample_period = 1;
2148
2149         event_attr_init(&attr);
2150
2151         evsel = perf_evsel__new(&attr);
2152         if (evsel)
2153                 evsel->handler = trace__pgfault;
2154
2155         return evsel;
2156 }
2157
2158 static void trace__handle_event(struct trace *trace, union perf_event *event, struct perf_sample *sample)
2159 {
2160         const u32 type = event->header.type;
2161         struct perf_evsel *evsel;
2162
2163         if (type != PERF_RECORD_SAMPLE) {
2164                 trace__process_event(trace, trace->host, event, sample);
2165                 return;
2166         }
2167
2168         evsel = perf_evlist__id2evsel(trace->evlist, sample->id);
2169         if (evsel == NULL) {
2170                 fprintf(trace->output, "Unknown tp ID %" PRIu64 ", skipping...\n", sample->id);
2171                 return;
2172         }
2173
2174         trace__set_base_time(trace, evsel, sample);
2175
2176         if (evsel->attr.type == PERF_TYPE_TRACEPOINT &&
2177             sample->raw_data == NULL) {
2178                 fprintf(trace->output, "%s sample with no payload for tid: %d, cpu %d, raw_size=%d, skipping...\n",
2179                        perf_evsel__name(evsel), sample->tid,
2180                        sample->cpu, sample->raw_size);
2181         } else {
2182                 tracepoint_handler handler = evsel->handler;
2183                 handler(trace, evsel, event, sample);
2184         }
2185 }
2186
2187 static int trace__add_syscall_newtp(struct trace *trace)
2188 {
2189         int ret = -1;
2190         struct perf_evlist *evlist = trace->evlist;
2191         struct perf_evsel *sys_enter, *sys_exit;
2192
2193         sys_enter = perf_evsel__syscall_newtp("sys_enter", trace__sys_enter);
2194         if (sys_enter == NULL)
2195                 goto out;
2196
2197         if (perf_evsel__init_sc_tp_ptr_field(sys_enter, args))
2198                 goto out_delete_sys_enter;
2199
2200         sys_exit = perf_evsel__syscall_newtp("sys_exit", trace__sys_exit);
2201         if (sys_exit == NULL)
2202                 goto out_delete_sys_enter;
2203
2204         if (perf_evsel__init_sc_tp_uint_field(sys_exit, ret))
2205                 goto out_delete_sys_exit;
2206
2207         perf_evlist__add(evlist, sys_enter);
2208         perf_evlist__add(evlist, sys_exit);
2209
2210         if (callchain_param.enabled && !trace->kernel_syscallchains) {
2211                 /*
2212                  * We're interested only in the user space callchain
2213                  * leading to the syscall, allow overriding that for
2214                  * debugging reasons using --kernel_syscall_callchains
2215                  */
2216                 sys_exit->attr.exclude_callchain_kernel = 1;
2217         }
2218
2219         trace->syscalls.events.sys_enter = sys_enter;
2220         trace->syscalls.events.sys_exit  = sys_exit;
2221
2222         ret = 0;
2223 out:
2224         return ret;
2225
2226 out_delete_sys_exit:
2227         perf_evsel__delete_priv(sys_exit);
2228 out_delete_sys_enter:
2229         perf_evsel__delete_priv(sys_enter);
2230         goto out;
2231 }
2232
2233 static int trace__set_ev_qualifier_filter(struct trace *trace)
2234 {
2235         int err = -1;
2236         struct perf_evsel *sys_exit;
2237         char *filter = asprintf_expr_inout_ints("id", !trace->not_ev_qualifier,
2238                                                 trace->ev_qualifier_ids.nr,
2239                                                 trace->ev_qualifier_ids.entries);
2240
2241         if (filter == NULL)
2242                 goto out_enomem;
2243
2244         if (!perf_evsel__append_tp_filter(trace->syscalls.events.sys_enter,
2245                                           filter)) {
2246                 sys_exit = trace->syscalls.events.sys_exit;
2247                 err = perf_evsel__append_tp_filter(sys_exit, filter);
2248         }
2249
2250         free(filter);
2251 out:
2252         return err;
2253 out_enomem:
2254         errno = ENOMEM;
2255         goto out;
2256 }
2257
2258 static int trace__set_filter_loop_pids(struct trace *trace)
2259 {
2260         unsigned int nr = 1;
2261         pid_t pids[32] = {
2262                 getpid(),
2263         };
2264         struct thread *thread = machine__find_thread(trace->host, pids[0], pids[0]);
2265
2266         while (thread && nr < ARRAY_SIZE(pids)) {
2267                 struct thread *parent = machine__find_thread(trace->host, thread->ppid, thread->ppid);
2268
2269                 if (parent == NULL)
2270                         break;
2271
2272                 if (!strcmp(thread__comm_str(parent), "sshd")) {
2273                         pids[nr++] = parent->tid;
2274                         break;
2275                 }
2276                 thread = parent;
2277         }
2278
2279         return perf_evlist__set_filter_pids(trace->evlist, nr, pids);
2280 }
2281
2282 static int trace__run(struct trace *trace, int argc, const char **argv)
2283 {
2284         struct perf_evlist *evlist = trace->evlist;
2285         struct perf_evsel *evsel, *pgfault_maj = NULL, *pgfault_min = NULL;
2286         int err = -1, i;
2287         unsigned long before;
2288         const bool forks = argc > 0;
2289         bool draining = false;
2290
2291         trace->live = true;
2292
2293         if (trace->trace_syscalls && trace__add_syscall_newtp(trace))
2294                 goto out_error_raw_syscalls;
2295
2296         if (trace->trace_syscalls)
2297                 trace->vfs_getname = perf_evlist__add_vfs_getname(evlist);
2298
2299         if ((trace->trace_pgfaults & TRACE_PFMAJ)) {
2300                 pgfault_maj = perf_evsel__new_pgfault(PERF_COUNT_SW_PAGE_FAULTS_MAJ);
2301                 if (pgfault_maj == NULL)
2302                         goto out_error_mem;
2303                 perf_evlist__add(evlist, pgfault_maj);
2304         }
2305
2306         if ((trace->trace_pgfaults & TRACE_PFMIN)) {
2307                 pgfault_min = perf_evsel__new_pgfault(PERF_COUNT_SW_PAGE_FAULTS_MIN);
2308                 if (pgfault_min == NULL)
2309                         goto out_error_mem;
2310                 perf_evlist__add(evlist, pgfault_min);
2311         }
2312
2313         if (trace->sched &&
2314             perf_evlist__add_newtp(evlist, "sched", "sched_stat_runtime",
2315                                    trace__sched_stat_runtime))
2316                 goto out_error_sched_stat_runtime;
2317
2318         err = perf_evlist__create_maps(evlist, &trace->opts.target);
2319         if (err < 0) {
2320                 fprintf(trace->output, "Problems parsing the target to trace, check your options!\n");
2321                 goto out_delete_evlist;
2322         }
2323
2324         err = trace__symbols_init(trace, evlist);
2325         if (err < 0) {
2326                 fprintf(trace->output, "Problems initializing symbol libraries!\n");
2327                 goto out_delete_evlist;
2328         }
2329
2330         perf_evlist__config(evlist, &trace->opts, NULL);
2331
2332         if (callchain_param.enabled) {
2333                 bool use_identifier = false;
2334
2335                 if (trace->syscalls.events.sys_exit) {
2336                         perf_evsel__config_callchain(trace->syscalls.events.sys_exit,
2337                                                      &trace->opts, &callchain_param);
2338                         use_identifier = true;
2339                 }
2340
2341                 if (pgfault_maj) {
2342                         perf_evsel__config_callchain(pgfault_maj, &trace->opts, &callchain_param);
2343                         use_identifier = true;
2344                 }
2345
2346                 if (pgfault_min) {
2347                         perf_evsel__config_callchain(pgfault_min, &trace->opts, &callchain_param);
2348                         use_identifier = true;
2349                 }
2350
2351                 if (use_identifier) {
2352                        /*
2353                         * Now we have evsels with different sample_ids, use
2354                         * PERF_SAMPLE_IDENTIFIER to map from sample to evsel
2355                         * from a fixed position in each ring buffer record.
2356                         *
2357                         * As of this the changeset introducing this comment, this
2358                         * isn't strictly needed, as the fields that can come before
2359                         * PERF_SAMPLE_ID are all used, but we'll probably disable
2360                         * some of those for things like copying the payload of
2361                         * pointer syscall arguments, and for vfs_getname we don't
2362                         * need PERF_SAMPLE_ADDR and PERF_SAMPLE_IP, so do this
2363                         * here as a warning we need to use PERF_SAMPLE_IDENTIFIER.
2364                         */
2365                         perf_evlist__set_sample_bit(evlist, IDENTIFIER);
2366                         perf_evlist__reset_sample_bit(evlist, ID);
2367                 }
2368         }
2369
2370         signal(SIGCHLD, sig_handler);
2371         signal(SIGINT, sig_handler);
2372
2373         if (forks) {
2374                 err = perf_evlist__prepare_workload(evlist, &trace->opts.target,
2375                                                     argv, false, NULL);
2376                 if (err < 0) {
2377                         fprintf(trace->output, "Couldn't run the workload!\n");
2378                         goto out_delete_evlist;
2379                 }
2380         }
2381
2382         err = perf_evlist__open(evlist);
2383         if (err < 0)
2384                 goto out_error_open;
2385
2386         err = bpf__apply_obj_config();
2387         if (err) {
2388                 char errbuf[BUFSIZ];
2389
2390                 bpf__strerror_apply_obj_config(err, errbuf, sizeof(errbuf));
2391                 pr_err("ERROR: Apply config to BPF failed: %s\n",
2392                          errbuf);
2393                 goto out_error_open;
2394         }
2395
2396         /*
2397          * Better not use !target__has_task() here because we need to cover the
2398          * case where no threads were specified in the command line, but a
2399          * workload was, and in that case we will fill in the thread_map when
2400          * we fork the workload in perf_evlist__prepare_workload.
2401          */
2402         if (trace->filter_pids.nr > 0)
2403                 err = perf_evlist__set_filter_pids(evlist, trace->filter_pids.nr, trace->filter_pids.entries);
2404         else if (thread_map__pid(evlist->threads, 0) == -1)
2405                 err = trace__set_filter_loop_pids(trace);
2406
2407         if (err < 0)
2408                 goto out_error_mem;
2409
2410         if (trace->ev_qualifier_ids.nr > 0) {
2411                 err = trace__set_ev_qualifier_filter(trace);
2412                 if (err < 0)
2413                         goto out_errno;
2414
2415                 pr_debug("event qualifier tracepoint filter: %s\n",
2416                          trace->syscalls.events.sys_exit->filter);
2417         }
2418
2419         err = perf_evlist__apply_filters(evlist, &evsel);
2420         if (err < 0)
2421                 goto out_error_apply_filters;
2422
2423         err = perf_evlist__mmap(evlist, trace->opts.mmap_pages, false);
2424         if (err < 0)
2425                 goto out_error_mmap;
2426
2427         if (!target__none(&trace->opts.target) && !trace->opts.initial_delay)
2428                 perf_evlist__enable(evlist);
2429
2430         if (forks)
2431                 perf_evlist__start_workload(evlist);
2432
2433         if (trace->opts.initial_delay) {
2434                 usleep(trace->opts.initial_delay * 1000);
2435                 perf_evlist__enable(evlist);
2436         }
2437
2438         trace->multiple_threads = thread_map__pid(evlist->threads, 0) == -1 ||
2439                                   evlist->threads->nr > 1 ||
2440                                   perf_evlist__first(evlist)->attr.inherit;
2441 again:
2442         before = trace->nr_events;
2443
2444         for (i = 0; i < evlist->nr_mmaps; i++) {
2445                 union perf_event *event;
2446
2447                 while ((event = perf_evlist__mmap_read(evlist, i)) != NULL) {
2448                         struct perf_sample sample;
2449
2450                         ++trace->nr_events;
2451
2452                         err = perf_evlist__parse_sample(evlist, event, &sample);
2453                         if (err) {
2454                                 fprintf(trace->output, "Can't parse sample, err = %d, skipping...\n", err);
2455                                 goto next_event;
2456                         }
2457
2458                         trace__handle_event(trace, event, &sample);
2459 next_event:
2460                         perf_evlist__mmap_consume(evlist, i);
2461
2462                         if (interrupted)
2463                                 goto out_disable;
2464
2465                         if (done && !draining) {
2466                                 perf_evlist__disable(evlist);
2467                                 draining = true;
2468                         }
2469                 }
2470         }
2471
2472         if (trace->nr_events == before) {
2473                 int timeout = done ? 100 : -1;
2474
2475                 if (!draining && perf_evlist__poll(evlist, timeout) > 0) {
2476                         if (perf_evlist__filter_pollfd(evlist, POLLERR | POLLHUP) == 0)
2477                                 draining = true;
2478
2479                         goto again;
2480                 }
2481         } else {
2482                 goto again;
2483         }
2484
2485 out_disable:
2486         thread__zput(trace->current);
2487
2488         perf_evlist__disable(evlist);
2489
2490         if (!err) {
2491                 if (trace->summary)
2492                         trace__fprintf_thread_summary(trace, trace->output);
2493
2494                 if (trace->show_tool_stats) {
2495                         fprintf(trace->output, "Stats:\n "
2496                                                " vfs_getname : %" PRIu64 "\n"
2497                                                " proc_getname: %" PRIu64 "\n",
2498                                 trace->stats.vfs_getname,
2499                                 trace->stats.proc_getname);
2500                 }
2501         }
2502
2503 out_delete_evlist:
2504         trace__symbols__exit(trace);
2505
2506         perf_evlist__delete(evlist);
2507         trace->evlist = NULL;
2508         trace->live = false;
2509         return err;
2510 {
2511         char errbuf[BUFSIZ];
2512
2513 out_error_sched_stat_runtime:
2514         tracing_path__strerror_open_tp(errno, errbuf, sizeof(errbuf), "sched", "sched_stat_runtime");
2515         goto out_error;
2516
2517 out_error_raw_syscalls:
2518         tracing_path__strerror_open_tp(errno, errbuf, sizeof(errbuf), "raw_syscalls", "sys_(enter|exit)");
2519         goto out_error;
2520
2521 out_error_mmap:
2522         perf_evlist__strerror_mmap(evlist, errno, errbuf, sizeof(errbuf));
2523         goto out_error;
2524
2525 out_error_open:
2526         perf_evlist__strerror_open(evlist, errno, errbuf, sizeof(errbuf));
2527
2528 out_error:
2529         fprintf(trace->output, "%s\n", errbuf);
2530         goto out_delete_evlist;
2531
2532 out_error_apply_filters:
2533         fprintf(trace->output,
2534                 "Failed to set filter \"%s\" on event %s with %d (%s)\n",
2535                 evsel->filter, perf_evsel__name(evsel), errno,
2536                 str_error_r(errno, errbuf, sizeof(errbuf)));
2537         goto out_delete_evlist;
2538 }
2539 out_error_mem:
2540         fprintf(trace->output, "Not enough memory to run!\n");
2541         goto out_delete_evlist;
2542
2543 out_errno:
2544         fprintf(trace->output, "errno=%d,%s\n", errno, strerror(errno));
2545         goto out_delete_evlist;
2546 }
2547
2548 static int trace__replay(struct trace *trace)
2549 {
2550         const struct perf_evsel_str_handler handlers[] = {
2551                 { "probe:vfs_getname",       trace__vfs_getname, },
2552         };
2553         struct perf_data_file file = {
2554                 .path  = input_name,
2555                 .mode  = PERF_DATA_MODE_READ,
2556                 .force = trace->force,
2557         };
2558         struct perf_session *session;
2559         struct perf_evsel *evsel;
2560         int err = -1;
2561
2562         trace->tool.sample        = trace__process_sample;
2563         trace->tool.mmap          = perf_event__process_mmap;
2564         trace->tool.mmap2         = perf_event__process_mmap2;
2565         trace->tool.comm          = perf_event__process_comm;
2566         trace->tool.exit          = perf_event__process_exit;
2567         trace->tool.fork          = perf_event__process_fork;
2568         trace->tool.attr          = perf_event__process_attr;
2569         trace->tool.tracing_data  = perf_event__process_tracing_data;
2570         trace->tool.build_id      = perf_event__process_build_id;
2571         trace->tool.namespaces    = perf_event__process_namespaces;
2572
2573         trace->tool.ordered_events = true;
2574         trace->tool.ordering_requires_timestamps = true;
2575
2576         /* add tid to output */
2577         trace->multiple_threads = true;
2578
2579         session = perf_session__new(&file, false, &trace->tool);
2580         if (session == NULL)
2581                 return -1;
2582
2583         if (trace->opts.target.pid)
2584                 symbol_conf.pid_list_str = strdup(trace->opts.target.pid);
2585
2586         if (trace->opts.target.tid)
2587                 symbol_conf.tid_list_str = strdup(trace->opts.target.tid);
2588
2589         if (symbol__init(&session->header.env) < 0)
2590                 goto out;
2591
2592         trace->host = &session->machines.host;
2593
2594         err = perf_session__set_tracepoints_handlers(session, handlers);
2595         if (err)
2596                 goto out;
2597
2598         evsel = perf_evlist__find_tracepoint_by_name(session->evlist,
2599                                                      "raw_syscalls:sys_enter");
2600         /* older kernels have syscalls tp versus raw_syscalls */
2601         if (evsel == NULL)
2602                 evsel = perf_evlist__find_tracepoint_by_name(session->evlist,
2603                                                              "syscalls:sys_enter");
2604
2605         if (evsel &&
2606             (perf_evsel__init_syscall_tp(evsel, trace__sys_enter) < 0 ||
2607             perf_evsel__init_sc_tp_ptr_field(evsel, args))) {
2608                 pr_err("Error during initialize raw_syscalls:sys_enter event\n");
2609                 goto out;
2610         }
2611
2612         evsel = perf_evlist__find_tracepoint_by_name(session->evlist,
2613                                                      "raw_syscalls:sys_exit");
2614         if (evsel == NULL)
2615                 evsel = perf_evlist__find_tracepoint_by_name(session->evlist,
2616                                                              "syscalls:sys_exit");
2617         if (evsel &&
2618             (perf_evsel__init_syscall_tp(evsel, trace__sys_exit) < 0 ||
2619             perf_evsel__init_sc_tp_uint_field(evsel, ret))) {
2620                 pr_err("Error during initialize raw_syscalls:sys_exit event\n");
2621                 goto out;
2622         }
2623
2624         evlist__for_each_entry(session->evlist, evsel) {
2625                 if (evsel->attr.type == PERF_TYPE_SOFTWARE &&
2626                     (evsel->attr.config == PERF_COUNT_SW_PAGE_FAULTS_MAJ ||
2627                      evsel->attr.config == PERF_COUNT_SW_PAGE_FAULTS_MIN ||
2628                      evsel->attr.config == PERF_COUNT_SW_PAGE_FAULTS))
2629                         evsel->handler = trace__pgfault;
2630         }
2631
2632         setup_pager();
2633
2634         err = perf_session__process_events(session);
2635         if (err)
2636                 pr_err("Failed to process events, error %d", err);
2637
2638         else if (trace->summary)
2639                 trace__fprintf_thread_summary(trace, trace->output);
2640
2641 out:
2642         perf_session__delete(session);
2643
2644         return err;
2645 }
2646
/* Print the banner that precedes the per-thread summary; returns bytes written. */
static size_t trace__fprintf_threads_header(FILE *fp)
{
	return fprintf(fp, "\n Summary of events:\n\n");
}
2655
/*
 * Re-sortable rb-tree for the per-thread syscall summary.  DEFINE_RESORT_RB()
 * (rb_resort.h) generates a "syscall_stats" tree whose entries carry the
 * fields declared below and are ordered by the msecs comparison.  The body
 * that follows is the per-node copy routine: 'nd' points at a node of the
 * source intlist (syscall id -> struct stats), 'entry' at the resort-tree
 * slot to fill in.
 */
DEFINE_RESORT_RB(syscall_stats, a->msecs > b->msecs,
	struct stats	*stats;
	double		msecs;
	int		syscall;
)
{
	struct int_node *source = rb_entry(nd, struct int_node, rb_node);
	struct stats *stats = source->priv;

	entry->syscall = source->i;
	entry->stats   = stats;
	/* total time = number of calls * average call duration, in msecs */
	entry->msecs   = stats ? (u64)stats->n * (avg_stats(stats) / NSEC_PER_MSEC) : 0;
}
2669
/*
 * Print the per-syscall statistics table for one thread, ordered via the
 * syscall_stats resort rb-tree defined above.  Returns the number of bytes
 * written, 0 if the stats could not be resorted.
 */
static size_t thread__dump_stats(struct thread_trace *ttrace,
				 struct trace *trace, FILE *fp)
{
	size_t printed = 0;
	struct syscall *sc;
	struct rb_node *nd;
	/* Re-sort the syscall-id -> stats intlist by time spent per syscall. */
	DECLARE_RESORT_RB_INTLIST(syscall_stats, ttrace->syscall_stats);

	if (syscall_stats == NULL)
		return 0;

	printed += fprintf(fp, "\n");

	printed += fprintf(fp, "   syscall            calls    total       min       avg       max      stddev\n");
	printed += fprintf(fp, "                               (msec)    (msec)    (msec)    (msec)        (%%)\n");
	printed += fprintf(fp, "   --------------- -------- --------- --------- --------- ---------     ------\n");

	resort_rb__for_each_entry(nd, syscall_stats) {
		struct stats *stats = syscall_stats_entry->stats;
		if (stats) {
			/* stats are collected in nsecs, printed in msecs */
			double min = (double)(stats->min) / NSEC_PER_MSEC;
			double max = (double)(stats->max) / NSEC_PER_MSEC;
			double avg = avg_stats(stats);
			double pct;
			u64 n = (u64) stats->n;

			/* stddev as a percentage of the average */
			pct = avg ? 100.0 * stddev_stats(stats)/avg : 0.0;
			avg /= NSEC_PER_MSEC;

			sc = &trace->syscalls.table[syscall_stats_entry->syscall];
			printed += fprintf(fp, "   %-15s", sc->name);
			printed += fprintf(fp, " %8" PRIu64 " %9.3f %9.3f %9.3f",
					   n, syscall_stats_entry->msecs, min, avg);
			printed += fprintf(fp, " %9.3f %9.2f%%\n", max, pct);
		}
	}

	resort_rb__delete(syscall_stats);
	printed += fprintf(fp, "\n\n");

	return printed;
}
2712
2713 static size_t trace__fprintf_thread(FILE *fp, struct thread *thread, struct trace *trace)
2714 {
2715         size_t printed = 0;
2716         struct thread_trace *ttrace = thread__priv(thread);
2717         double ratio;
2718
2719         if (ttrace == NULL)
2720                 return 0;
2721
2722         ratio = (double)ttrace->nr_events / trace->nr_events * 100.0;
2723
2724         printed += fprintf(fp, " %s (%d), ", thread__comm_str(thread), thread->tid);
2725         printed += fprintf(fp, "%lu events, ", ttrace->nr_events);
2726         printed += fprintf(fp, "%.1f%%", ratio);
2727         if (ttrace->pfmaj)
2728                 printed += fprintf(fp, ", %lu majfaults", ttrace->pfmaj);
2729         if (ttrace->pfmin)
2730                 printed += fprintf(fp, ", %lu minfaults", ttrace->pfmin);
2731         if (trace->sched)
2732                 printed += fprintf(fp, ", %.3f msec\n", ttrace->runtime_ms);
2733         else if (fputc('\n', fp) != EOF)
2734                 ++printed;
2735
2736         printed += thread__dump_stats(ttrace, trace, fp);
2737
2738         return printed;
2739 }
2740
2741 static unsigned long thread__nr_events(struct thread_trace *ttrace)
2742 {
2743         return ttrace ? ttrace->nr_events : 0;
2744 }
2745
/*
 * Re-sortable rb-tree used by trace__fprintf_thread_summary(): entries wrap
 * each struct thread of a machine and are ordered by the thread's event
 * count (thread__nr_events() on the thread's private thread_trace state).
 */
DEFINE_RESORT_RB(threads, (thread__nr_events(a->thread->priv) < thread__nr_events(b->thread->priv)),
	struct thread *thread;
)
{
	entry->thread = rb_entry(nd, struct thread, rb_node);
}
2752
2753 static size_t trace__fprintf_thread_summary(struct trace *trace, FILE *fp)
2754 {
2755         DECLARE_RESORT_RB_MACHINE_THREADS(threads, trace->host);
2756         size_t printed = trace__fprintf_threads_header(fp);
2757         struct rb_node *nd;
2758
2759         if (threads == NULL) {
2760                 fprintf(fp, "%s", "Error sorting output by nr_events!\n");
2761                 return 0;
2762         }
2763
2764         resort_rb__for_each_entry(nd, threads)
2765                 printed += trace__fprintf_thread(fp, threads_entry->thread, trace);
2766
2767         resort_rb__delete(threads);
2768
2769         return printed;
2770 }
2771
2772 static int trace__set_duration(const struct option *opt, const char *str,
2773                                int unset __maybe_unused)
2774 {
2775         struct trace *trace = opt->value;
2776
2777         trace->duration_filter = atof(str);
2778         return 0;
2779 }
2780
2781 static int trace__set_filter_pids(const struct option *opt, const char *str,
2782                                   int unset __maybe_unused)
2783 {
2784         int ret = -1;
2785         size_t i;
2786         struct trace *trace = opt->value;
2787         /*
2788          * FIXME: introduce a intarray class, plain parse csv and create a
2789          * { int nr, int entries[] } struct...
2790          */
2791         struct intlist *list = intlist__new(str);
2792
2793         if (list == NULL)
2794                 return -1;
2795
2796         i = trace->filter_pids.nr = intlist__nr_entries(list) + 1;
2797         trace->filter_pids.entries = calloc(i, sizeof(pid_t));
2798
2799         if (trace->filter_pids.entries == NULL)
2800                 goto out;
2801
2802         trace->filter_pids.entries[0] = getpid();
2803
2804         for (i = 1; i < trace->filter_pids.nr; ++i)
2805                 trace->filter_pids.entries[i] = intlist__entry(list, i - 1)->i;
2806
2807         intlist__delete(list);
2808         ret = 0;
2809 out:
2810         return ret;
2811 }
2812
2813 static int trace__open_output(struct trace *trace, const char *filename)
2814 {
2815         struct stat st;
2816
2817         if (!stat(filename, &st) && st.st_size) {
2818                 char oldname[PATH_MAX];
2819
2820                 scnprintf(oldname, sizeof(oldname), "%s.old", filename);
2821                 unlink(oldname);
2822                 rename(filename, oldname);
2823         }
2824
2825         trace->output = fopen(filename, "w");
2826
2827         return trace->output == NULL ? -errno : 0;
2828 }
2829
2830 static int parse_pagefaults(const struct option *opt, const char *str,
2831                             int unset __maybe_unused)
2832 {
2833         int *trace_pgfaults = opt->value;
2834
2835         if (strcmp(str, "all") == 0)
2836                 *trace_pgfaults |= TRACE_PFMAJ | TRACE_PFMIN;
2837         else if (strcmp(str, "maj") == 0)
2838                 *trace_pgfaults |= TRACE_PFMAJ;
2839         else if (strcmp(str, "min") == 0)
2840                 *trace_pgfaults |= TRACE_PFMIN;
2841         else
2842                 return -1;
2843
2844         return 0;
2845 }
2846
/* Set @handler as the sample handler on every evsel in @evlist. */
static void evlist__set_evsel_handler(struct perf_evlist *evlist, void *handler)
{
	struct perf_evsel *evsel;

	evlist__for_each_entry(evlist, evsel)
		evsel->handler = handler;
}
2854
/*
 * XXX: Hackish, just splitting the combined -e+--event: syscalls
 * (raw_syscalls:{sys_{enter,exit}}) go to one list, other events
 * (tracepoints, HW, SW, etc.) to another, so that the existing facilities
 * (trace->ev_qualifier + parse_options()) can be used unchanged.
 *
 * It'd be better to introduce a parse_options() variant that would return a
 * list with the terms it didn't match to an event...
 */
2863 static int trace__parse_events_option(const struct option *opt, const char *str,
2864                                       int unset __maybe_unused)
2865 {
2866         struct trace *trace = (struct trace *)opt->value;
2867         const char *s = str;
2868         char *sep = NULL, *lists[2] = { NULL, NULL, };
2869         int len = strlen(str) + 1, err = -1, list, idx;
2870         char *strace_groups_dir = system_path(STRACE_GROUPS_DIR);
2871         char group_name[PATH_MAX];
2872
2873         if (strace_groups_dir == NULL)
2874                 return -1;
2875
2876         if (*s == '!') {
2877                 ++s;
2878                 trace->not_ev_qualifier = true;
2879         }
2880
2881         while (1) {
2882                 if ((sep = strchr(s, ',')) != NULL)
2883                         *sep = '\0';
2884
2885                 list = 0;
2886                 if (syscalltbl__id(trace->sctbl, s) >= 0 ||
2887                     syscalltbl__strglobmatch_first(trace->sctbl, s, &idx) >= 0) {
2888                         list = 1;
2889                 } else {
2890                         path__join(group_name, sizeof(group_name), strace_groups_dir, s);
2891                         if (access(group_name, R_OK) == 0)
2892                                 list = 1;
2893                 }
2894
2895                 if (lists[list]) {
2896                         sprintf(lists[list] + strlen(lists[list]), ",%s", s);
2897                 } else {
2898                         lists[list] = malloc(len);
2899                         if (lists[list] == NULL)
2900                                 goto out;
2901                         strcpy(lists[list], s);
2902                 }
2903
2904                 if (!sep)
2905                         break;
2906
2907                 *sep = ',';
2908                 s = sep + 1;
2909         }
2910
2911         if (lists[1] != NULL) {
2912                 struct strlist_config slist_config = {
2913                         .dirname = strace_groups_dir,
2914                 };
2915
2916                 trace->ev_qualifier = strlist__new(lists[1], &slist_config);
2917                 if (trace->ev_qualifier == NULL) {
2918                         fputs("Not enough memory to parse event qualifier", trace->output);
2919                         goto out;
2920                 }
2921
2922                 if (trace__validate_ev_qualifier(trace))
2923                         goto out;
2924         }
2925
2926         err = 0;
2927
2928         if (lists[0]) {
2929                 struct option o = OPT_CALLBACK('e', "event", &trace->evlist, "event",
2930                                                "event selector. use 'perf list' to list available events",
2931                                                parse_events_option);
2932                 err = parse_events_option(&o, lists[0], 0);
2933         }
2934 out:
2935         if (sep)
2936                 *sep = ',';
2937
2938         return err;
2939 }
2940
/*
 * The 'perf trace' entry point: parse options, build the evlist and the
 * syscall table, then either delegate to 'perf trace record', replay a
 * perf.data file (-i) or run a live trace of the given workload/target.
 */
int cmd_trace(int argc, const char **argv)
{
	const char *trace_usage[] = {
		"perf trace [<options>] [<command>]",
		"perf trace [<options>] -- <command> [<options>]",
		"perf trace record [<options>] [<command>]",
		"perf trace record [<options>] -- <command> [<options>]",
		NULL
	};
	struct trace trace = {
		.syscalls = {
			. max = -1,
		},
		.opts = {
			.target = {
				.uid	   = UINT_MAX,
				.uses_mmap = true,
			},
			.user_freq     = UINT_MAX,
			.user_interval = ULLONG_MAX,
			.no_buffering  = true,
			.mmap_pages    = UINT_MAX,
			.proc_map_timeout  = 500,
		},
		.output = stderr,
		.show_comm = true,
		.trace_syscalls = true,
		.kernel_syscallchains = false,
		.max_stack = UINT_MAX,	/* UINT_MAX == "not set by the user", see below */
	};
	const char *output_name = NULL;
	const struct option trace_options[] = {
	OPT_CALLBACK('e', "event", &trace, "event",
		     "event/syscall selector. use 'perf list' to list available events",
		     trace__parse_events_option),
	OPT_BOOLEAN(0, "comm", &trace.show_comm,
		    "show the thread COMM next to its id"),
	OPT_BOOLEAN(0, "tool_stats", &trace.show_tool_stats, "show tool stats"),
	OPT_CALLBACK(0, "expr", &trace, "expr", "list of syscalls/events to trace",
		     trace__parse_events_option),
	OPT_STRING('o', "output", &output_name, "file", "output file name"),
	OPT_STRING('i', "input", &input_name, "file", "Analyze events in file"),
	OPT_STRING('p', "pid", &trace.opts.target.pid, "pid",
		    "trace events on existing process id"),
	OPT_STRING('t', "tid", &trace.opts.target.tid, "tid",
		    "trace events on existing thread id"),
	OPT_CALLBACK(0, "filter-pids", &trace, "CSV list of pids",
		     "pids to filter (by the kernel)", trace__set_filter_pids),
	OPT_BOOLEAN('a', "all-cpus", &trace.opts.target.system_wide,
		    "system-wide collection from all CPUs"),
	OPT_STRING('C', "cpu", &trace.opts.target.cpu_list, "cpu",
		    "list of cpus to monitor"),
	OPT_BOOLEAN(0, "no-inherit", &trace.opts.no_inherit,
		    "child tasks do not inherit counters"),
	OPT_CALLBACK('m', "mmap-pages", &trace.opts.mmap_pages, "pages",
		     "number of mmap data pages",
		     perf_evlist__parse_mmap_pages),
	OPT_STRING('u', "uid", &trace.opts.target.uid_str, "user",
		   "user to profile"),
	OPT_CALLBACK(0, "duration", &trace, "float",
		     "show only events with duration > N.M ms",
		     trace__set_duration),
	OPT_BOOLEAN(0, "sched", &trace.sched, "show blocking scheduler events"),
	OPT_INCR('v', "verbose", &verbose, "be more verbose"),
	OPT_BOOLEAN('T', "time", &trace.full_time,
		    "Show full timestamp, not time relative to first start"),
	OPT_BOOLEAN('s', "summary", &trace.summary_only,
		    "Show only syscall summary with statistics"),
	OPT_BOOLEAN('S', "with-summary", &trace.summary,
		    "Show all syscalls and summary with statistics"),
	OPT_CALLBACK_DEFAULT('F', "pf", &trace.trace_pgfaults, "all|maj|min",
		     "Trace pagefaults", parse_pagefaults, "maj"),
	OPT_BOOLEAN(0, "syscalls", &trace.trace_syscalls, "Trace syscalls"),
	OPT_BOOLEAN('f', "force", &trace.force, "don't complain, do it"),
	OPT_CALLBACK(0, "call-graph", &trace.opts,
		     "record_mode[,record_size]", record_callchain_help,
		     &record_parse_callchain_opt),
	OPT_BOOLEAN(0, "kernel-syscall-graph", &trace.kernel_syscallchains,
		    "Show the kernel callchains on the syscall exit path"),
	OPT_UINTEGER(0, "min-stack", &trace.min_stack,
		     "Set the minimum stack depth when parsing the callchain, "
		     "anything below the specified depth will be ignored."),
	OPT_UINTEGER(0, "max-stack", &trace.max_stack,
		     "Set the maximum stack depth when parsing the callchain, "
		     "anything beyond the specified depth will be ignored. "
		     "Default: kernel.perf_event_max_stack or " __stringify(PERF_MAX_STACK_DEPTH)),
	OPT_UINTEGER(0, "proc-map-timeout", &trace.opts.proc_map_timeout,
			"per thread proc mmap processing timeout in ms"),
	OPT_UINTEGER('D', "delay", &trace.opts.initial_delay,
		     "ms to wait before starting measurement after program "
		     "start"),
	OPT_END()
	};
	bool __maybe_unused max_stack_user_set = true;
	bool mmap_pages_user_set = true;
	const char * const trace_subcommands[] = { "record", NULL };
	int err;
	char bf[BUFSIZ];

	/* Dump a stack trace when we crash, to ease debugging. */
	signal(SIGSEGV, sighandler_dump_stack);
	signal(SIGFPE, sighandler_dump_stack);

	trace.evlist = perf_evlist__new();
	trace.sctbl = syscalltbl__new();

	if (trace.evlist == NULL || trace.sctbl == NULL) {
		pr_err("Not enough memory to run!\n");
		err = -ENOMEM;
		goto out;
	}

	argc = parse_options_subcommand(argc, argv, trace_options, trace_subcommands,
				 trace_usage, PARSE_OPT_STOP_AT_NON_OPTION);

	/* Set up the BPF stdout event; failing here is fatal. */
	err = bpf__setup_stdout(trace.evlist);
	if (err) {
		bpf__strerror_setup_stdout(trace.evlist, err, bf, sizeof(bf));
		pr_err("ERROR: Setup BPF stdout failed: %s\n", bf);
		goto out;
	}

	err = -1;

	/* Page fault tracing wants the faulting address and a timestamp. */
	if (trace.trace_pgfaults) {
		trace.opts.sample_address = true;
		trace.opts.sample_time = true;
	}

	/* UINT_MAX in mmap_pages/max_stack means "not set by the user". */
	if (trace.opts.mmap_pages == UINT_MAX)
		mmap_pages_user_set = false;

	if (trace.max_stack == UINT_MAX) {
		trace.max_stack = input_name ? PERF_MAX_STACK_DEPTH : sysctl_perf_event_max_stack;
		max_stack_user_set = false;
	}

#ifdef HAVE_DWARF_UNWIND_SUPPORT
	/*
	 * --min-stack/--max-stack imply callchains: default to DWARF
	 * unwinding when syscalls are traced and no --call-graph was given.
	 */
	if ((trace.min_stack || max_stack_user_set) && !callchain_param.enabled && trace.trace_syscalls)
		record_opts__parse_callchain(&trace.opts, &callchain_param, "dwarf", false);
#endif

	if (callchain_param.enabled) {
		/* Callchains need bigger buffers; root can mlock more pages. */
		if (!mmap_pages_user_set && geteuid() == 0)
			trace.opts.mmap_pages = perf_event_mlock_kb_in_pages() * 4;

		symbol_conf.use_callchain = true;
	}

	/* Events added via -e/--event get the generic event handler. */
	if (trace.evlist->nr_entries > 0)
		evlist__set_evsel_handler(trace.evlist, trace__event_handler);

	if ((argc >= 1) && (strcmp(argv[0], "record") == 0))
		return trace__record(&trace, argc-1, &argv[1]);

	/* summary_only implies summary option, but don't overwrite summary if set */
	if (trace.summary_only)
		trace.summary = trace.summary_only;

	if (!trace.trace_syscalls && !trace.trace_pgfaults &&
	    trace.evlist->nr_entries == 0 /* Was --events used? */) {
		pr_err("Please specify something to trace.\n");
		return -1;
	}

	if (!trace.trace_syscalls && trace.ev_qualifier) {
		pr_err("The -e option can't be used with --no-syscalls.\n");
		goto out;
	}

	if (output_name != NULL) {
		err = trace__open_output(&trace, output_name);
		if (err < 0) {
			perror("failed to create output file");
			goto out;
		}
	}

	/* Cache the id of the 'open' syscall. */
	trace.open_id = syscalltbl__id(trace.sctbl, "open");

	err = target__validate(&trace.opts.target);
	if (err) {
		target__strerror(&trace.opts.target, err, bf, sizeof(bf));
		fprintf(trace.output, "%s", bf);
		goto out_close;
	}

	err = target__parse_uid(&trace.opts.target);
	if (err) {
		target__strerror(&trace.opts.target, err, bf, sizeof(bf));
		fprintf(trace.output, "%s", bf);
		goto out_close;
	}

	/* No workload and no target specified: trace the whole system. */
	if (!argc && target__none(&trace.opts.target))
		trace.opts.target.system_wide = true;

	/* -i replays a perf.data file, otherwise do a live trace. */
	if (input_name)
		err = trace__replay(&trace);
	else
		err = trace__run(&trace, argc, argv);

out_close:
	if (output_name != NULL)
		fclose(trace.output);
out:
	return err;
}