4 * Builtin 'trace' command:
6 * Display a continuously updated trace of any workload, CPU, specific PID,
7 * system wide, etc. Default format is loosely strace like, but any other
8 * event may be specified using --event.
10 * Copyright (C) 2012, 2013, 2014, 2015 Red Hat Inc, Arnaldo Carvalho de Melo <acme@redhat.com>
12 * Initially based on the 'trace' prototype by Thomas Gleixner:
14 * http://lwn.net/Articles/415728/ ("Announcing a new utility: 'trace'")
16 * Released under the GPL v2. (and only v2, not any later version)
19 #include <traceevent/event-parse.h>
20 #include <api/fs/tracing_path.h>
22 #include "util/color.h"
23 #include "util/debug.h"
24 #include "util/evlist.h"
25 #include <subcmd/exec-cmd.h>
26 #include "util/machine.h"
27 #include "util/session.h"
28 #include "util/thread.h"
29 #include <subcmd/parse-options.h>
30 #include "util/strlist.h"
31 #include "util/intlist.h"
32 #include "util/thread_map.h"
33 #include "util/stat.h"
34 #include "trace-event.h"
35 #include "util/parse-events.h"
36 #include "util/bpf-loader.h"
37 #include "callchain.h"
38 #include "syscalltbl.h"
39 #include "rb_resort.h"
41 #include <libaudit.h> /* FIXME: Still needed for audit_errno_to_name */
43 #include <linux/err.h>
44 #include <linux/filter.h>
45 #include <linux/audit.h>
46 #include <linux/random.h>
47 #include <linux/stringify.h>
48 #include <linux/time64.h>
/*
 * Fallback for old libc headers that lack O_CLOEXEC; the surrounding
 * #ifndef guard is on a line not present in this sampled listing.
 */
51 # define O_CLOEXEC 02000000
/*
 * NOTE(review): the listing below is sampled -- the enclosing
 * 'struct trace {' line is missing. These are fields of the tool's
 * main per-run state object.
 */
55 struct perf_tool tool;			/* perf event-processing callbacks */
56 struct syscalltbl *sctbl;		/* syscall id <-> name table */
59 struct syscall *table;		/* per-syscall-id descriptors (see trace__read_syscall_info) */
61 struct perf_evsel *sys_enter,		/* declarator continues on a missing line (sys_exit, presumably) */
65 struct record_opts opts;
66 struct perf_evlist *evlist;
68 struct thread *current;		/* NOTE(review): presumably the thread currently being printed -- confirm */
71 unsigned long nr_events;
72 struct strlist *ev_qualifier;		/* user-supplied syscall-name qualifier list */
77 struct intlist *tid_list;		/* TIDs to trace/filter */
78 struct intlist *pid_list;		/* PIDs to trace/filter */
83 double duration_filter;		/* threshold in ms, see trace__filter_duration() */
89 unsigned int max_stack;		/* callchain depth limits */
90 unsigned int min_stack;
91 bool not_ev_qualifier;		/* ev_qualifier is a blocklist rather than an allowlist */
95 bool multiple_threads;		/* >1 thread traced: prefix each line with comm/tid */
101 bool kernel_syscallchains;
/*
 * Accessor callbacks stored in 'struct tp_field' (the struct's opening
 * line is not present in this sampled listing): one reads the field as
 * an integer, the other returns a pointer into the raw payload.
 */
111 u64 (*integer)(struct tp_field *field, struct perf_sample *sample);
112 void *(*pointer)(struct tp_field *field, struct perf_sample *sample);
/*
 * Generate tp_field__u{8,16,32,64}(): copy a fixed-width integer out of
 * the raw tracepoint payload at field->offset.  memcpy is used instead
 * of a cast dereference to avoid misaligned access.
 * (Macro bodies are missing lines in this sampled listing.)
 */
116 #define TP_UINT_FIELD(bits) \
117 static u64 tp_field__u##bits(struct tp_field *field, struct perf_sample *sample) \
120 memcpy(&value, sample->raw_data + field->offset, sizeof(value)); \
/*
 * Byte-swapping variants, for samples recorded with the opposite
 * endianness from the host.
 */
129 #define TP_UINT_FIELD__SWAPPED(bits) \
130 static u64 tp_field__swapped_u##bits(struct tp_field *field, struct perf_sample *sample) \
133 memcpy(&value, sample->raw_data + field->offset, sizeof(value)); \
134 return bswap_##bits(value);\
137 TP_UINT_FIELD__SWAPPED(16);
138 TP_UINT_FIELD__SWAPPED(32);
139 TP_UINT_FIELD__SWAPPED(64);
/*
 * Bind the integer accessor that matches the format field's size,
 * selecting a byte-swapping reader when needs_swap is set.
 */
141 static int tp_field__init_uint(struct tp_field *field,
142 struct format_field *format_field,
145 field->offset = format_field->offset;
147 switch (format_field->size) {
149 field->integer = tp_field__u8; /* single bytes never need swapping */
152 field->integer = needs_swap ? tp_field__swapped_u16 : tp_field__u16;
155 field->integer = needs_swap ? tp_field__swapped_u32 : tp_field__u32;
158 field->integer = needs_swap ? tp_field__swapped_u64 : tp_field__u64;
/* Return a pointer straight into the raw payload -- no copy, no swap. */
167 static void *tp_field__ptr(struct tp_field *field, struct perf_sample *sample)
169 return sample->raw_data + field->offset;
172 static int tp_field__init_ptr(struct tp_field *field, struct format_field *format_field)
174 field->offset = format_field->offset;
175 field->pointer = tp_field__ptr;
/* Fields of 'struct syscall_tp' (opening line missing from this listing). */
182 struct tp_field args, ret;
/*
 * Look up a tracepoint format field by name on the evsel and bind an
 * unsigned-integer accessor for it (endianness-aware via needs_swap).
 */
186 static int perf_evsel__init_tp_uint_field(struct perf_evsel *evsel,
187 struct tp_field *field,
190 struct format_field *format_field = perf_evsel__field(evsel, name);
192 if (format_field == NULL)
195 return tp_field__init_uint(field, format_field, evsel->needs_swap);
/*
 * Convenience wrappers: 'name' is both the member of struct syscall_tp
 * (stored in evsel->priv) and, stringified, the tracepoint field name.
 */
198 #define perf_evsel__init_sc_tp_uint_field(evsel, name) \
199 ({ struct syscall_tp *sc = evsel->priv;\
200 perf_evsel__init_tp_uint_field(evsel, &sc->name, #name); })
/* Same as above, but binds a pointer accessor for the field. */
202 static int perf_evsel__init_tp_ptr_field(struct perf_evsel *evsel,
203 struct tp_field *field,
206 struct format_field *format_field = perf_evsel__field(evsel, name);
208 if (format_field == NULL)
211 return tp_field__init_ptr(field, format_field);
214 #define perf_evsel__init_sc_tp_ptr_field(evsel, name) \
215 ({ struct syscall_tp *sc = evsel->priv;\
216 perf_evsel__init_tp_ptr_field(evsel, &sc->name, #name); })
/*
 * Delete an evsel together with its private data (the priv free is on a
 * line missing from this sampled listing).
 */
218 static void perf_evsel__delete_priv(struct perf_evsel *evsel)
221 perf_evsel__delete(evsel);
/*
 * Allocate a syscall_tp as evsel->priv, bind the 'id' field accessor and
 * install the sample handler.  Error paths are on missing lines.
 */
224 static int perf_evsel__init_syscall_tp(struct perf_evsel *evsel, void *handler)
226 evsel->priv = malloc(sizeof(struct syscall_tp));
227 if (evsel->priv != NULL) {
228 if (perf_evsel__init_sc_tp_uint_field(evsel, id))
231 evsel->handler = handler;
/*
 * Create the sys_enter/sys_exit tracepoint evsel, preferring the
 * raw_syscalls:* events and falling back to syscalls:* for old kernels.
 */
242 static struct perf_evsel *perf_evsel__syscall_newtp(const char *direction, void *handler)
244 struct perf_evsel *evsel = perf_evsel__newtp("raw_syscalls", direction);
246 /* older kernel (e.g., RHEL6) use syscalls:{enter,exit} */
248 evsel = perf_evsel__newtp("syscalls", direction);
253 if (perf_evsel__init_syscall_tp(evsel, handler))
259 perf_evsel__delete_priv(evsel);
/* Fetch a named syscall_tp field from a sample via the bound accessors. */
263 #define perf_evsel__sc_tp_uint(evsel, name, sample) \
264 ({ struct syscall_tp *fields = evsel->priv; \
265 fields->name.integer(&fields->name, sample); })
267 #define perf_evsel__sc_tp_ptr(evsel, name, sample) \
268 ({ struct syscall_tp *fields = evsel->priv; \
269 fields->name.pointer(&fields->name, sample); })
/* Fragments of 'struct syscall_arg' / 'struct strarray' (opening lines missing). */
273 struct thread *thread;
283 const char **entries;
/*
 * Define a named string table; DEFINE_STRARRAY_OFFSET additionally sets
 * the value of the first entry (e.g. epoll ops start at 1, not 0).
 */
286 #define DEFINE_STRARRAY(array) struct strarray strarray__##array = { \
287 .nr_entries = ARRAY_SIZE(array), \
291 #define DEFINE_STRARRAY_OFFSET(array, off) struct strarray strarray__##array = { \
293 .nr_entries = ARRAY_SIZE(array), \
/*
 * Print an argument via its string table (arg->parm); values outside the
 * table fall back to the supplied integer format string.
 */
297 static size_t __syscall_arg__scnprintf_strarray(char *bf, size_t size,
299 struct syscall_arg *arg)
301 struct strarray *sa = arg->parm;
302 int idx = arg->val - sa->offset;
304 if (idx < 0 || idx >= sa->nr_entries)
305 return scnprintf(bf, size, intfmt, arg->val);
307 return scnprintf(bf, size, "%s", sa->entries[idx]);
/* Decimal fallback variant. */
310 static size_t syscall_arg__scnprintf_strarray(char *bf, size_t size,
311 struct syscall_arg *arg)
313 return __syscall_arg__scnprintf_strarray(bf, size, "%d", arg);
316 #define SCA_STRARRAY syscall_arg__scnprintf_strarray
318 #if defined(__i386__) || defined(__x86_64__)
320 * FIXME: Make this available to all arches as soon as the ioctl beautifier
321 * gets rewritten to support all arches.
/* Hex fallback variant, used for ioctl cmd values. */
323 static size_t syscall_arg__scnprintf_strhexarray(char *bf, size_t size,
324 struct syscall_arg *arg)
326 return __syscall_arg__scnprintf_strarray(bf, size, "%#x", arg);
329 #define SCA_STRHEXARRAY syscall_arg__scnprintf_strhexarray
330 #endif /* defined(__i386__) || defined(__x86_64__) */
/* Forward declaration: defined later, once thread__fd_path() exists. */
332 static size_t syscall_arg__scnprintf_fd(char *bf, size_t size,
333 struct syscall_arg *arg);
335 #define SCA_FD syscall_arg__scnprintf_fd
/* Fallback when <fcntl.h> doesn't provide AT_FDCWD (guard line missing). */
338 #define AT_FDCWD -100
/* Like SCA_FD, but prints "CWD" for the AT_FDCWD sentinel of *at() calls. */
341 static size_t syscall_arg__scnprintf_fd_at(char *bf, size_t size,
342 struct syscall_arg *arg)
347 return scnprintf(bf, size, "CWD");
349 return syscall_arg__scnprintf_fd(bf, size, arg);
352 #define SCA_FDAT syscall_arg__scnprintf_fd_at
354 static size_t syscall_arg__scnprintf_close_fd(char *bf, size_t size,
355 struct syscall_arg *arg);
357 #define SCA_CLOSE_FD syscall_arg__scnprintf_close_fd
/* Plain hex printer, used for addresses and opaque values. */
359 static size_t syscall_arg__scnprintf_hex(char *bf, size_t size,
360 struct syscall_arg *arg)
362 return scnprintf(bf, size, "%#lx", arg->val);
365 #define SCA_HEX syscall_arg__scnprintf_hex
/* Signed decimal printer. */
367 static size_t syscall_arg__scnprintf_int(char *bf, size_t size,
368 struct syscall_arg *arg)
370 return scnprintf(bf, size, "%d", arg->val);
373 #define SCA_INT syscall_arg__scnprintf_int
/*
 * String tables used by SCA_STRARRAY beautifiers.  Entry order mirrors
 * the corresponding kernel enum/constant values; several tables end on
 * lines missing from this sampled listing.
 */
375 static const char *bpf_cmd[] = {
376 "MAP_CREATE", "MAP_LOOKUP_ELEM", "MAP_UPDATE_ELEM", "MAP_DELETE_ELEM",
377 "MAP_GET_NEXT_KEY", "PROG_LOAD",
379 static DEFINE_STRARRAY(bpf_cmd);
381 static const char *epoll_ctl_ops[] = { "ADD", "DEL", "MOD", };
382 static DEFINE_STRARRAY_OFFSET(epoll_ctl_ops, 1); /* EPOLL_CTL_ADD == 1 */
384 static const char *itimers[] = { "REAL", "VIRTUAL", "PROF", };
385 static DEFINE_STRARRAY(itimers);
387 static const char *keyctl_options[] = {
388 "GET_KEYRING_ID", "JOIN_SESSION_KEYRING", "UPDATE", "REVOKE", "CHOWN",
389 "SETPERM", "DESCRIBE", "CLEAR", "LINK", "UNLINK", "SEARCH", "READ",
390 "INSTANTIATE", "NEGATE", "SET_REQKEY_KEYRING", "SET_TIMEOUT",
391 "ASSUME_AUTHORITY", "GET_SECURITY", "SESSION_TO_PARENT", "REJECT",
392 "INSTANTIATE_IOV", "INVALIDATE", "GET_PERSISTENT",
394 static DEFINE_STRARRAY(keyctl_options);
396 static const char *whences[] = { "SET", "CUR", "END",
404 static DEFINE_STRARRAY(whences);
406 static const char *fcntl_cmds[] = {
407 "DUPFD", "GETFD", "SETFD", "GETFL", "SETFL", "GETLK", "SETLK",
408 "SETLKW", "SETOWN", "GETOWN", "SETSIG", "GETSIG", "F_GETLK64",
409 "F_SETLK64", "F_SETLKW64", "F_SETOWN_EX", "F_GETOWN_EX",
412 static DEFINE_STRARRAY(fcntl_cmds);
414 static const char *rlimit_resources[] = {
415 "CPU", "FSIZE", "DATA", "STACK", "CORE", "RSS", "NPROC", "NOFILE",
416 "MEMLOCK", "AS", "LOCKS", "SIGPENDING", "MSGQUEUE", "NICE", "RTPRIO",
419 static DEFINE_STRARRAY(rlimit_resources);
421 static const char *sighow[] = { "BLOCK", "UNBLOCK", "SETMASK", };
422 static DEFINE_STRARRAY(sighow);
424 static const char *clockid[] = {
425 "REALTIME", "MONOTONIC", "PROCESS_CPUTIME_ID", "THREAD_CPUTIME_ID",
426 "MONOTONIC_RAW", "REALTIME_COARSE", "MONOTONIC_COARSE", "BOOTTIME",
427 "REALTIME_ALARM", "BOOTTIME_ALARM", "SGI_CYCLE", "TAI"
429 static DEFINE_STRARRAY(clockid);
431 static const char *socket_families[] = {
432 "UNSPEC", "LOCAL", "INET", "AX25", "IPX", "APPLETALK", "NETROM",
433 "BRIDGE", "ATMPVC", "X25", "INET6", "ROSE", "DECnet", "NETBEUI",
434 "SECURITY", "KEY", "NETLINK", "PACKET", "ASH", "ECONET", "ATMSVC",
435 "RDS", "SNA", "IRDA", "PPPOX", "WANPIPE", "LLC", "IB", "CAN", "TIPC",
436 "BLUETOOTH", "IUCV", "RXRPC", "ISDN", "PHONET", "IEEE802154", "CAIF",
437 "ALG", "NFC", "VSOCK",
439 static DEFINE_STRARRAY(socket_families);
/*
 * Beautify access(2)'s mode argument: "F" for F_OK, otherwise a
 * "|"-joined set of R/W/X (the P_FLAG invocations are on lines missing
 * from this sampled listing); leftover unknown bits are appended in hex.
 */
441 static size_t syscall_arg__scnprintf_access_mode(char *bf, size_t size,
442 struct syscall_arg *arg)
447 if (mode == F_OK) /* 0 */
448 return scnprintf(bf, size, "F");
450 if (mode & n##_OK) { \
451 printed += scnprintf(bf + printed, size - printed, "%s", #n); \
461 printed += scnprintf(bf + printed, size - printed, "|%#x", mode);
466 #define SCA_ACCMODE syscall_arg__scnprintf_access_mode
468 static size_t syscall_arg__scnprintf_filename(char *bf, size_t size,
469 struct syscall_arg *arg);
471 #define SCA_FILENAME syscall_arg__scnprintf_filename
/*
 * Beautify pipe2(2)'s flags: print known O_* bits by name, clear each as
 * it is printed, then dump any remaining bits in hex.
 */
473 static size_t syscall_arg__scnprintf_pipe_flags(char *bf, size_t size,
474 struct syscall_arg *arg)
476 int printed = 0, flags = arg->val;
479 if (flags & O_##n) { \
480 printed += scnprintf(bf + printed, size - printed, "%s%s", printed ? "|" : "", #n); \
489 printed += scnprintf(bf + printed, size - printed, "%s%#x", printed ? "|" : "", flags);
494 #define SCA_PIPE_FLAGS syscall_arg__scnprintf_pipe_flags
496 #if defined(__i386__) || defined(__x86_64__)
498 * FIXME: Make this available to all arches.
500 #define TCGETS 0x5401
/*
 * ioctl(2) request names, indexed from TCGETS (0x5401); designated
 * initializers skip over gaps in the terminal-ioctl number space.
 */
502 static const char *tioctls[] = {
503 "TCGETS", "TCSETS", "TCSETSW", "TCSETSF", "TCGETA", "TCSETA", "TCSETAW",
504 "TCSETAF", "TCSBRK", "TCXONC", "TCFLSH", "TIOCEXCL", "TIOCNXCL",
505 "TIOCSCTTY", "TIOCGPGRP", "TIOCSPGRP", "TIOCOUTQ", "TIOCSTI",
506 "TIOCGWINSZ", "TIOCSWINSZ", "TIOCMGET", "TIOCMBIS", "TIOCMBIC",
507 "TIOCMSET", "TIOCGSOFTCAR", "TIOCSSOFTCAR", "FIONREAD", "TIOCLINUX",
508 "TIOCCONS", "TIOCGSERIAL", "TIOCSSERIAL", "TIOCPKT", "FIONBIO",
509 "TIOCNOTTY", "TIOCSETD", "TIOCGETD", "TCSBRKP", [0x27] = "TIOCSBRK",
510 "TIOCCBRK", "TIOCGSID", "TCGETS2", "TCSETS2", "TCSETSW2", "TCSETSF2",
511 "TIOCGRS485", "TIOCSRS485", "TIOCGPTN", "TIOCSPTLCK",
512 "TIOCGDEV||TCGETX", "TCSETX", "TCSETXF", "TCSETXW", "TIOCSIG",
513 "TIOCVHANGUP", "TIOCGPKT", "TIOCGPTLCK", "TIOCGEXCL",
514 [0x50] = "FIONCLEX", "FIOCLEX", "FIOASYNC", "TIOCSERCONFIG",
515 "TIOCSERGWILD", "TIOCSERSWILD", "TIOCGLCKTRMIOS", "TIOCSLCKTRMIOS",
516 "TIOCSERGSTRUCT", "TIOCSERGETLSR", "TIOCSERGETMULTI", "TIOCSERSETMULTI",
517 "TIOCMIWAIT", "TIOCGICOUNT", [0x60] = "FIOQSIZE",
520 static DEFINE_STRARRAY_OFFSET(tioctls, 0x5401);
521 #endif /* defined(__i386__) || defined(__x86_64__) */
/* Fallbacks for headers that predate getrandom(2). */
523 #ifndef GRND_NONBLOCK
524 #define GRND_NONBLOCK 0x0001
527 #define GRND_RANDOM 0x0002
/*
 * Beautify getrandom(2)'s flags; same print-and-clear pattern as the
 * pipe2 flags printer above (P_FLAG invocations on missing lines).
 */
530 static size_t syscall_arg__scnprintf_getrandom_flags(char *bf, size_t size,
531 struct syscall_arg *arg)
533 int printed = 0, flags = arg->val;
536 if (flags & GRND_##n) { \
537 printed += scnprintf(bf + printed, size - printed, "%s%s", printed ? "|" : "", #n); \
538 flags &= ~GRND_##n; \
546 printed += scnprintf(bf + printed, size - printed, "%s%#x", printed ? "|" : "", flags);
551 #define SCA_GETRANDOM_FLAGS syscall_arg__scnprintf_getrandom_flags
/*
 * Shorthand for syscall_fmts[] entries: wire argument 'arg' to the
 * generic strarray printer with the given table as its parameter.
 */
553 #define STRARRAY(arg, name, array) \
554 .arg_scnprintf = { [arg] = SCA_STRARRAY, }, \
555 .arg_parm = { [arg] = &strarray__##array, }
557 #include "trace/beauty/eventfd.c"
558 #include "trace/beauty/flock.c"
559 #include "trace/beauty/futex_op.c"
560 #include "trace/beauty/mmap.c"
561 #include "trace/beauty/mode_t.c"
562 #include "trace/beauty/msg_flags.c"
563 #include "trace/beauty/open_flags.c"
564 #include "trace/beauty/perf_event_open.c"
565 #include "trace/beauty/pid.c"
566 #include "trace/beauty/sched_policy.c"
567 #include "trace/beauty/seccomp.c"
568 #include "trace/beauty/signum.c"
569 #include "trace/beauty/socket_type.c"
570 #include "trace/beauty/waitid_options.c"
/*
 * Per-syscall formatting descriptors.  The table entries below belong to
 * the syscall_fmts[] array, whose declarator line is missing from this
 * sampled listing.  Entries MUST stay sorted by .name: lookups go
 * through bsearch() in syscall_fmt__find().  Known flags (from usage
 * elsewhere in this file): .errmsg prints errno names for negative
 * returns, .errpid treats the return as a pid, .hexret prints the return
 * value in hex, .alias matches an alternate syscall name.
 */
572 static struct syscall_fmt {
575 size_t (*arg_scnprintf[6])(char *bf, size_t size, struct syscall_arg *arg);
582 { .name = "access", .errmsg = true,
583 .arg_scnprintf = { [1] = SCA_ACCMODE, /* mode */ }, },
584 { .name = "arch_prctl", .errmsg = true, .alias = "prctl", },
585 { .name = "bpf", .errmsg = true, STRARRAY(0, cmd, bpf_cmd), },
586 { .name = "brk", .hexret = true,
587 .arg_scnprintf = { [0] = SCA_HEX, /* brk */ }, },
588 { .name = "chdir", .errmsg = true, },
589 { .name = "chmod", .errmsg = true, },
590 { .name = "chroot", .errmsg = true, },
591 { .name = "clock_gettime", .errmsg = true, STRARRAY(0, clk_id, clockid), },
592 { .name = "clone", .errpid = true, },
593 { .name = "close", .errmsg = true,
594 .arg_scnprintf = { [0] = SCA_CLOSE_FD, /* fd */ }, },
595 { .name = "connect", .errmsg = true, },
596 { .name = "creat", .errmsg = true, },
597 { .name = "dup", .errmsg = true, },
598 { .name = "dup2", .errmsg = true, },
599 { .name = "dup3", .errmsg = true, },
600 { .name = "epoll_ctl", .errmsg = true, STRARRAY(1, op, epoll_ctl_ops), },
601 { .name = "eventfd2", .errmsg = true,
602 .arg_scnprintf = { [1] = SCA_EFD_FLAGS, /* flags */ }, },
603 { .name = "faccessat", .errmsg = true, },
604 { .name = "fadvise64", .errmsg = true, },
605 { .name = "fallocate", .errmsg = true, },
606 { .name = "fchdir", .errmsg = true, },
607 { .name = "fchmod", .errmsg = true, },
608 { .name = "fchmodat", .errmsg = true,
609 .arg_scnprintf = { [0] = SCA_FDAT, /* fd */ }, },
610 { .name = "fchown", .errmsg = true, },
611 { .name = "fchownat", .errmsg = true,
612 .arg_scnprintf = { [0] = SCA_FDAT, /* fd */ }, },
613 { .name = "fcntl", .errmsg = true,
614 .arg_scnprintf = { [1] = SCA_STRARRAY, /* cmd */ },
615 .arg_parm = { [1] = &strarray__fcntl_cmds, /* cmd */ }, },
616 { .name = "fdatasync", .errmsg = true, },
617 { .name = "flock", .errmsg = true,
618 .arg_scnprintf = { [1] = SCA_FLOCK, /* cmd */ }, },
619 { .name = "fsetxattr", .errmsg = true, },
620 { .name = "fstat", .errmsg = true, .alias = "newfstat", },
621 { .name = "fstatat", .errmsg = true, .alias = "newfstatat", },
622 { .name = "fstatfs", .errmsg = true, },
623 { .name = "fsync", .errmsg = true, },
624 { .name = "ftruncate", .errmsg = true, },
625 { .name = "futex", .errmsg = true,
626 .arg_scnprintf = { [1] = SCA_FUTEX_OP, /* op */ }, },
627 { .name = "futimesat", .errmsg = true,
628 .arg_scnprintf = { [0] = SCA_FDAT, /* fd */ }, },
629 { .name = "getdents", .errmsg = true, },
630 { .name = "getdents64", .errmsg = true, },
631 { .name = "getitimer", .errmsg = true, STRARRAY(0, which, itimers), },
632 { .name = "getpid", .errpid = true, },
633 { .name = "getpgid", .errpid = true, },
634 { .name = "getppid", .errpid = true, },
635 { .name = "getrandom", .errmsg = true,
636 .arg_scnprintf = { [2] = SCA_GETRANDOM_FLAGS, /* flags */ }, },
637 { .name = "getrlimit", .errmsg = true, STRARRAY(0, resource, rlimit_resources), },
638 { .name = "getxattr", .errmsg = true, },
639 { .name = "inotify_add_watch", .errmsg = true, },
640 { .name = "ioctl", .errmsg = true,
642 #if defined(__i386__) || defined(__x86_64__)
644 * FIXME: Make this available to all arches.
646 [1] = SCA_STRHEXARRAY, /* cmd */
647 [2] = SCA_HEX, /* arg */ },
648 .arg_parm = { [1] = &strarray__tioctls, /* cmd */ }, },
650 [2] = SCA_HEX, /* arg */ }, },
652 { .name = "keyctl", .errmsg = true, STRARRAY(0, option, keyctl_options), },
653 { .name = "kill", .errmsg = true,
654 .arg_scnprintf = { [1] = SCA_SIGNUM, /* sig */ }, },
655 { .name = "lchown", .errmsg = true, },
656 { .name = "lgetxattr", .errmsg = true, },
657 { .name = "linkat", .errmsg = true,
658 .arg_scnprintf = { [0] = SCA_FDAT, /* fd */ }, },
659 { .name = "listxattr", .errmsg = true, },
660 { .name = "llistxattr", .errmsg = true, },
661 { .name = "lremovexattr", .errmsg = true, },
662 { .name = "lseek", .errmsg = true,
663 .arg_scnprintf = { [2] = SCA_STRARRAY, /* whence */ },
664 .arg_parm = { [2] = &strarray__whences, /* whence */ }, },
665 { .name = "lsetxattr", .errmsg = true, },
666 { .name = "lstat", .errmsg = true, .alias = "newlstat", },
667 { .name = "lsxattr", .errmsg = true, },
668 { .name = "madvise", .errmsg = true,
669 .arg_scnprintf = { [0] = SCA_HEX, /* start */
670 [2] = SCA_MADV_BHV, /* behavior */ }, },
671 { .name = "mkdir", .errmsg = true, },
672 { .name = "mkdirat", .errmsg = true,
673 .arg_scnprintf = { [0] = SCA_FDAT, /* fd */ }, },
674 { .name = "mknod", .errmsg = true, },
675 { .name = "mknodat", .errmsg = true,
676 .arg_scnprintf = { [0] = SCA_FDAT, /* fd */ }, },
677 { .name = "mlock", .errmsg = true,
678 .arg_scnprintf = { [0] = SCA_HEX, /* addr */ }, },
679 { .name = "mlockall", .errmsg = true,
680 .arg_scnprintf = { [0] = SCA_HEX, /* addr */ }, },
681 { .name = "mmap", .hexret = true,
682 /* The standard mmap maps to old_mmap on s390x */
683 #if defined(__s390x__)
686 .arg_scnprintf = { [0] = SCA_HEX, /* addr */
687 [2] = SCA_MMAP_PROT, /* prot */
688 [3] = SCA_MMAP_FLAGS, /* flags */ }, },
689 { .name = "mprotect", .errmsg = true,
690 .arg_scnprintf = { [0] = SCA_HEX, /* start */
691 [2] = SCA_MMAP_PROT, /* prot */ }, },
692 { .name = "mq_unlink", .errmsg = true,
693 .arg_scnprintf = { [0] = SCA_FILENAME, /* u_name */ }, },
694 { .name = "mremap", .hexret = true,
695 .arg_scnprintf = { [0] = SCA_HEX, /* addr */
696 [3] = SCA_MREMAP_FLAGS, /* flags */
697 [4] = SCA_HEX, /* new_addr */ }, },
698 { .name = "munlock", .errmsg = true,
699 .arg_scnprintf = { [0] = SCA_HEX, /* addr */ }, },
700 { .name = "munmap", .errmsg = true,
701 .arg_scnprintf = { [0] = SCA_HEX, /* addr */ }, },
702 { .name = "name_to_handle_at", .errmsg = true,
703 .arg_scnprintf = { [0] = SCA_FDAT, /* dfd */ }, },
704 { .name = "newfstatat", .errmsg = true,
705 .arg_scnprintf = { [0] = SCA_FDAT, /* dfd */ }, },
706 { .name = "open", .errmsg = true,
707 .arg_scnprintf = { [1] = SCA_OPEN_FLAGS, /* flags */ }, },
708 { .name = "open_by_handle_at", .errmsg = true,
709 .arg_scnprintf = { [0] = SCA_FDAT, /* dfd */
710 [2] = SCA_OPEN_FLAGS, /* flags */ }, },
711 { .name = "openat", .errmsg = true,
712 .arg_scnprintf = { [0] = SCA_FDAT, /* dfd */
713 [2] = SCA_OPEN_FLAGS, /* flags */ }, },
714 { .name = "perf_event_open", .errmsg = true,
715 .arg_scnprintf = { [2] = SCA_INT, /* cpu */
716 [3] = SCA_FD, /* group_fd */
717 [4] = SCA_PERF_FLAGS, /* flags */ }, },
718 { .name = "pipe2", .errmsg = true,
719 .arg_scnprintf = { [1] = SCA_PIPE_FLAGS, /* flags */ }, },
720 { .name = "poll", .errmsg = true, .timeout = true, },
721 { .name = "ppoll", .errmsg = true, .timeout = true, },
722 { .name = "pread", .errmsg = true, .alias = "pread64", },
723 { .name = "preadv", .errmsg = true, .alias = "pread", },
724 { .name = "prlimit64", .errmsg = true, STRARRAY(1, resource, rlimit_resources), },
725 { .name = "pwrite", .errmsg = true, .alias = "pwrite64", },
726 { .name = "pwritev", .errmsg = true, },
727 { .name = "read", .errmsg = true, },
728 { .name = "readlink", .errmsg = true, },
729 { .name = "readlinkat", .errmsg = true,
730 .arg_scnprintf = { [0] = SCA_FDAT, /* dfd */ }, },
731 { .name = "readv", .errmsg = true, },
732 { .name = "recvfrom", .errmsg = true,
733 .arg_scnprintf = { [3] = SCA_MSG_FLAGS, /* flags */ }, },
734 { .name = "recvmmsg", .errmsg = true,
735 .arg_scnprintf = { [3] = SCA_MSG_FLAGS, /* flags */ }, },
736 { .name = "recvmsg", .errmsg = true,
737 .arg_scnprintf = { [2] = SCA_MSG_FLAGS, /* flags */ }, },
738 { .name = "removexattr", .errmsg = true, },
739 { .name = "renameat", .errmsg = true,
740 .arg_scnprintf = { [0] = SCA_FDAT, /* dfd */ }, },
741 { .name = "rmdir", .errmsg = true, },
742 { .name = "rt_sigaction", .errmsg = true,
743 .arg_scnprintf = { [0] = SCA_SIGNUM, /* sig */ }, },
744 { .name = "rt_sigprocmask", .errmsg = true, STRARRAY(0, how, sighow), },
745 { .name = "rt_sigqueueinfo", .errmsg = true,
746 .arg_scnprintf = { [1] = SCA_SIGNUM, /* sig */ }, },
747 { .name = "rt_tgsigqueueinfo", .errmsg = true,
748 .arg_scnprintf = { [2] = SCA_SIGNUM, /* sig */ }, },
749 { .name = "sched_getattr", .errmsg = true, },
750 { .name = "sched_setattr", .errmsg = true, },
751 { .name = "sched_setscheduler", .errmsg = true,
752 .arg_scnprintf = { [1] = SCA_SCHED_POLICY, /* policy */ }, },
753 { .name = "seccomp", .errmsg = true,
754 .arg_scnprintf = { [0] = SCA_SECCOMP_OP, /* op */
755 [1] = SCA_SECCOMP_FLAGS, /* flags */ }, },
756 { .name = "select", .errmsg = true, .timeout = true, },
757 { .name = "sendmmsg", .errmsg = true,
758 .arg_scnprintf = { [3] = SCA_MSG_FLAGS, /* flags */ }, },
759 { .name = "sendmsg", .errmsg = true,
760 .arg_scnprintf = { [2] = SCA_MSG_FLAGS, /* flags */ }, },
761 { .name = "sendto", .errmsg = true,
762 .arg_scnprintf = { [3] = SCA_MSG_FLAGS, /* flags */ }, },
763 { .name = "set_tid_address", .errpid = true, },
764 { .name = "setitimer", .errmsg = true, STRARRAY(0, which, itimers), },
765 { .name = "setpgid", .errmsg = true, },
766 { .name = "setrlimit", .errmsg = true, STRARRAY(0, resource, rlimit_resources), },
767 { .name = "setxattr", .errmsg = true, },
768 { .name = "shutdown", .errmsg = true, },
769 { .name = "socket", .errmsg = true,
770 .arg_scnprintf = { [0] = SCA_STRARRAY, /* family */
771 [1] = SCA_SK_TYPE, /* type */ },
772 .arg_parm = { [0] = &strarray__socket_families, /* family */ }, },
773 { .name = "socketpair", .errmsg = true,
774 .arg_scnprintf = { [0] = SCA_STRARRAY, /* family */
775 [1] = SCA_SK_TYPE, /* type */ },
776 .arg_parm = { [0] = &strarray__socket_families, /* family */ }, },
777 { .name = "stat", .errmsg = true, .alias = "newstat", },
778 { .name = "statfs", .errmsg = true, },
779 { .name = "swapoff", .errmsg = true,
780 .arg_scnprintf = { [0] = SCA_FILENAME, /* specialfile */ }, },
781 { .name = "swapon", .errmsg = true,
782 .arg_scnprintf = { [0] = SCA_FILENAME, /* specialfile */ }, },
783 { .name = "symlinkat", .errmsg = true,
784 .arg_scnprintf = { [0] = SCA_FDAT, /* dfd */ }, },
785 { .name = "tgkill", .errmsg = true,
786 .arg_scnprintf = { [2] = SCA_SIGNUM, /* sig */ }, },
787 { .name = "tkill", .errmsg = true,
788 .arg_scnprintf = { [1] = SCA_SIGNUM, /* sig */ }, },
789 { .name = "truncate", .errmsg = true, },
790 { .name = "uname", .errmsg = true, .alias = "newuname", },
791 { .name = "unlinkat", .errmsg = true,
792 .arg_scnprintf = { [0] = SCA_FDAT, /* dfd */ }, },
793 { .name = "utime", .errmsg = true, },
794 { .name = "utimensat", .errmsg = true,
795 .arg_scnprintf = { [0] = SCA_FDAT, /* dirfd */ }, },
796 { .name = "utimes", .errmsg = true, },
797 { .name = "vmsplice", .errmsg = true, },
798 { .name = "wait4", .errpid = true,
799 .arg_scnprintf = { [2] = SCA_WAITID_OPTIONS, /* options */ }, },
800 { .name = "waitid", .errpid = true,
801 .arg_scnprintf = { [3] = SCA_WAITID_OPTIONS, /* options */ }, },
802 { .name = "write", .errmsg = true, },
803 { .name = "writev", .errmsg = true, },
/* bsearch comparator: key is a plain name string, element a syscall_fmt. */
806 static int syscall_fmt__cmp(const void *name, const void *fmtp)
808 const struct syscall_fmt *fmt = fmtp;
809 return strcmp(name, fmt->name);
/*
 * Binary-search syscall_fmts[] by name -- relies on the table staying
 * sorted.  Returns NULL when the syscall has no special formatting.
 */
812 static struct syscall_fmt *syscall_fmt__find(const char *name)
814 const int nmemb = ARRAY_SIZE(syscall_fmts);
815 return bsearch(name, syscall_fmts, nmemb, sizeof(struct syscall_fmt), syscall_fmt__cmp);
/* Fields of 'struct syscall' (opening line missing from this listing). */
819 struct event_format *tp_format;
821 struct format_field *args;
824 struct syscall_fmt *fmt;
825 size_t (**arg_scnprintf)(char *bf, size_t size, struct syscall_arg *arg);
830 * We need to have this 'calculated' boolean because in some cases we really
831 * don't know what is the duration of a syscall, for instance, when we start
832 * a session and some threads are waiting for a syscall to finish, say 'poll',
833 * in which case all we can do is to print "( ? ) for duration and for the
/*
 * Print "(<duration> ms): " with color keyed to magnitude (red >= 1ms,
 * yellow >= 0.01ms), or "( ? ): " when the duration is unknown.
 */
836 static size_t fprintf_duration(unsigned long t, bool calculated, FILE *fp)
838 double duration = (double)t / NSEC_PER_MSEC;
839 size_t printed = fprintf(fp, "(");
842 printed += fprintf(fp, " ? ");
843 else if (duration >= 1.0)
844 printed += color_fprintf(fp, PERF_COLOR_RED, "%6.3f ms", duration);
845 else if (duration >= 0.01)
846 printed += color_fprintf(fp, PERF_COLOR_YELLOW, "%6.3f ms", duration);
848 printed += color_fprintf(fp, PERF_COLOR_NORMAL, "%6.3f ms", duration);
849 return printed + fprintf(fp, "): ");
853 * filename.ptr: The filename char pointer that will be vfs_getname'd
854 * filename.entry_str_pos: Where to insert the string translated from
855 * filename.ptr by the vfs_getname tracepoint/kprobe.
/* Per-thread bookkeeping, hung off thread->priv (fields sampled). */
857 struct thread_trace {
861 unsigned long nr_events;
862 unsigned long pfmaj, pfmin; /* major/minor page fault counters */
867 short int entry_str_pos;
869 unsigned int namelen;
877 struct intlist *syscall_stats;
/*
 * Allocate and initialize a thread_trace.  paths.max starts at -1 so the
 * fd->path table in trace__set_fd_pathname() knows it is empty.
 */
880 static struct thread_trace *thread_trace__new(void)
882 struct thread_trace *ttrace = zalloc(sizeof(struct thread_trace));
885 ttrace->paths.max = -1;
887 ttrace->syscall_stats = intlist__new(NULL);
/*
 * Get (lazily creating) the thread's thread_trace; on allocation failure
 * warn once that samples are being dropped and return NULL (error path
 * partially on missing lines).
 */
892 static struct thread_trace *thread__trace(struct thread *thread, FILE *fp)
894 struct thread_trace *ttrace;
899 if (thread__priv(thread) == NULL)
900 thread__set_priv(thread, thread_trace__new())
902 if (thread__priv(thread) == NULL)
905 ttrace = thread__priv(thread);
910 color_fprintf(fp, PERF_COLOR_RED,
911 "WARNING: not enough memory, dropping samples!\n");
/* Bits for selecting which page-fault flavors to trace. */
915 #define TRACE_PFMAJ (1 << 0)
916 #define TRACE_PFMIN (1 << 1)
918 static const size_t trace__entry_str_size = 2048;
/*
 * Record 'pathname' for fd in the thread's fd->path table, growing the
 * table with realloc as needed and zeroing the newly exposed slots.
 * Returns 0 on success, -1 when the strdup fails.
 */
920 static int trace__set_fd_pathname(struct thread *thread, int fd, const char *pathname)
922 struct thread_trace *ttrace = thread__priv(thread);
924 if (fd > ttrace->paths.max) {
925 char **npath = realloc(ttrace->paths.table, (fd + 1) * sizeof(char *));
930 if (ttrace->paths.max != -1) {
/* Zero only the slots beyond the old maximum. */
931 memset(npath + ttrace->paths.max + 1, 0,
932 (fd - ttrace->paths.max) * sizeof(char *));
/* First allocation: zero the whole table. */
934 memset(npath, 0, (fd + 1) * sizeof(char *));
937 ttrace->paths.table = npath;
938 ttrace->paths.max = fd;
941 ttrace->paths.table[fd] = strdup(pathname);
943 return ttrace->paths.table[fd] != NULL ? 0 : -1;
/*
 * Resolve an fd by readlink()ing /proc/<pid>/fd/<fd> (or the per-task
 * path when pid != tid) and cache it via trace__set_fd_pathname().
 */
946 static int thread__read_fd_path(struct thread *thread, int fd)
948 char linkname[PATH_MAX], pathname[PATH_MAX];
952 if (thread->pid_ == thread->tid) {
953 scnprintf(linkname, sizeof(linkname),
954 "/proc/%d/fd/%d", thread->pid_, fd);
956 scnprintf(linkname, sizeof(linkname),
957 "/proc/%d/task/%d/fd/%d", thread->pid_, thread->tid, fd);
/* Bail if the link target wouldn't fit (readlink doesn't terminate). */
960 if (lstat(linkname, &st) < 0 || st.st_size + 1 > (off_t)sizeof(pathname))
963 ret = readlink(linkname, pathname, sizeof(pathname));
965 if (ret < 0 || ret > st.st_size)
/* readlink() does not NUL-terminate; do it ourselves. */
968 pathname[ret] = '\0';
969 return trace__set_fd_pathname(thread, fd, pathname);
/*
 * Return the cached path for fd, reading it from /proc on a cache miss
 * and counting the lookup in trace->stats.proc_getname.
 */
972 static const char *thread__fd_path(struct thread *thread, int fd,
975 struct thread_trace *ttrace = thread__priv(thread);
983 if ((fd > ttrace->paths.max || ttrace->paths.table[fd] == NULL)) {
986 ++trace->stats.proc_getname;
987 if (thread__read_fd_path(thread, fd))
991 return ttrace->paths.table[fd];
/*
 * Print an fd argument as "<num><path>", appending the resolved path in
 * angle brackets when thread__fd_path() knows it.
 */
994 static size_t syscall_arg__scnprintf_fd(char *bf, size_t size,
995 struct syscall_arg *arg)
998 size_t printed = scnprintf(bf, size, "%d", fd);
999 const char *path = thread__fd_path(arg->thread, fd, arg->trace);
1002 printed += scnprintf(bf + printed, size - printed, "<%s>", path);
/*
 * close(2) variant: print like SCA_FD, then drop the cached path since
 * the fd number may be reused for a different file afterwards.
 */
1007 static size_t syscall_arg__scnprintf_close_fd(char *bf, size_t size,
1008 struct syscall_arg *arg)
1011 size_t printed = syscall_arg__scnprintf_fd(bf, size, arg);
1012 struct thread_trace *ttrace = thread__priv(arg->thread);
1014 if (ttrace && fd >= 0 && fd <= ttrace->paths.max)
1015 zfree(&ttrace->paths.table[fd]);
/*
 * Remember where in the thread's entry string the filename should be
 * spliced once the vfs_getname event delivers the resolved string.
 */
1020 static void thread__set_filename_pos(struct thread *thread, const char *bf,
1023 struct thread_trace *ttrace = thread__priv(thread);
1025 ttrace->filename.ptr = ptr;
1026 ttrace->filename.entry_str_pos = bf - ttrace->entry_str;
/*
 * Filename argument printer: without vfs_getname support just print the
 * user pointer in hex; otherwise record the splice position for later.
 */
1029 static size_t syscall_arg__scnprintf_filename(char *bf, size_t size,
1030 struct syscall_arg *arg)
1032 unsigned long ptr = arg->val;
1034 if (!arg->trace->vfs_getname)
1035 return scnprintf(bf, size, "%#x", ptr);
1037 thread__set_filename_pos(arg->thread, bf, ptr);
/* True when the syscall took less than the user's --duration threshold (ms). */
1041 static bool trace__filter_duration(struct trace *trace, double t)
1043 return t < (trace->duration_filter * NSEC_PER_MSEC);
/* Print a timestamp in ms relative to the session's base_time. */
1046 static size_t __trace__fprintf_tstamp(struct trace *trace, u64 tstamp, FILE *fp)
1048 double ts = (double)(tstamp - trace->base_time) / NSEC_PER_MSEC;
1050 return fprintf(fp, "%10.3f ", ts);
1054 * We're handling tstamp=0 as an undefined tstamp, i.e. like when we are
1055 * using ttrace->entry_time for a thread that receives a sys_exit without
1056 * first having received a sys_enter ("poll" issued before tracing session
1057 * starts, lost sys_enter exit due to ring buffer overflow).
1059 static size_t trace__fprintf_tstamp(struct trace *trace, u64 tstamp, FILE *fp)
1062 return __trace__fprintf_tstamp(trace, tstamp, fp);
1064 return fprintf(fp, " ? ");
/* Set by sig_handler(); polled by the main loop to stop tracing. */
1067 static bool done = false;
1068 static bool interrupted = false;
/* SIGINT sets both flags; other signals only request an orderly stop. */
1070 static void sig_handler(int sig)
1073 interrupted = sig == SIGINT;
/*
 * Print the per-event line prefix: timestamp, duration, and -- when more
 * than one thread is traced -- the comm/tid of the thread.
 */
1076 static size_t trace__fprintf_entry_head(struct trace *trace, struct thread *thread,
1077 u64 duration, bool duration_calculated, u64 tstamp, FILE *fp)
1079 size_t printed = trace__fprintf_tstamp(trace, tstamp, fp);
1080 printed += fprintf_duration(duration, duration_calculated, fp);
1082 if (trace->multiple_threads) {
1083 if (trace->show_comm)
1084 printed += fprintf(fp, "%.14s/", thread__comm_str(thread));
1085 printed += fprintf(fp, "%d ", thread->tid);
/*
 * Dispatch a non-sample event: loudly report PERF_RECORD_LOST (dropped
 * ring-buffer events), then let the machine layer do its bookkeeping.
 */
1091 static int trace__process_event(struct trace *trace, struct machine *machine,
1092 union perf_event *event, struct perf_sample *sample)
1096 switch (event->header.type) {
1097 case PERF_RECORD_LOST:
1098 color_fprintf(trace->output, PERF_COLOR_RED,
1099 "LOST %" PRIu64 " events!\n", event->lost.lost);
1100 ret = machine__process_lost_event(machine, event, sample);
1103 ret = machine__process_event(machine, event, sample);
/* perf_tool callback shim: recover 'struct trace' from the tool pointer. */
1110 static int trace__tool_process(struct perf_tool *tool,
1111 union perf_event *event,
1112 struct perf_sample *sample,
1113 struct machine *machine)
1115 struct trace *trace = container_of(tool, struct trace, tool);
1116 return trace__process_event(trace, machine, event, sample);
/*
 * Kernel address resolver that warns (once per machine) when
 * kptr_restrict hides kernel symbols, then defers to the default
 * resolver.
 */
1119 static char *trace__machine__resolve_kernel_addr(void *vmachine, unsigned long long *addrp, char **modp)
1121 struct machine *machine = vmachine;
1123 if (machine->kptr_restrict_warned)
1126 if (symbol_conf.kptr_restrict) {
1127 pr_warning("Kernel address maps (/proc/{kallsyms,modules}) are restricted.\n\n"
1128 "Check /proc/sys/kernel/kptr_restrict.\n\n"
1129 "Kernel samples will not be resolved.\n");
1130 machine->kptr_restrict_warned = true;
1134 return machine__resolve_kernel_addr(vmachine, addrp, modp);
/*
 * Set up symbol resolution: init the symbol subsystem, create the host
 * machine, register the kptr_restrict-aware resolver and synthesize
 * existing threads so their maps are known.
 */
1137 static int trace__symbols_init(struct trace *trace, struct perf_evlist *evlist)
1139 int err = symbol__init(NULL);
1144 trace->host = machine__new_host();
1145 if (trace->host == NULL)
1148 if (trace_event__register_resolver(trace->host, trace__machine__resolve_kernel_addr) < 0)
1151 err = __machine__synthesize_threads(trace->host, &trace->tool, &trace->opts.target,
1152 evlist->threads, trace__tool_process, false,
1153 trace->opts.proc_map_timeout);
/*
 * Pick a pretty-printer (scnprintf callback) for each argument of a syscall,
 * walking the tracepoint format fields: explicit per-syscall formatters from
 * sc->fmt win, then heuristics by field type/name (filenames, pointers,
 * pid_t, umode_t, and *fd-named integer fields).
 */
1160 static int syscall__set_arg_fmts(struct syscall *sc)
1162 struct format_field *field;
1165 sc->arg_scnprintf = calloc(sc->nr_args, sizeof(void *));
1166 if (sc->arg_scnprintf == NULL)
1170 sc->arg_parm = sc->fmt->arg_parm;
1172 for (field = sc->args; field; field = field->next) {
1173 if (sc->fmt && sc->fmt->arg_scnprintf[idx])
1174 sc->arg_scnprintf[idx] = sc->fmt->arg_scnprintf[idx];
1175 else if (strcmp(field->type, "const char *") == 0 &&
1176 (strcmp(field->name, "filename") == 0 ||
1177 strcmp(field->name, "path") == 0 ||
1178 strcmp(field->name, "pathname") == 0))
1179 sc->arg_scnprintf[idx] = SCA_FILENAME;
1180 else if (field->flags & FIELD_IS_POINTER)
1181 sc->arg_scnprintf[idx] = syscall_arg__scnprintf_hex;
1182 else if (strcmp(field->type, "pid_t") == 0)
1183 sc->arg_scnprintf[idx] = SCA_PID;
1184 else if (strcmp(field->type, "umode_t") == 0)
1185 sc->arg_scnprintf[idx] = SCA_MODE_T;
1186 else if ((strcmp(field->type, "int") == 0 ||
1187 strcmp(field->type, "unsigned int") == 0 ||
1188 strcmp(field->type, "long") == 0) &&
1189 (len = strlen(field->name)) >= 2 &&
1190 strcmp(field->name + len - 2, "fd") == 0) {
1192 * /sys/kernel/tracing/events/syscalls/sys_enter*
1193 * egrep 'field:.*fd;' .../format|sed -r 's/.*field:([a-z ]+) [a-z_]*fd.+/\1/g'|sort|uniq -c
1198 sc->arg_scnprintf[idx] = SCA_FD;
/*
 * Lazily populate trace->syscalls.table[id]: grow the table if 'id' is past
 * the current max (zeroing the newly added slots), resolve the syscall name
 * and its "syscalls:sys_enter_<name>" tracepoint format (falling back to the
 * fmt alias), strip the leading syscall-number field, and install the
 * per-argument formatters via syscall__set_arg_fmts().
 */
1206 static int trace__read_syscall_info(struct trace *trace, int id)
1210 const char *name = syscalltbl__name(trace->sctbl, id);
1215 if (id > trace->syscalls.max) {
1216 struct syscall *nsyscalls = realloc(trace->syscalls.table, (id + 1) * sizeof(*sc));
1218 if (nsyscalls == NULL)
1221 if (trace->syscalls.max != -1) {
1222 memset(nsyscalls + trace->syscalls.max + 1, 0,
1223 (id - trace->syscalls.max) * sizeof(*sc));
1225 memset(nsyscalls, 0, (id + 1) * sizeof(*sc));
1228 trace->syscalls.table = nsyscalls;
1229 trace->syscalls.max = id;
1232 sc = trace->syscalls.table + id;
1235 sc->fmt = syscall_fmt__find(sc->name);
1237 snprintf(tp_name, sizeof(tp_name), "sys_enter_%s", sc->name);
1238 sc->tp_format = trace_event__tp_format("syscalls", tp_name);
1240 if (IS_ERR(sc->tp_format) && sc->fmt && sc->fmt->alias) {
1241 snprintf(tp_name, sizeof(tp_name), "sys_enter_%s", sc->fmt->alias);
1242 sc->tp_format = trace_event__tp_format("syscalls", tp_name);
1245 if (IS_ERR(sc->tp_format))
1248 sc->args = sc->tp_format->format.fields;
1249 sc->nr_args = sc->tp_format->format.nr_fields;
1251 * We need to check and discard the first variable '__syscall_nr'
1252 * or 'nr' that mean the syscall number. It is needless here.
1253 * So drop '__syscall_nr' or 'nr' field but does not exist on older kernels.
1255 if (sc->args && (!strcmp(sc->args->name, "__syscall_nr") || !strcmp(sc->args->name, "nr"))) {
1256 sc->args = sc->args->next;
1260 sc->is_exit = !strcmp(name, "exit_group") || !strcmp(name, "exit");
1262 return syscall__set_arg_fmts(sc);
/*
 * Translate the -e/--expr syscall-name qualifier list into syscall ids in
 * trace->ev_qualifier_ids.  Unknown names are reported (with a hint) and the
 * id array is freed on error.
 */
1265 static int trace__validate_ev_qualifier(struct trace *trace)
1268 struct str_node *pos;
1270 trace->ev_qualifier_ids.nr = strlist__nr_entries(trace->ev_qualifier);
1271 trace->ev_qualifier_ids.entries = malloc(trace->ev_qualifier_ids.nr *
1272 sizeof(trace->ev_qualifier_ids.entries[0]));
1274 if (trace->ev_qualifier_ids.entries == NULL) {
1275 fputs("Error:\tNot enough memory for allocating events qualifier ids\n",
1283 strlist__for_each_entry(pos, trace->ev_qualifier) {
1284 const char *sc = pos->s;
1285 int id = syscalltbl__id(trace->sctbl, sc);
1289 fputs("Error:\tInvalid syscall ", trace->output);
1292 fputs(", ", trace->output);
1295 fputs(sc, trace->output);
1298 trace->ev_qualifier_ids.entries[i++] = id;
1302 fputs("\nHint:\ttry 'perf list syscalls:sys_enter_*'"
1303 "\nHint:\tand: 'man syscalls'\n", trace->output);
1304 zfree(&trace->ev_qualifier_ids.entries);
1305 trace->ev_qualifier_ids.nr = 0;
1312 * args is to be interpreted as a series of longs but we need to handle
1313 * 8-byte unaligned accesses. args points to raw_data within the event
1314 * and raw_data is guaranteed to be 8-byte unaligned because it is
1315 * preceded by raw_size which is a u32. So we need to copy args to a temp
1316 * variable to read it. Most notably this avoids extended load instructions
1317 * on unaligned addresses
/*
 * Format a syscall's arguments into 'bf'.  When the tracepoint format is
 * known, each field is printed with its chosen formatter (zero-valued
 * arguments without a string mapping are suppressed); otherwise the raw
 * longs are dumped.  Returns the number of characters written.
 */
1320 static size_t syscall__scnprintf_args(struct syscall *sc, char *bf, size_t size,
1321 unsigned char *args, struct trace *trace,
1322 struct thread *thread)
1328 if (sc->args != NULL) {
1329 struct format_field *field;
1331 struct syscall_arg arg = {
1338 for (field = sc->args; field;
1339 field = field->next, ++arg.idx, bit <<= 1) {
1343 /* special care for unaligned accesses */
1344 p = args + sizeof(unsigned long) * arg.idx;
1345 memcpy(&val, p, sizeof(val));
1348 * Suppress this argument if its value is zero and
1349 * we don't have a string associated in an
1353 !(sc->arg_scnprintf &&
1354 sc->arg_scnprintf[arg.idx] == SCA_STRARRAY &&
1355 sc->arg_parm[arg.idx]))
1358 printed += scnprintf(bf + printed, size - printed,
1359 "%s%s: ", printed ? ", " : "", field->name);
1360 if (sc->arg_scnprintf && sc->arg_scnprintf[arg.idx]) {
1363 arg.parm = sc->arg_parm[arg.idx];
1364 printed += sc->arg_scnprintf[arg.idx](bf + printed,
1365 size - printed, &arg);
1367 printed += scnprintf(bf + printed, size - printed,
1371 } else if (IS_ERR(sc->tp_format)) {
1373 * If we managed to read the tracepoint /format file, then we
1374 * may end up not having any args, like with gettid(), so only
1375 * print the raw args when we didn't manage to read it.
1380 /* special care for unaligned accesses */
1381 p = args + sizeof(unsigned long) * i;
1382 memcpy(&val, p, sizeof(val));
1383 printed += scnprintf(bf + printed, size - printed,
1385 printed ? ", " : "", i, val);
1393 typedef int (*tracepoint_handler)(struct trace *trace, struct perf_evsel *evsel,
1394 union perf_event *event,
1395 struct perf_sample *sample);
/*
 * Return the syscall descriptor for 'id', reading it lazily via
 * trace__read_syscall_info() on first use.  Negative ids (seen on some
 * kernels, see the comment below) and lookup failures yield NULL after a
 * diagnostic on trace->output.
 */
1397 static struct syscall *trace__syscall_info(struct trace *trace,
1398 struct perf_evsel *evsel, int id)
1404 * XXX: Noticed on x86_64, reproduced as far back as 3.0.36, haven't tried
1405 * before that, leaving at a higher verbosity level till that is
1406 * explained. Reproduced with plain ftrace with:
1408 * echo 1 > /t/events/raw_syscalls/sys_exit/enable
1409 * grep "NR -1 " /t/trace_pipe
1411 * After generating some load on the machine.
1415 fprintf(trace->output, "Invalid syscall %d id, skipping (%s, %" PRIu64 ") ...\n",
1416 id, perf_evsel__name(evsel), ++n);
1421 if ((id > trace->syscalls.max || trace->syscalls.table[id].name == NULL) &&
1422 trace__read_syscall_info(trace, id))
1425 if ((id > trace->syscalls.max || trace->syscalls.table[id].name == NULL))
1428 return &trace->syscalls.table[id];
1432 fprintf(trace->output, "Problems reading syscall %d", id);
1433 if (id <= trace->syscalls.max && trace->syscalls.table[id].name != NULL)
1434 fprintf(trace->output, "(%s)", trace->syscalls.table[id].name);
1435 fputs(" information\n", trace->output);
/*
 * Account one completed syscall into the per-thread, per-syscall-id stats
 * (used by --summary), allocating the stats node on first sight of 'id'.
 * The duration is sample->time - entry_time when an entry was recorded.
 */
1440 static void thread__update_stats(struct thread_trace *ttrace,
1441 int id, struct perf_sample *sample)
1443 struct int_node *inode;
1444 struct stats *stats;
1447 inode = intlist__findnew(ttrace->syscall_stats, id);
1451 stats = inode->priv;
1452 if (stats == NULL) {
1453 stats = malloc(sizeof(struct stats));
1457 inode->priv = stats;
1460 if (ttrace->entry_time && sample->time > ttrace->entry_time)
1461 duration = sample->time - ttrace->entry_time;
1463 update_stats(stats, duration);
/*
 * If the current thread has a syscall entry whose exit we have not seen yet,
 * flush that pending entry line with a trailing ") ..." marker (the syscall
 * was interrupted by another event) and clear entry_pending.
 */
1466 static int trace__printf_interrupted_entry(struct trace *trace, struct perf_sample *sample)
1468 struct thread_trace *ttrace;
1472 if (trace->current == NULL)
1475 ttrace = thread__priv(trace->current);
1477 if (!ttrace->entry_pending)
1480 duration = sample->time - ttrace->entry_time;
1482 printed = trace__fprintf_entry_head(trace, trace->current, duration, true, ttrace->entry_time, trace->output);
1483 printed += fprintf(trace->output, "%-70s) ...\n", ttrace->entry_str);
1484 ttrace->entry_pending = false;
/*
 * raw_syscalls:sys_enter handler: format "<name>(<args>" into the thread's
 * entry_str buffer.  The line is printed now only when no duration filter,
 * summary-only or min-stack constraint applies; otherwise it is kept pending
 * so trace__sys_exit() can complete it with the return value.
 */
1489 static int trace__sys_enter(struct trace *trace, struct perf_evsel *evsel,
1490 union perf_event *event __maybe_unused,
1491 struct perf_sample *sample)
1496 struct thread *thread;
1497 int id = perf_evsel__sc_tp_uint(evsel, id, sample), err = -1;
1498 struct syscall *sc = trace__syscall_info(trace, evsel, id);
1499 struct thread_trace *ttrace;
1504 thread = machine__findnew_thread(trace->host, sample->pid, sample->tid);
1505 ttrace = thread__trace(thread, trace->output);
1509 args = perf_evsel__sc_tp_ptr(evsel, args, sample);
1511 if (ttrace->entry_str == NULL) {
1512 ttrace->entry_str = malloc(trace__entry_str_size);
1513 if (!ttrace->entry_str)
1517 if (!(trace->duration_filter || trace->summary_only || trace->min_stack))
1518 trace__printf_interrupted_entry(trace, sample);
1520 ttrace->entry_time = sample->time;
1521 msg = ttrace->entry_str;
1522 printed += scnprintf(msg + printed, trace__entry_str_size - printed, "%s(", sc->name);
1524 printed += syscall__scnprintf_args(sc, msg + printed, trace__entry_str_size - printed,
1525 args, trace, thread);
1528 if (!(trace->duration_filter || trace->summary_only || trace->min_stack)) {
1529 trace__fprintf_entry_head(trace, thread, 0, false, ttrace->entry_time, trace->output);
1530 fprintf(trace->output, "%-70s)\n", ttrace->entry_str);
1533 ttrace->entry_pending = true;
1534 /* See trace__vfs_getname & trace__sys_exit */
1535 ttrace->filename.pending_open = false;
1538 if (trace->current != thread) {
1539 thread__put(trace->current);
1540 trace->current = thread__get(thread);
1544 thread__put(thread);
/*
 * Resolve the sample's callchain into 'cursor', limited to
 * trace->max_stack entries, after resolving the sample's thread/maps.
 */
1548 static int trace__resolve_callchain(struct trace *trace, struct perf_evsel *evsel,
1549 struct perf_sample *sample,
1550 struct callchain_cursor *cursor)
1552 struct addr_location al;
1554 if (machine__resolve(trace->host, &al, sample) < 0 ||
1555 thread__resolve_callchain(al.thread, cursor, evsel, sample, NULL, NULL, trace->max_stack))
/*
 * Print the previously-resolved callchain (in the global callchain_cursor)
 * to trace->output, indented 38 columns, with symbol names and unknown
 * entries shown as raw addresses.
 */
1561 static int trace__fprintf_callchain(struct trace *trace, struct perf_sample *sample)
1563 /* TODO: user-configurable print_opts */
1564 const unsigned int print_opts = EVSEL__PRINT_SYM |
1566 EVSEL__PRINT_UNKNOWN_AS_ADDR;
1568 return sample__fprintf_callchain(sample, 38, print_opts, &callchain_cursor, trace->output);
/*
 * raw_syscalls:sys_exit handler: update per-thread stats, associate a
 * pending vfs_getname pathname with the returned fd for open-like syscalls,
 * apply the duration filter, optionally resolve/print the callchain, and
 * complete (or re-print as "continued") the entry line with the formatted
 * return value: errno name+message, "Timeout", hex, child pid+comm, or a
 * plain long, depending on the syscall's fmt flags.
 */
1571 static int trace__sys_exit(struct trace *trace, struct perf_evsel *evsel,
1572 union perf_event *event __maybe_unused,
1573 struct perf_sample *sample)
1577 bool duration_calculated = false;
1578 struct thread *thread;
1579 int id = perf_evsel__sc_tp_uint(evsel, id, sample), err = -1, callchain_ret = 0;
1580 struct syscall *sc = trace__syscall_info(trace, evsel, id);
1581 struct thread_trace *ttrace;
1586 thread = machine__findnew_thread(trace->host, sample->pid, sample->tid);
1587 ttrace = thread__trace(thread, trace->output);
1592 thread__update_stats(ttrace, id, sample);
1594 ret = perf_evsel__sc_tp_uint(evsel, ret, sample);
/* open() returned an fd: remember its pathname (from vfs_getname) */
1596 if (id == trace->open_id && ret >= 0 && ttrace->filename.pending_open) {
1597 trace__set_fd_pathname(thread, ret, ttrace->filename.name);
1598 ttrace->filename.pending_open = false;
1599 ++trace->stats.vfs_getname;
1602 ttrace->exit_time = sample->time;
1604 if (ttrace->entry_time) {
1605 duration = sample->time - ttrace->entry_time;
1606 if (trace__filter_duration(trace, duration))
1608 duration_calculated = true;
1609 } else if (trace->duration_filter)
1612 if (sample->callchain) {
1613 callchain_ret = trace__resolve_callchain(trace, evsel, sample, &callchain_cursor);
1614 if (callchain_ret == 0) {
1615 if (callchain_cursor.nr < trace->min_stack)
1621 if (trace->summary_only)
1624 trace__fprintf_entry_head(trace, thread, duration, duration_calculated, ttrace->entry_time, trace->output);
1626 if (ttrace->entry_pending) {
1627 fprintf(trace->output, "%-70s", ttrace->entry_str);
1629 fprintf(trace->output, " ... [");
1630 color_fprintf(trace->output, PERF_COLOR_YELLOW, "continued");
1631 fprintf(trace->output, "]: %s()", sc->name);
1634 if (sc->fmt == NULL) {
1636 fprintf(trace->output, ") = %ld", ret);
1637 } else if (ret < 0 && (sc->fmt->errmsg || sc->fmt->errpid)) {
1638 char bf[STRERR_BUFSIZE];
1639 const char *emsg = str_error_r(-ret, bf, sizeof(bf)),
1640 *e = audit_errno_to_name(-ret);
1642 fprintf(trace->output, ") = -1 %s %s", e, emsg);
1643 } else if (ret == 0 && sc->fmt->timeout)
1644 fprintf(trace->output, ") = 0 Timeout");
1645 else if (sc->fmt->hexret)
1646 fprintf(trace->output, ") = %#lx", ret);
1647 else if (sc->fmt->errpid) {
1648 struct thread *child = machine__find_thread(trace->host, ret, ret);
1650 if (child != NULL) {
1651 fprintf(trace->output, ") = %ld", ret);
1652 if (child->comm_set)
1653 fprintf(trace->output, " (%s)", thread__comm_str(child));
1659 fputc('\n', trace->output);
1661 if (callchain_ret > 0)
1662 trace__fprintf_callchain(trace, sample);
1663 else if (callchain_ret < 0)
1664 pr_err("Problem processing %s callchain, skipping...\n", perf_evsel__name(evsel));
1666 ttrace->entry_pending = false;
1669 thread__put(thread);
/*
 * probe:vfs_getname handler: stash the resolved pathname on the thread (for
 * fd association in trace__sys_exit) and, when an entry line is pending with
 * a placeholder at filename.entry_str_pos, splice the pathname into
 * entry_str in place, truncating its start if space is short.
 */
1673 static int trace__vfs_getname(struct trace *trace, struct perf_evsel *evsel,
1674 union perf_event *event __maybe_unused,
1675 struct perf_sample *sample)
1677 struct thread *thread = machine__findnew_thread(trace->host, sample->pid, sample->tid);
1678 struct thread_trace *ttrace;
1679 size_t filename_len, entry_str_len, to_move;
1680 ssize_t remaining_space;
1682 const char *filename = perf_evsel__rawptr(evsel, sample, "pathname");
1687 ttrace = thread__priv(thread);
1691 filename_len = strlen(filename);
1693 if (ttrace->filename.namelen < filename_len) {
1694 char *f = realloc(ttrace->filename.name, filename_len + 1);
1699 ttrace->filename.namelen = filename_len;
1700 ttrace->filename.name = f;
1703 strcpy(ttrace->filename.name, filename);
1704 ttrace->filename.pending_open = true;
1706 if (!ttrace->filename.ptr)
1709 entry_str_len = strlen(ttrace->entry_str);
1710 remaining_space = trace__entry_str_size - entry_str_len - 1; /* \0 */
1711 if (remaining_space <= 0)
/* keep the tail of the pathname when it doesn't all fit */
1714 if (filename_len > (size_t)remaining_space) {
1715 filename += filename_len - remaining_space;
1716 filename_len = remaining_space;
1719 to_move = entry_str_len - ttrace->filename.entry_str_pos + 1; /* \0 */
1720 pos = ttrace->entry_str + ttrace->filename.entry_str_pos;
1721 memmove(pos + filename_len, pos, to_move);
1722 memcpy(pos, filename, filename_len);
1724 ttrace->filename.ptr = 0;
1725 ttrace->filename.entry_str_pos = 0;
/*
 * sched:sched_stat_runtime handler: accumulate on-CPU runtime (in ms) into
 * both the per-thread and global totals shown in the summary.  On failure to
 * get per-thread storage the raw event fields are dumped instead.
 */
1730 static int trace__sched_stat_runtime(struct trace *trace, struct perf_evsel *evsel,
1731 union perf_event *event __maybe_unused,
1732 struct perf_sample *sample)
1734 u64 runtime = perf_evsel__intval(evsel, sample, "runtime");
1735 double runtime_ms = (double)runtime / NSEC_PER_MSEC;
1736 struct thread *thread = machine__findnew_thread(trace->host,
1739 struct thread_trace *ttrace = thread__trace(thread, trace->output);
1744 ttrace->runtime_ms += runtime_ms;
1745 trace->runtime_ms += runtime_ms;
1746 thread__put(thread);
1750 fprintf(trace->output, "%s: comm=%s,pid=%u,runtime=%" PRIu64 ",vruntime=%" PRIu64 ")\n",
1752 perf_evsel__strval(evsel, sample, "comm"),
1753 (pid_t)perf_evsel__intval(evsel, sample, "pid"),
1755 perf_evsel__intval(evsel, sample, "vruntime"));
1756 thread__put(thread);
/*
 * print_binary() callback for BPF output events: emit printable characters
 * as-is and everything else as '.'; all other binary-printer ops (addresses,
 * separators, padding) are ignored.
 */
1760 static void bpf_output__printer(enum binary_printer_ops op,
1761 unsigned int val, void *extra)
1763 FILE *output = extra;
1764 unsigned char ch = (unsigned char)val;
1767 case BINARY_PRINT_CHAR_DATA:
1768 fprintf(output, "%c", isprint(ch) ? ch : '.');
1770 case BINARY_PRINT_DATA_BEGIN:
1771 case BINARY_PRINT_LINE_BEGIN:
1772 case BINARY_PRINT_ADDR:
1773 case BINARY_PRINT_NUM_DATA:
1774 case BINARY_PRINT_NUM_PAD:
1775 case BINARY_PRINT_SEP:
1776 case BINARY_PRINT_CHAR_PAD:
1777 case BINARY_PRINT_LINE_END:
1778 case BINARY_PRINT_DATA_END:
/* Dump a BPF output event's raw payload, 8 bytes per line, to trace->output. */
1784 static void bpf_output__fprintf(struct trace *trace,
1785 struct perf_sample *sample)
1787 print_binary(sample->raw_data, sample->raw_size, 8,
1788 bpf_output__printer, trace->output);
/*
 * Generic handler for --event tracepoints/events: flush any pending syscall
 * entry line, print timestamp and event name, then either dump the BPF
 * output payload or pretty-print the tracepoint fields, followed by the
 * callchain if one was resolved.
 */
1791 static int trace__event_handler(struct trace *trace, struct perf_evsel *evsel,
1792 union perf_event *event __maybe_unused,
1793 struct perf_sample *sample)
1795 int callchain_ret = 0;
1797 if (sample->callchain) {
1798 callchain_ret = trace__resolve_callchain(trace, evsel, sample, &callchain_cursor);
1799 if (callchain_ret == 0) {
1800 if (callchain_cursor.nr < trace->min_stack)
1806 trace__printf_interrupted_entry(trace, sample);
1807 trace__fprintf_tstamp(trace, sample->time, trace->output);
1809 if (trace->trace_syscalls)
1810 fprintf(trace->output, "( ): ");
1812 fprintf(trace->output, "%s:", evsel->name);
1814 if (perf_evsel__is_bpf_output(evsel)) {
1815 bpf_output__fprintf(trace, sample);
1816 } else if (evsel->tp_format) {
1817 event_format__fprintf(evsel->tp_format, sample->cpu,
1818 sample->raw_data, sample->raw_size,
1822 fprintf(trace->output, ")\n");
1824 if (callchain_ret > 0)
1825 trace__fprintf_callchain(trace, sample);
1826 else if (callchain_ret < 0)
1827 pr_err("Problem processing %s callchain, skipping...\n", perf_evsel__name(evsel));
/*
 * Print a fault address location: optionally "dso@" and "sym+0xoff",
 * falling back to the raw resolved address or, failing that, the raw
 * sample address.  'verbose' forces both dso and symbol output.
 */
1832 static void print_location(FILE *f, struct perf_sample *sample,
1833 struct addr_location *al,
1834 bool print_dso, bool print_sym)
1837 if ((verbose || print_dso) && al->map)
1838 fprintf(f, "%s@", al->map->dso->long_name);
1840 if ((verbose || print_sym) && al->sym)
1841 fprintf(f, "%s+0x%" PRIx64, al->sym->name,
1842 al->addr - al->sym->start);
1844 fprintf(f, "0x%" PRIx64, al->addr);
1846 fprintf(f, "0x%" PRIx64, sample->addr);
/*
 * Page-fault software event handler: print "majfault"/"minfault" with the
 * faulting IP location and the faulted data/code address (map_type 'd' for
 * data maps, resolved via MAP__VARIABLE first, then MAP__FUNCTION), plus the
 * callchain when available.
 */
1849 static int trace__pgfault(struct trace *trace,
1850 struct perf_evsel *evsel,
1851 union perf_event *event __maybe_unused,
1852 struct perf_sample *sample)
1854 struct thread *thread;
1855 struct addr_location al;
1856 char map_type = 'd';
1857 struct thread_trace *ttrace;
1859 int callchain_ret = 0;
1861 thread = machine__findnew_thread(trace->host, sample->pid, sample->tid);
1863 if (sample->callchain) {
1864 callchain_ret = trace__resolve_callchain(trace, evsel, sample, &callchain_cursor);
1865 if (callchain_ret == 0) {
1866 if (callchain_cursor.nr < trace->min_stack)
1872 ttrace = thread__trace(thread, trace->output);
1876 if (evsel->attr.config == PERF_COUNT_SW_PAGE_FAULTS_MAJ)
1881 if (trace->summary_only)
1884 thread__find_addr_location(thread, sample->cpumode, MAP__FUNCTION,
1887 trace__fprintf_entry_head(trace, thread, 0, true, sample->time, trace->output);
1889 fprintf(trace->output, "%sfault [",
1890 evsel->attr.config == PERF_COUNT_SW_PAGE_FAULTS_MAJ ?
1893 print_location(trace->output, sample, &al, false, true);
1895 fprintf(trace->output, "] => ");
1897 thread__find_addr_location(thread, sample->cpumode, MAP__VARIABLE,
1901 thread__find_addr_location(thread, sample->cpumode,
1902 MAP__FUNCTION, sample->addr, &al);
1910 print_location(trace->output, sample, &al, true, false);
1912 fprintf(trace->output, " (%c%c)\n", map_type, al.level);
1914 if (callchain_ret > 0)
1915 trace__fprintf_callchain(trace, sample);
1916 else if (callchain_ret < 0)
1917 pr_err("Problem processing %s callchain, skipping...\n", perf_evsel__name(evsel));
1921 thread__put(thread);
/*
 * Replay-mode filter: keep the sample if its pid/tid is in the requested
 * lists; when lists were given but don't match, skip it.
 */
1925 static bool skip_sample(struct trace *trace, struct perf_sample *sample)
1927 if ((trace->pid_list && intlist__find(trace->pid_list, sample->pid)) ||
1928 (trace->tid_list && intlist__find(trace->tid_list, sample->tid)))
1931 if (trace->pid_list || trace->tid_list)
/*
 * Record the first usable timestamp as the base for relative timestamps,
 * unless --full-time was requested or the evsel doesn't sample time.
 */
1937 static void trace__set_base_time(struct trace *trace,
1938 struct perf_evsel *evsel,
1939 struct perf_sample *sample)
1942 * BPF events were not setting PERF_SAMPLE_TIME, so be more robust
1943 * and don't use sample->time unconditionally, we may end up having
1944 * some other event in the future without PERF_SAMPLE_TIME for good
1945 * reason, i.e. we may not be interested in its timestamps, just in
1946 * it taking place, picking some piece of information when it
1947 * appears in our event stream (vfs_getname comes to mind).
1949 if (trace->base_time == 0 && !trace->full_time &&
1950 (evsel->attr.sample_type & PERF_SAMPLE_TIME))
1951 trace->base_time = sample->time;
/*
 * perf_tool sample callback for replay mode ('perf trace -i'): apply the
 * pid/tid filter, establish the base timestamp, then dispatch to the evsel's
 * registered tracepoint handler.
 */
1954 static int trace__process_sample(struct perf_tool *tool,
1955 union perf_event *event,
1956 struct perf_sample *sample,
1957 struct perf_evsel *evsel,
1958 struct machine *machine __maybe_unused)
1960 struct trace *trace = container_of(tool, struct trace, tool);
1963 tracepoint_handler handler = evsel->handler;
1965 if (skip_sample(trace, sample))
1968 trace__set_base_time(trace, evsel, sample);
1972 handler(trace, evsel, event, sample);
/*
 * Parse the --pid/--tid comma-separated id strings into intlists used by
 * skip_sample() during replay.
 */
1978 static int parse_target_str(struct trace *trace)
1980 if (trace->opts.target.pid) {
1981 trace->pid_list = intlist__new(trace->opts.target.pid);
1982 if (trace->pid_list == NULL) {
1983 pr_err("Error parsing process id string\n");
1988 if (trace->opts.target.tid) {
1989 trace->tid_list = intlist__new(trace->opts.target.tid);
1990 if (trace->tid_list == NULL) {
1991 pr_err("Error parsing thread id string\n");
/*
 * 'perf trace record': build an argv for cmd_record() combining the fixed
 * record arguments with the syscall tracepoints (raw_syscalls preferred,
 * falling back to the older 'syscalls' group), optional page-fault events,
 * and the user's remaining arguments.
 */
1999 static int trace__record(struct trace *trace, int argc, const char **argv)
2001 unsigned int rec_argc, i, j;
2002 const char **rec_argv;
2003 const char * const record_args[] = {
2010 const char * const sc_args[] = { "-e", };
2011 unsigned int sc_args_nr = ARRAY_SIZE(sc_args);
2012 const char * const majpf_args[] = { "-e", "major-faults" };
2013 unsigned int majpf_args_nr = ARRAY_SIZE(majpf_args);
2014 const char * const minpf_args[] = { "-e", "minor-faults" };
2015 unsigned int minpf_args_nr = ARRAY_SIZE(minpf_args);
2017 /* +1 is for the event string below */
2018 rec_argc = ARRAY_SIZE(record_args) + sc_args_nr + 1 +
2019 majpf_args_nr + minpf_args_nr + argc;
2020 rec_argv = calloc(rec_argc + 1, sizeof(char *));
2022 if (rec_argv == NULL)
2026 for (i = 0; i < ARRAY_SIZE(record_args); i++)
2027 rec_argv[j++] = record_args[i];
2029 if (trace->trace_syscalls) {
2030 for (i = 0; i < sc_args_nr; i++)
2031 rec_argv[j++] = sc_args[i];
2033 /* event string may be different for older kernels - e.g., RHEL6 */
2034 if (is_valid_tracepoint("raw_syscalls:sys_enter"))
2035 rec_argv[j++] = "raw_syscalls:sys_enter,raw_syscalls:sys_exit";
2036 else if (is_valid_tracepoint("syscalls:sys_enter"))
2037 rec_argv[j++] = "syscalls:sys_enter,syscalls:sys_exit";
2039 pr_err("Neither raw_syscalls nor syscalls events exist.\n");
2044 if (trace->trace_pgfaults & TRACE_PFMAJ)
2045 for (i = 0; i < majpf_args_nr; i++)
2046 rec_argv[j++] = majpf_args[i];
2048 if (trace->trace_pgfaults & TRACE_PFMIN)
2049 for (i = 0; i < minpf_args_nr; i++)
2050 rec_argv[j++] = minpf_args[i];
2052 for (i = 0; i < (unsigned int)argc; i++)
2053 rec_argv[j++] = argv[i];
2055 return cmd_record(j, rec_argv, NULL);
2058 static size_t trace__fprintf_thread_summary(struct trace *trace, FILE *fp);
/*
 * Try to add the probe:vfs_getname tracepoint (used to resolve pathnames for
 * open-like syscalls).  Returns whether it was added; the probe may be
 * missing or lack the expected "pathname" field, in which case it is dropped.
 */
2060 static bool perf_evlist__add_vfs_getname(struct perf_evlist *evlist)
2062 struct perf_evsel *evsel = perf_evsel__newtp("probe", "vfs_getname");
2067 if (perf_evsel__field(evsel, "pathname") == NULL) {
2068 perf_evsel__delete(evsel);
2072 evsel->handler = trace__vfs_getname;
2073 perf_evlist__add(evlist, evsel);
/*
 * Create a software page-fault evsel (config is PERF_COUNT_SW_PAGE_FAULTS_*)
 * sampling every fault, with trace__pgfault as its handler.
 */
2077 static struct perf_evsel *perf_evsel__new_pgfault(u64 config)
2079 struct perf_evsel *evsel;
2080 struct perf_event_attr attr = {
2081 .type = PERF_TYPE_SOFTWARE,
2085 attr.config = config;
2086 attr.sample_period = 1;
2088 event_attr_init(&attr);
2090 evsel = perf_evsel__new(&attr);
2092 evsel->handler = trace__pgfault;
/*
 * Live-mode dispatcher: non-sample records go to trace__process_event();
 * samples are mapped back to their evsel by sample id and handed to that
 * evsel's handler, after warning about tracepoint samples with no payload.
 */
2097 static void trace__handle_event(struct trace *trace, union perf_event *event, struct perf_sample *sample)
2099 const u32 type = event->header.type;
2100 struct perf_evsel *evsel;
2102 if (type != PERF_RECORD_SAMPLE) {
2103 trace__process_event(trace, trace->host, event, sample);
2107 evsel = perf_evlist__id2evsel(trace->evlist, sample->id);
2108 if (evsel == NULL) {
2109 fprintf(trace->output, "Unknown tp ID %" PRIu64 ", skipping...\n", sample->id);
2113 trace__set_base_time(trace, evsel, sample);
2115 if (evsel->attr.type == PERF_TYPE_TRACEPOINT &&
2116 sample->raw_data == NULL) {
2117 fprintf(trace->output, "%s sample with no payload for tid: %d, cpu %d, raw_size=%d, skipping...\n",
2118 perf_evsel__name(evsel), sample->tid,
2119 sample->cpu, sample->raw_size);
2121 tracepoint_handler handler = evsel->handler;
2122 handler(trace, evsel, event, sample);
/*
 * Create and add the raw_syscalls:sys_enter/sys_exit tracepoint evsels,
 * wiring their 'args'/'ret' field accessors and handlers.  With callchains
 * enabled, kernel frames on sys_exit are excluded unless
 * --kernel-syscall-graph was given.  On failure both evsels are cleaned up.
 */
2126 static int trace__add_syscall_newtp(struct trace *trace)
2129 struct perf_evlist *evlist = trace->evlist;
2130 struct perf_evsel *sys_enter, *sys_exit;
2132 sys_enter = perf_evsel__syscall_newtp("sys_enter", trace__sys_enter);
2133 if (sys_enter == NULL)
2136 if (perf_evsel__init_sc_tp_ptr_field(sys_enter, args))
2137 goto out_delete_sys_enter;
2139 sys_exit = perf_evsel__syscall_newtp("sys_exit", trace__sys_exit);
2140 if (sys_exit == NULL)
2141 goto out_delete_sys_enter;
2143 if (perf_evsel__init_sc_tp_uint_field(sys_exit, ret))
2144 goto out_delete_sys_exit;
2146 perf_evlist__add(evlist, sys_enter);
2147 perf_evlist__add(evlist, sys_exit);
2149 if (callchain_param.enabled && !trace->kernel_syscallchains) {
2151 * We're interested only in the user space callchain
2152 * leading to the syscall, allow overriding that for
2153 * debugging reasons using --kernel_syscall_callchains
2155 sys_exit->attr.exclude_callchain_kernel = 1;
2158 trace->syscalls.events.sys_enter = sys_enter;
2159 trace->syscalls.events.sys_exit = sys_exit;
2165 out_delete_sys_exit:
2166 perf_evsel__delete_priv(sys_exit);
2167 out_delete_sys_enter:
2168 perf_evsel__delete_priv(sys_enter);
/*
 * Build an "id in (...)" (or "id not in (...)" for a negated qualifier)
 * tracepoint filter from the validated syscall id list and append it to
 * both the sys_enter and sys_exit evsels.
 */
2172 static int trace__set_ev_qualifier_filter(struct trace *trace)
2175 struct perf_evsel *sys_exit;
2176 char *filter = asprintf_expr_inout_ints("id", !trace->not_ev_qualifier,
2177 trace->ev_qualifier_ids.nr,
2178 trace->ev_qualifier_ids.entries);
2183 if (!perf_evsel__append_tp_filter(trace->syscalls.events.sys_enter,
2185 sys_exit = trace->syscalls.events.sys_exit;
2186 err = perf_evsel__append_tp_filter(sys_exit, filter);
/*
 * Live-trace main loop: set up the evlist (syscall tracepoints, vfs_getname,
 * page faults, sched_stat_runtime for --summary), configure callchains and
 * pid filters, open/mmap the events, optionally fork the workload, then poll
 * and consume ring-buffer events until done, printing the thread summary and
 * tool stats at the end.  The error labels at the bottom translate the
 * various setup failures into user-facing messages.
 */
2197 static int trace__run(struct trace *trace, int argc, const char **argv)
2199 struct perf_evlist *evlist = trace->evlist;
2200 struct perf_evsel *evsel, *pgfault_maj = NULL, *pgfault_min = NULL;
2202 unsigned long before;
2204 bool draining = false;
2203 const bool forks = argc > 0;
2208 if (trace->trace_syscalls && trace__add_syscall_newtp(trace))
2209 goto out_error_raw_syscalls;
2211 if (trace->trace_syscalls)
2212 trace->vfs_getname = perf_evlist__add_vfs_getname(evlist);
2214 if ((trace->trace_pgfaults & TRACE_PFMAJ)) {
2215 pgfault_maj = perf_evsel__new_pgfault(PERF_COUNT_SW_PAGE_FAULTS_MAJ);
2216 if (pgfault_maj == NULL)
2218 perf_evlist__add(evlist, pgfault_maj);
2221 if ((trace->trace_pgfaults & TRACE_PFMIN)) {
2222 pgfault_min = perf_evsel__new_pgfault(PERF_COUNT_SW_PAGE_FAULTS_MIN);
2223 if (pgfault_min == NULL)
2225 perf_evlist__add(evlist, pgfault_min);
2229 perf_evlist__add_newtp(evlist, "sched", "sched_stat_runtime",
2230 trace__sched_stat_runtime))
2231 goto out_error_sched_stat_runtime;
2233 err = perf_evlist__create_maps(evlist, &trace->opts.target);
2235 fprintf(trace->output, "Problems parsing the target to trace, check your options!\n");
2236 goto out_delete_evlist;
2239 err = trace__symbols_init(trace, evlist);
2241 fprintf(trace->output, "Problems initializing symbol libraries!\n");
2242 goto out_delete_evlist;
2245 perf_evlist__config(evlist, &trace->opts, NULL);
2247 if (callchain_param.enabled) {
2248 bool use_identifier = false;
2250 if (trace->syscalls.events.sys_exit) {
2251 perf_evsel__config_callchain(trace->syscalls.events.sys_exit,
2252 &trace->opts, &callchain_param);
2253 use_identifier = true;
2257 perf_evsel__config_callchain(pgfault_maj, &trace->opts, &callchain_param);
2258 use_identifier = true;
2262 perf_evsel__config_callchain(pgfault_min, &trace->opts, &callchain_param);
2263 use_identifier = true;
2266 if (use_identifier) {
2268 * Now we have evsels with different sample_ids, use
2269 * PERF_SAMPLE_IDENTIFIER to map from sample to evsel
2270 * from a fixed position in each ring buffer record.
2272 * As of this the changeset introducing this comment, this
2273 * isn't strictly needed, as the fields that can come before
2274 * PERF_SAMPLE_ID are all used, but we'll probably disable
2275 * some of those for things like copying the payload of
2276 * pointer syscall arguments, and for vfs_getname we don't
2277 * need PERF_SAMPLE_ADDR and PERF_SAMPLE_IP, so do this
2278 * here as a warning we need to use PERF_SAMPLE_IDENTIFIER.
2280 perf_evlist__set_sample_bit(evlist, IDENTIFIER);
2281 perf_evlist__reset_sample_bit(evlist, ID);
2285 signal(SIGCHLD, sig_handler);
2286 signal(SIGINT, sig_handler);
2289 err = perf_evlist__prepare_workload(evlist, &trace->opts.target,
2292 fprintf(trace->output, "Couldn't run the workload!\n");
2293 goto out_delete_evlist;
2297 err = perf_evlist__open(evlist);
2299 goto out_error_open;
2301 err = bpf__apply_obj_config();
2303 char errbuf[BUFSIZ];
2305 bpf__strerror_apply_obj_config(err, errbuf, sizeof(errbuf));
2306 pr_err("ERROR: Apply config to BPF failed: %s\n",
2308 goto out_error_open;
2312 * Better not use !target__has_task() here because we need to cover the
2313 * case where no threads were specified in the command line, but a
2314 * workload was, and in that case we will fill in the thread_map when
2315 * we fork the workload in perf_evlist__prepare_workload.
2317 if (trace->filter_pids.nr > 0)
2318 err = perf_evlist__set_filter_pids(evlist, trace->filter_pids.nr, trace->filter_pids.entries);
2319 else if (thread_map__pid(evlist->threads, 0) == -1)
2320 err = perf_evlist__set_filter_pid(evlist, getpid());
2325 if (trace->ev_qualifier_ids.nr > 0) {
2326 err = trace__set_ev_qualifier_filter(trace);
2330 pr_debug("event qualifier tracepoint filter: %s\n",
2331 trace->syscalls.events.sys_exit->filter);
2334 err = perf_evlist__apply_filters(evlist, &evsel);
2336 goto out_error_apply_filters;
2338 err = perf_evlist__mmap(evlist, trace->opts.mmap_pages, false);
2340 goto out_error_mmap;
2342 if (!target__none(&trace->opts.target))
2343 perf_evlist__enable(evlist);
2346 perf_evlist__start_workload(evlist);
/* tid column is needed when more than one thread can show up */
2348 trace->multiple_threads = thread_map__pid(evlist->threads, 0) == -1 ||
2349 evlist->threads->nr > 1 ||
2350 perf_evlist__first(evlist)->attr.inherit;
2352 before = trace->nr_events;
2354 for (i = 0; i < evlist->nr_mmaps; i++) {
2355 union perf_event *event;
2357 while ((event = perf_evlist__mmap_read(evlist, i)) != NULL) {
2358 struct perf_sample sample;
2362 err = perf_evlist__parse_sample(evlist, event, &sample);
2364 fprintf(trace->output, "Can't parse sample, err = %d, skipping...\n", err);
2368 trace__handle_event(trace, event, &sample);
2370 perf_evlist__mmap_consume(evlist, i);
2375 if (done && !draining) {
2376 perf_evlist__disable(evlist);
/* no new events this pass: poll, and drain remaining buffers when done */
2382 if (trace->nr_events == before) {
2383 int timeout = done ? 100 : -1;
2385 if (!draining && perf_evlist__poll(evlist, timeout) > 0) {
2386 if (perf_evlist__filter_pollfd(evlist, POLLERR | POLLHUP) == 0)
2396 thread__zput(trace->current);
2398 perf_evlist__disable(evlist);
2402 trace__fprintf_thread_summary(trace, trace->output);
2404 if (trace->show_tool_stats) {
2405 fprintf(trace->output, "Stats:\n "
2406 " vfs_getname : %" PRIu64 "\n"
2407 " proc_getname: %" PRIu64 "\n",
2408 trace->stats.vfs_getname,
2409 trace->stats.proc_getname);
2414 perf_evlist__delete(evlist);
2415 trace->evlist = NULL;
2416 trace->live = false;
2419 char errbuf[BUFSIZ];
2421 out_error_sched_stat_runtime:
2422 tracing_path__strerror_open_tp(errno, errbuf, sizeof(errbuf), "sched", "sched_stat_runtime");
2425 out_error_raw_syscalls:
2426 tracing_path__strerror_open_tp(errno, errbuf, sizeof(errbuf), "raw_syscalls", "sys_(enter|exit)");
2430 perf_evlist__strerror_mmap(evlist, errno, errbuf, sizeof(errbuf));
2434 perf_evlist__strerror_open(evlist, errno, errbuf, sizeof(errbuf));
2437 fprintf(trace->output, "%s\n", errbuf);
2438 goto out_delete_evlist;
2440 out_error_apply_filters:
2441 fprintf(trace->output,
2442 "Failed to set filter \"%s\" on event %s with %d (%s)\n",
2443 evsel->filter, perf_evsel__name(evsel), errno,
2444 str_error_r(errno, errbuf, sizeof(errbuf)));
2445 goto out_delete_evlist;
2448 fprintf(trace->output, "Not enough memory to run!\n");
2449 goto out_delete_evlist;
2452 fprintf(trace->output, "errno=%d,%s\n", errno, strerror(errno));
2453 goto out_delete_evlist;
/*
 * trace__replay - process a previously recorded perf.data file instead of
 * tracing live. Wires up the session tool callbacks, resolves the syscall
 * enter/exit tracepoints (with a fallback for older kernels) and replays
 * all events through the same handlers used in live mode.
 */
2456 static int trace__replay(struct trace *trace)
2458 const struct perf_evsel_str_handler handlers[] = {
2459 { "probe:vfs_getname", trace__vfs_getname, },
2461 struct perf_data_file file = {
2463 .mode = PERF_DATA_MODE_READ,
2464 .force = trace->force,
2466 struct perf_session *session;
2467 struct perf_evsel *evsel;
/* Route every event class through perf's standard processing callbacks. */
2470 trace->tool.sample = trace__process_sample;
2471 trace->tool.mmap = perf_event__process_mmap;
2472 trace->tool.mmap2 = perf_event__process_mmap2;
2473 trace->tool.comm = perf_event__process_comm;
2474 trace->tool.exit = perf_event__process_exit;
2475 trace->tool.fork = perf_event__process_fork;
2476 trace->tool.attr = perf_event__process_attr;
2477 trace->tool.tracing_data = perf_event__process_tracing_data;
2478 trace->tool.build_id = perf_event__process_build_id;
/* Replay needs timestamp-ordered delivery to pair enter/exit correctly. */
2480 trace->tool.ordered_events = true;
2481 trace->tool.ordering_requires_timestamps = true;
2483 /* add tid to output */
2484 trace->multiple_threads = true;
2486 session = perf_session__new(&file, false, &trace->tool);
2487 if (session == NULL)
2490 if (symbol__init(&session->header.env) < 0)
2493 trace->host = &session->machines.host;
2495 err = perf_session__set_tracepoints_handlers(session, handlers);
/* Prefer raw_syscalls:*; older kernels only exposed syscalls:*. */
2499 evsel = perf_evlist__find_tracepoint_by_name(session->evlist,
2500 "raw_syscalls:sys_enter");
2501 /* older kernels have syscalls tp versus raw_syscalls */
2503 evsel = perf_evlist__find_tracepoint_by_name(session->evlist,
2504 "syscalls:sys_enter");
2507 (perf_evsel__init_syscall_tp(evsel, trace__sys_enter) < 0 ||
2508 perf_evsel__init_sc_tp_ptr_field(evsel, args))) {
2509 pr_err("Error during initialize raw_syscalls:sys_enter event\n");
2513 evsel = perf_evlist__find_tracepoint_by_name(session->evlist,
2514 "raw_syscalls:sys_exit");
2516 evsel = perf_evlist__find_tracepoint_by_name(session->evlist,
2517 "syscalls:sys_exit");
2519 (perf_evsel__init_syscall_tp(evsel, trace__sys_exit) < 0 ||
2520 perf_evsel__init_sc_tp_uint_field(evsel, ret))) {
2521 pr_err("Error during initialize raw_syscalls:sys_exit event\n");
/* Recorded software page-fault events are handled by the pgfault printer. */
2525 evlist__for_each_entry(session->evlist, evsel) {
2526 if (evsel->attr.type == PERF_TYPE_SOFTWARE &&
2527 (evsel->attr.config == PERF_COUNT_SW_PAGE_FAULTS_MAJ ||
2528 evsel->attr.config == PERF_COUNT_SW_PAGE_FAULTS_MIN ||
2529 evsel->attr.config == PERF_COUNT_SW_PAGE_FAULTS))
2530 evsel->handler = trace__pgfault;
2533 err = parse_target_str(trace);
2539 err = perf_session__process_events(session);
2541 pr_err("Failed to process events, error %d", err);
2543 else if (trace->summary)
2544 trace__fprintf_thread_summary(trace, trace->output);
2547 perf_session__delete(session);
/* Print the banner line that precedes the per-thread summary table. */
2552 static size_t trace__fprintf_threads_header(FILE *fp)
2556 printed = fprintf(fp, "\n Summary of events:\n\n");
/*
 * Re-sorted rbtree of per-syscall stats, ordered by total time (msecs,
 * descending). Each entry carries the syscall number, its stats and the
 * precomputed total: n * avg, converted from ns to ms.
 */
2561 DEFINE_RESORT_RB(syscall_stats, a->msecs > b->msecs,
2562 struct stats *stats;
2567 struct int_node *source = rb_entry(nd, struct int_node, rb_node);
2568 struct stats *stats = source->priv;
2570 entry->syscall = source->i;
2571 entry->stats = stats;
/* stats may be NULL for a syscall seen but never timed -> 0 msecs. */
2572 entry->msecs = stats ? (u64)stats->n * (avg_stats(stats) / NSEC_PER_MSEC) : 0;
/*
 * Print one thread's per-syscall statistics table (calls, total, min,
 * avg, max, stddev%), sorted by total time via the resort rbtree above.
 * Returns the number of characters printed.
 */
2575 static size_t thread__dump_stats(struct thread_trace *ttrace,
2576 struct trace *trace, FILE *fp)
2581 DECLARE_RESORT_RB_INTLIST(syscall_stats, ttrace->syscall_stats);
2583 if (syscall_stats == NULL)
2586 printed += fprintf(fp, "\n");
2588 printed += fprintf(fp, " syscall calls total min avg max stddev\n");
2589 printed += fprintf(fp, " (msec) (msec) (msec) (msec) (%%)\n");
2590 printed += fprintf(fp, " --------------- -------- --------- --------- --------- --------- ------\n");
2592 resort_rb__for_each_entry(nd, syscall_stats) {
2593 struct stats *stats = syscall_stats_entry->stats;
/* Durations are accumulated in ns; convert to ms for display. */
2595 double min = (double)(stats->min) / NSEC_PER_MSEC;
2596 double max = (double)(stats->max) / NSEC_PER_MSEC;
2597 double avg = avg_stats(stats);
2599 u64 n = (u64) stats->n;
/* Relative stddev as a percentage; guard the avg == 0 case. */
2601 pct = avg ? 100.0 * stddev_stats(stats)/avg : 0.0;
2602 avg /= NSEC_PER_MSEC;
/* Look up the syscall name from its number in the syscall table. */
2604 sc = &trace->syscalls.table[syscall_stats_entry->syscall];
2605 printed += fprintf(fp, " %-15s", sc->name);
2606 printed += fprintf(fp, " %8" PRIu64 " %9.3f %9.3f %9.3f",
2607 n, syscall_stats_entry->msecs, min, avg);
2608 printed += fprintf(fp, " %9.3f %9.2f%%\n", max, pct);
2612 resort_rb__delete(syscall_stats);
2613 printed += fprintf(fp, "\n\n");
/*
 * Print the one-line header for a thread (comm, tid, event count and its
 * share of all events, fault counts, runtime) followed by its per-syscall
 * stats table. Returns the number of characters printed.
 */
2618 static size_t trace__fprintf_thread(FILE *fp, struct thread *thread, struct trace *trace)
2621 struct thread_trace *ttrace = thread__priv(thread);
/* This thread's share of all recorded events, as a percentage. */
2627 ratio = (double)ttrace->nr_events / trace->nr_events * 100.0;
2629 printed += fprintf(fp, " %s (%d), ", thread__comm_str(thread), thread->tid);
2630 printed += fprintf(fp, "%lu events, ", ttrace->nr_events);
2631 printed += fprintf(fp, "%.1f%%", ratio);
/* Major/minor fault counts and runtime are only printed when present. */
2633 printed += fprintf(fp, ", %lu majfaults", ttrace->pfmaj);
2635 printed += fprintf(fp, ", %lu minfaults", ttrace->pfmin);
2637 printed += fprintf(fp, ", %.3f msec\n", ttrace->runtime_ms);
2638 else if (fputc('\n', fp) != EOF)
2641 printed += thread__dump_stats(ttrace, trace, fp);
2646 static unsigned long thread__nr_events(struct thread_trace *ttrace)
2648 return ttrace ? ttrace->nr_events : 0;
/*
 * Re-sorted rbtree of threads ordered by per-thread event count, so the
 * summary lists the busiest threads last (comparison is '<').
 */
2651 DEFINE_RESORT_RB(threads, (thread__nr_events(a->thread->priv) < thread__nr_events(b->thread->priv)),
2652 struct thread *thread;
2655 entry->thread = rb_entry(nd, struct thread, rb_node);
/*
 * Print the end-of-run summary: header line then one section per thread,
 * threads sorted by event count via the resort rbtree above.
 */
2658 static size_t trace__fprintf_thread_summary(struct trace *trace, FILE *fp)
2660 DECLARE_RESORT_RB_MACHINE_THREADS(threads, trace->host);
2661 size_t printed = trace__fprintf_threads_header(fp);
/* Sorting can fail (allocation); report and fall through. */
2664 if (threads == NULL) {
2665 fprintf(fp, "%s", "Error sorting output by nr_events!\n");
2669 resort_rb__for_each_entry(nd, threads)
2670 printed += trace__fprintf_thread(fp, threads_entry->thread, trace);
2672 resort_rb__delete(threads);
/*
 * --duration option callback: parse the "N.M" ms threshold below which
 * events are filtered from the output.
 */
2677 static int trace__set_duration(const struct option *opt, const char *str,
2678 int unset __maybe_unused)
2680 struct trace *trace = opt->value;
/* NOTE(review): atof() silently yields 0.0 on bad input - no validation here. */
2682 trace->duration_filter = atof(str);
/*
 * --filter-pids option callback: parse a CSV pid list into
 * trace->filter_pids. Slot 0 is reserved for our own pid so perf trace
 * never traces itself; the user's pids follow.
 */
2686 static int trace__set_filter_pids(const struct option *opt, const char *str,
2687 int unset __maybe_unused)
2691 struct trace *trace = opt->value;
2693 * FIXME: introduce a intarray class, plain parse csv and create a
2694 * { int nr, int entries[] } struct...
2696 struct intlist *list = intlist__new(str);
/* +1 accounts for the implicit getpid() entry. */
2701 i = trace->filter_pids.nr = intlist__nr_entries(list) + 1;
2702 trace->filter_pids.entries = calloc(i, sizeof(pid_t));
2704 if (trace->filter_pids.entries == NULL)
2707 trace->filter_pids.entries[0] = getpid();
2709 for (i = 1; i < trace->filter_pids.nr; ++i)
2710 trace->filter_pids.entries[i] = intlist__entry(list, i - 1)->i;
/* The intlist was only a parsing aid; drop it once copied out. */
2712 intlist__delete(list);
/*
 * Open the -o/--output file for writing. A pre-existing non-empty file is
 * first renamed to "<filename>.old" rather than clobbered.
 * Returns 0 on success, -errno on fopen() failure.
 */
2718 static int trace__open_output(struct trace *trace, const char *filename)
2722 if (!stat(filename, &st) && st.st_size) {
2723 char oldname[PATH_MAX];
2725 scnprintf(oldname, sizeof(oldname), "%s.old", filename);
/* NOTE(review): rename() result is ignored - backup is best-effort. */
2727 rename(filename, oldname);
2730 trace->output = fopen(filename, "w");
2732 return trace->output == NULL ? -errno : 0;
/*
 * -F/--pf option callback: map "all"/"maj"/"min" onto the TRACE_PFMAJ /
 * TRACE_PFMIN bits, OR-ing into the existing value so repeated uses
 * accumulate.
 */
2735 static int parse_pagefaults(const struct option *opt, const char *str,
2736 int unset __maybe_unused)
2738 int *trace_pgfaults = opt->value;
2740 if (strcmp(str, "all") == 0)
2741 *trace_pgfaults |= TRACE_PFMAJ | TRACE_PFMIN;
2742 else if (strcmp(str, "maj") == 0)
2743 *trace_pgfaults |= TRACE_PFMAJ;
2744 else if (strcmp(str, "min") == 0)
2745 *trace_pgfaults |= TRACE_PFMIN;
/* Install the same sample handler on every event in the evlist. */
2752 static void evlist__set_evsel_handler(struct perf_evlist *evlist, void *handler)
2754 struct perf_evsel *evsel;
2756 evlist__for_each_entry(evlist, evsel)
2757 evsel->handler = handler;
/*
 * cmd_trace - entry point for 'perf trace'. Parses options, validates the
 * target and event qualifier, then dispatches to live tracing (trace__run),
 * replay of a perf.data file (trace__replay) or 'perf trace record'.
 * NOTE(review): the function continues past the end of this excerpt.
 */
2760 int cmd_trace(int argc, const char **argv, const char *prefix __maybe_unused)
2762 const char *trace_usage[] = {
2763 "perf trace [<options>] [<command>]",
2764 "perf trace [<options>] -- <command> [<options>]",
2765 "perf trace record [<options>] [<command>]",
2766 "perf trace record [<options>] -- <command> [<options>]",
/* Defaults; UINT_MAX/ULLONG_MAX act as "unset" sentinels checked below. */
2769 struct trace trace = {
2778 .user_freq = UINT_MAX,
2779 .user_interval = ULLONG_MAX,
2780 .no_buffering = true,
2781 .mmap_pages = UINT_MAX,
2782 .proc_map_timeout = 500,
2786 .trace_syscalls = true,
2787 .kernel_syscallchains = false,
2788 .max_stack = UINT_MAX,
2790 const char *output_name = NULL;
2791 const char *ev_qualifier_str = NULL;
2792 const struct option trace_options[] = {
2793 OPT_CALLBACK(0, "event", &trace.evlist, "event",
2794 "event selector. use 'perf list' to list available events",
2795 parse_events_option),
2796 OPT_BOOLEAN(0, "comm", &trace.show_comm,
2797 "show the thread COMM next to its id"),
2798 OPT_BOOLEAN(0, "tool_stats", &trace.show_tool_stats, "show tool stats"),
2799 OPT_STRING('e', "expr", &ev_qualifier_str, "expr", "list of syscalls to trace"),
2800 OPT_STRING('o', "output", &output_name, "file", "output file name"),
2801 OPT_STRING('i', "input", &input_name, "file", "Analyze events in file"),
2802 OPT_STRING('p', "pid", &trace.opts.target.pid, "pid",
2803 "trace events on existing process id"),
2804 OPT_STRING('t', "tid", &trace.opts.target.tid, "tid",
2805 "trace events on existing thread id"),
2806 OPT_CALLBACK(0, "filter-pids", &trace, "CSV list of pids",
2807 "pids to filter (by the kernel)", trace__set_filter_pids),
2808 OPT_BOOLEAN('a', "all-cpus", &trace.opts.target.system_wide,
2809 "system-wide collection from all CPUs"),
2810 OPT_STRING('C', "cpu", &trace.opts.target.cpu_list, "cpu",
2811 "list of cpus to monitor"),
2812 OPT_BOOLEAN(0, "no-inherit", &trace.opts.no_inherit,
2813 "child tasks do not inherit counters"),
2814 OPT_CALLBACK('m', "mmap-pages", &trace.opts.mmap_pages, "pages",
2815 "number of mmap data pages",
2816 perf_evlist__parse_mmap_pages),
2817 OPT_STRING('u', "uid", &trace.opts.target.uid_str, "user",
2819 OPT_CALLBACK(0, "duration", &trace, "float",
2820 "show only events with duration > N.M ms",
2821 trace__set_duration),
2822 OPT_BOOLEAN(0, "sched", &trace.sched, "show blocking scheduler events"),
2823 OPT_INCR('v', "verbose", &verbose, "be more verbose"),
2824 OPT_BOOLEAN('T', "time", &trace.full_time,
2825 "Show full timestamp, not time relative to first start"),
2826 OPT_BOOLEAN('s', "summary", &trace.summary_only,
2827 "Show only syscall summary with statistics"),
2828 OPT_BOOLEAN('S', "with-summary", &trace.summary,
2829 "Show all syscalls and summary with statistics"),
2830 OPT_CALLBACK_DEFAULT('F', "pf", &trace.trace_pgfaults, "all|maj|min",
2831 "Trace pagefaults", parse_pagefaults, "maj"),
2832 OPT_BOOLEAN(0, "syscalls", &trace.trace_syscalls, "Trace syscalls"),
2833 OPT_BOOLEAN('f', "force", &trace.force, "don't complain, do it"),
2834 OPT_CALLBACK(0, "call-graph", &trace.opts,
2835 "record_mode[,record_size]", record_callchain_help,
2836 &record_parse_callchain_opt),
2837 OPT_BOOLEAN(0, "kernel-syscall-graph", &trace.kernel_syscallchains,
2838 "Show the kernel callchains on the syscall exit path"),
2839 OPT_UINTEGER(0, "min-stack", &trace.min_stack,
2840 "Set the minimum stack depth when parsing the callchain, "
2841 "anything below the specified depth will be ignored."),
2842 OPT_UINTEGER(0, "max-stack", &trace.max_stack,
2843 "Set the maximum stack depth when parsing the callchain, "
2844 "anything beyond the specified depth will be ignored. "
2845 "Default: kernel.perf_event_max_stack or " __stringify(PERF_MAX_STACK_DEPTH)),
2846 OPT_UINTEGER(0, "proc-map-timeout", &trace.opts.proc_map_timeout,
2847 "per thread proc mmap processing timeout in ms"),
2850 bool __maybe_unused max_stack_user_set = true;
2851 bool mmap_pages_user_set = true;
2852 const char * const trace_subcommands[] = { "record", NULL };
/* Dump a stack trace instead of dying silently on internal faults. */
2856 signal(SIGSEGV, sighandler_dump_stack);
2857 signal(SIGFPE, sighandler_dump_stack);
2859 trace.evlist = perf_evlist__new();
2860 trace.sctbl = syscalltbl__new();
2862 if (trace.evlist == NULL || trace.sctbl == NULL) {
2863 pr_err("Not enough memory to run!\n");
2868 argc = parse_options_subcommand(argc, argv, trace_options, trace_subcommands,
2869 trace_usage, PARSE_OPT_STOP_AT_NON_OPTION);
/* Route eBPF program output (bpf-output events) to our stream. */
2871 err = bpf__setup_stdout(trace.evlist);
2873 bpf__strerror_setup_stdout(trace.evlist, err, bf, sizeof(bf));
2874 pr_err("ERROR: Setup BPF stdout failed: %s\n", bf);
/* Page-fault tracing needs addresses and timestamps in each sample. */
2880 if (trace.trace_pgfaults) {
2881 trace.opts.sample_address = true;
2882 trace.opts.sample_time = true;
/* Detect which "unset" sentinels the user overrode on the command line. */
2885 if (trace.opts.mmap_pages == UINT_MAX)
2886 mmap_pages_user_set = false;
2888 if (trace.max_stack == UINT_MAX) {
2889 trace.max_stack = input_name ? PERF_MAX_STACK_DEPTH : sysctl_perf_event_max_stack;
2890 max_stack_user_set = false;
2893 #ifdef HAVE_DWARF_UNWIND_SUPPORT
/* Stack-depth options imply callchains; default to DWARF unwinding. */
2894 if ((trace.min_stack || max_stack_user_set) && !callchain_param.enabled && trace.trace_syscalls)
2895 record_opts__parse_callchain(&trace.opts, &callchain_param, "dwarf", false);
2898 if (callchain_param.enabled) {
/* Callchains need bigger buffers; only root can mlock that much. */
2899 if (!mmap_pages_user_set && geteuid() == 0)
2900 trace.opts.mmap_pages = perf_event_mlock_kb_in_pages() * 4;
2902 symbol_conf.use_callchain = true;
2905 if (trace.evlist->nr_entries > 0)
2906 evlist__set_evsel_handler(trace.evlist, trace__event_handler);
/* 'perf trace record' delegates to perf record with syscall events. */
2908 if ((argc >= 1) && (strcmp(argv[0], "record") == 0))
2909 return trace__record(&trace, argc-1, &argv[1]);
2911 /* summary_only implies summary option, but don't overwrite summary if set */
2912 if (trace.summary_only)
2913 trace.summary = trace.summary_only;
2915 if (!trace.trace_syscalls && !trace.trace_pgfaults &&
2916 trace.evlist->nr_entries == 0 /* Was --events used? */) {
2917 pr_err("Please specify something to trace.\n");
2921 if (!trace.trace_syscalls && ev_qualifier_str) {
2922 pr_err("The -e option can't be used with --no-syscalls.\n");
2926 if (output_name != NULL) {
2927 err = trace__open_output(&trace, output_name);
2929 perror("failed to create output file");
/* Cache the 'open' syscall id for filename association logic. */
2934 trace.open_id = syscalltbl__id(trace.sctbl, "open");
/* Parse -e: a leading '!' negates the qualifier list. */
2936 if (ev_qualifier_str != NULL) {
2937 const char *s = ev_qualifier_str;
2938 struct strlist_config slist_config = {
2939 .dirname = system_path(STRACE_GROUPS_DIR),
2942 trace.not_ev_qualifier = *s == '!';
2943 if (trace.not_ev_qualifier)
2945 trace.ev_qualifier = strlist__new(s, &slist_config);
2946 if (trace.ev_qualifier == NULL) {
2947 fputs("Not enough memory to parse event qualifier",
2953 err = trace__validate_ev_qualifier(&trace);
2958 err = target__validate(&trace.opts.target);
2960 target__strerror(&trace.opts.target, err, bf, sizeof(bf));
2961 fprintf(trace.output, "%s", bf);
2965 err = target__parse_uid(&trace.opts.target);
2967 target__strerror(&trace.opts.target, err, bf, sizeof(bf));
2968 fprintf(trace.output, "%s", bf);
/* No workload and no target: default to system-wide tracing. */
2972 if (!argc && target__none(&trace.opts.target))
2973 trace.opts.target.system_wide = true;
/* -i replays a recorded file; otherwise trace live. */
2976 err = trace__replay(&trace);
2978 err = trace__run(&trace, argc, argv);
2981 if (output_name != NULL)
2982 fclose(trace.output);