3 # top-like utility for displaying kvm statistics
5 # Copyright 2006-2008 Qumranet Technologies
6 # Copyright 2008-2011 Red Hat, Inc.
9 # Avi Kivity <avi@redhat.com>
11 # This work is licensed under the terms of the GNU GPL, version 2. See
12 # the COPYING file in the top-level directory.
13 """The kvm_stat module outputs statistics about running KVM VMs
15 Three different ways of output formatting are available:
16 - as a top-like text ui
17 - in a key -> value format
18 - in an all keys, all values format
20 The data is sampled from the KVM's debugfs entries and its perf events.
33 from collections import defaultdict
34 from time import sleep
38 'EXTERNAL_INTERRUPT': 1,
40 'PENDING_INTERRUPT': 7,
64 'MWAIT_INSTRUCTION': 36,
65 'MONITOR_INSTRUCTION': 39,
66 'PAUSE_INSTRUCTION': 40,
67 'MCE_DURING_VMENTRY': 41,
68 'TPR_BELOW_THRESHOLD': 43,
109 'CR0_SEL_WRITE': 0x065,
133 'TASK_SWITCH': 0x07d,
134 'FERR_FREEZE': 0x07e,
153 # EC definition of HSR (from arch/arm64/include/asm/kvm_arm.h)
154 AARCH64_EXIT_REASONS = {
192 # From include/uapi/linux/kvm.h, KVM_EXIT_xxx
193 USERSPACE_EXIT_REASONS = {
201 'IRQ_WINDOW_OPEN': 7,
211 'INTERNAL_ERROR': 17,
222 'SET_FILTER': 0x40082406,
223 'ENABLE': 0x00002400,
224 'DISABLE': 0x00002401,
229 """Encapsulates global architecture specific data.
231 Contains the performance event open syscall and ioctl numbers, as
232 well as the VM exit reasons for the architecture it runs on.
237 machine = os.uname()[4]
239 if machine.startswith('ppc'):
241 elif machine.startswith('aarch64'):
243 elif machine.startswith('s390'):
247 for line in open('/proc/cpuinfo'):
248 if not line.startswith('flags'):
253 return ArchX86(VMX_EXIT_REASONS)
255 return ArchX86(SVM_EXIT_REASONS)
def __init__(self, exit_reasons):
    """Stores the architecture data for this machine.

    exit_reasons is kept as the table of VM exit reasons; the perf
    event open syscall number is fixed to 298 and the ioctl numbers
    are the module wide IOCTL_NUMBERS.

    """
    self.exit_reasons = exit_reasons
    self.ioctl_numbers = IOCTL_NUMBERS
    self.sc_perf_evt_open = 298
266 self.sc_perf_evt_open = 319
267 self.ioctl_numbers = IOCTL_NUMBERS
268 self.ioctl_numbers['ENABLE'] = 0x20002400
269 self.ioctl_numbers['DISABLE'] = 0x20002401
270 self.ioctl_numbers['RESET'] = 0x20002403
272 # PPC comes in 32 and 64 bit and some generated ioctl
273 # numbers depend on the wordsize.
274 char_ptr_size = ctypes.sizeof(ctypes.c_char_p)
275 self.ioctl_numbers['SET_FILTER'] = 0x80002406 | char_ptr_size << 16
276 self.exit_reasons = {}
280 self.sc_perf_evt_open = 241
281 self.ioctl_numbers = IOCTL_NUMBERS
282 self.exit_reasons = AARCH64_EXIT_REASONS
284 class ArchS390(Arch):
286 self.sc_perf_evt_open = 331
287 self.ioctl_numbers = IOCTL_NUMBERS
288 self.exit_reasons = None
290 ARCH = Arch.get_arch()
294 """Returns os.walk() data for specified directory.
296 As it is only a wrapper it returns the same 3-tuple of (dirpath,
297 dirnames, filenames).
299 return next(os.walk(path))
def parse_int_list(list_string):
    """Returns an int list from a string of comma separated integers and
    integer ranges.

    A range is written 'first-last' and is inclusive on both ends, so
    '0-2,5' yields [0, 1, 2, 5]. Whitespace around a member (such as
    the trailing newline of a line read from a file) is ignored and
    empty members are skipped, so an empty string yields an empty list.

    Raises ValueError if a member is neither an integer nor a range of
    integers.

    """
    integers = []

    for member in list_string.split(','):
        member = member.strip()
        if not member:
            # Empty input or a stray comma: there is nothing to add.
            continue

        if '-' not in member:
            integers.append(int(member))
        else:
            int_range = member.split('-')
            integers.extend(range(int(int_range[0]),
                                  int(int_range[1]) + 1))

    return integers
def get_online_cpus():
    """Returns the ids of the online cpus as a list of integers.

    The ids are parsed from the first line of
    /sys/devices/system/cpu/online.

    """
    online_path = '/sys/devices/system/cpu/online'
    with open(online_path) as online_file:
        first_line = online_file.readline()
    return parse_int_list(first_line)
327 """Returns a dict of trace events, their filter ids and
328 the values that can be filtered.
330 Trace events can be filtered for special values by setting a
331 filter string via an ioctl. The string normally has the format
332 identifier==value. For each filter a new event will be created, to
333 be able to distinguish the events.
337 filters['kvm_userspace_exit'] = ('reason', USERSPACE_EXIT_REASONS)
338 if ARCH.exit_reasons:
339 filters['kvm_exit'] = ('exit_reason', ARCH.exit_reasons)
342 libc = ctypes.CDLL('libc.so.6', use_errno=True)
343 syscall = libc.syscall
class perf_event_attr(ctypes.Structure):
    """Struct that holds the necessary data to set up a trace event.

    For an extensive explanation see perf_event_open(2) and
    include/uapi/linux/perf_event.h, struct perf_event_attr

    All fields that are not initialized in the constructor are 0.

    """
    _fields_ = [('type', ctypes.c_uint32),
                ('size', ctypes.c_uint32),
                ('config', ctypes.c_uint64),
                ('sample_freq', ctypes.c_uint64),
                ('sample_type', ctypes.c_uint64),
                ('read_format', ctypes.c_uint64),
                ('flags', ctypes.c_uint64),
                ('wakeup_events', ctypes.c_uint32),
                ('bp_type', ctypes.c_uint32),
                ('bp_addr', ctypes.c_uint64),
                ('bp_len', ctypes.c_uint64),
                ]

    def __init__(self):
        """Sets the event type, the struct size and the read format."""
        # The class is named explicitly: super(self.__class__, self)
        # resolves to the subclass when called through one and would
        # then recurse into this constructor forever.
        super(perf_event_attr, self).__init__()
        self.type = PERF_TYPE_TRACEPOINT
        self.size = ctypes.sizeof(self)
        self.read_format = PERF_FORMAT_GROUP
def perf_event_open(attr, pid, cpu, group_fd, flags):
    """Issues the perf event open syscall and returns its result.

    Used to set up performance events; the result is a file descriptor
    or -1.

    The arguments are handed to the syscall in this order:
    - attr: the perf_event_attr struct, passed as a pointer
    - pid: pid or -1 to monitor all pids
    - cpu: cpu number or -1 to monitor all cpus
    - group_fd: the file descriptor of the group leader or -1 to
      create a group
    - flags: passed on as a C long

    """
    syscall_args = (
        ctypes.pointer(attr),
        ctypes.c_int(pid),
        ctypes.c_int(cpu),
        ctypes.c_int(group_fd),
        ctypes.c_long(flags),
    )
    return syscall(ARCH.sc_perf_evt_open, *syscall_args)
392 PERF_TYPE_TRACEPOINT = 2
393 PERF_FORMAT_GROUP = 1 << 3
395 PATH_DEBUGFS_TRACING = '/sys/kernel/debug/tracing'
396 PATH_DEBUGFS_KVM = '/sys/kernel/debug/kvm'
399 """Represents a perf event group."""
def add_event(self, event):
    """Appends event to the end of this group's event list."""
    self.events.append(event)
408 """Returns a dict with 'event name: value' for all events in the
411 Values are read by reading from the file descriptor of the
412 event that is the group leader. See perf_event_open(2) for
415 Read format for the used event configuration is:
417 u64 nr; /* The number of events */
419 u64 value; /* The value of the event */
424 length = 8 * (1 + len(self.events))
425 read_format = 'xxxxxxxx' + 'Q' * len(self.events)
426 return dict(zip([event.name for event in self.events],
427 struct.unpack(read_format,
428 os.read(self.events[0].fd, length))))
431 """Represents a performance event and manages its life cycle."""
432 def __init__(self, name, group, trace_cpu, trace_pid, trace_point,
433 trace_filter, trace_set='kvm'):
436 self.setup_event(group, trace_cpu, trace_pid, trace_point,
437 trace_filter, trace_set)
440 """Closes the event's file descriptor.
442 As no python file object was created for the file descriptor,
443 python will not reference count the descriptor and will not
444 close it itself automatically, so we do it.
450 def setup_event_attribute(self, trace_set, trace_point):
451 """Returns an initialized ctype perf_event_attr struct."""
453 id_path = os.path.join(PATH_DEBUGFS_TRACING, 'events', trace_set,
456 event_attr = perf_event_attr()
457 event_attr.config = int(open(id_path).read())
460 def setup_event(self, group, trace_cpu, trace_pid, trace_point,
461 trace_filter, trace_set):
462 """Sets up the perf event in Linux.
464 Issues the syscall to register the event in the kernel and
465 then sets the optional filter.
469 event_attr = self.setup_event_attribute(trace_set, trace_point)
471 # First event will be group leader.
474 # All others have to pass the leader's descriptor instead.
476 group_leader = group.events[0].fd
478 fd = perf_event_open(event_attr, trace_pid,
479 trace_cpu, group_leader, 0)
481 err = ctypes.get_errno()
482 raise OSError(err, os.strerror(err),
483 'while calling sys_perf_event_open().')
486 fcntl.ioctl(fd, ARCH.ioctl_numbers['SET_FILTER'],
492 """Enables the trace event in the kernel.
494 Enabling the group leader makes reading counters from it and the
495 events under it possible.
498 fcntl.ioctl(self.fd, ARCH.ioctl_numbers['ENABLE'], 0)
501 """Disables the trace event in the kernel.
503 Disabling the group leader makes reading all counters under it
507 fcntl.ioctl(self.fd, ARCH.ioctl_numbers['DISABLE'], 0)
510 """Resets the count of the trace event in the kernel."""
511 fcntl.ioctl(self.fd, ARCH.ioctl_numbers['RESET'], 0)
513 class TracepointProvider(object):
514 """Data provider for the stats class.
516 Manages the events/groups from which it acquires its data.
520 self.group_leaders = []
521 self.filters = get_filters()
522 self._fields = self.get_available_fields()
525 def get_available_fields(self):
526 """Returns a list of available events of format 'event name(filter
529 All available events have directories under
530 /sys/kernel/debug/tracing/events/ which export information
531 about the specific event. Therefore, listing the dirs gives us
532 a list of all available events.
534 Some events like the vm exit reasons can be filtered for
535 specific values. To take account for that, the routine below
536 creates special fields with the following format:
537 event name(filter name)
540 path = os.path.join(PATH_DEBUGFS_TRACING, 'events', 'kvm')
541 fields = walkdir(path)[1]
544 if field in self.filters:
545 filter_name_, filter_dicts = self.filters[field]
546 for name in filter_dicts:
547 extra.append(field + '(' + name + ')')
551 def setup_traces(self):
552 """Creates all event and group objects needed to be able to retrieve
555 # Fetch list of all threads of the monitored pid, as qemu
556 # starts a thread for each vcpu.
557 path = os.path.join('/proc', str(self._pid), 'task')
558 groupids = walkdir(path)[1]
560 groupids = get_online_cpus()
562 # The constant is needed as a buffer for python libs, std
563 # streams and other files that the script opens.
564 newlim = len(groupids) * len(self._fields) + 50
566 softlim_, hardlim = resource.getrlimit(resource.RLIMIT_NOFILE)
569 # Now we need CAP_SYS_RESOURCE, to increase the hard limit.
570 resource.setrlimit(resource.RLIMIT_NOFILE, (newlim, newlim))
572 # Raising the soft limit is sufficient.
573 resource.setrlimit(resource.RLIMIT_NOFILE, (newlim, hardlim))
576 sys.exit("NOFILE rlimit could not be raised to {0}".format(newlim))
578 for groupid in groupids:
580 for name in self._fields:
583 match = re.match(r'(.*)\((.*)\)', name)
585 tracepoint, sub = match.groups()
586 tracefilter = ('%s==%d\0' %
587 (self.filters[tracepoint][0],
588 self.filters[tracepoint][1][sub]))
590 # From perf_event_open(2):
591 # pid > 0 and cpu == -1
592 # This measures the specified process/thread on any CPU.
594 # pid == -1 and cpu >= 0
595 # This measures all processes/threads on the specified CPU.
596 trace_cpu = groupid if self._pid == 0 else -1
597 trace_pid = int(groupid) if self._pid != 0 else -1
599 group.add_event(Event(name=name,
603 trace_point=tracepoint,
604 trace_filter=tracefilter))
606 self.group_leaders.append(group)
608 def available_fields(self):
609 return self.get_available_fields()
616 def fields(self, fields):
617 """Enables/disables the (un)wanted events"""
618 self._fields = fields
619 for group in self.group_leaders:
620 for index, event in enumerate(group.events):
621 if event.name in fields:
625 # Do not disable the group leader.
626 # It would disable all of its events.
636 """Changes the monitored pid by setting new traces."""
638 # The garbage collector will get rid of all Event/Group
639 # objects and open files after removing the references.
640 self.group_leaders = []
642 self.fields = self._fields
645 """Returns 'event name: current value' for all enabled events."""
646 ret = defaultdict(int)
647 for group in self.group_leaders:
648 for name, val in group.read().iteritems():
649 if name in self._fields:
653 class DebugfsProvider(object):
654 """Provides data from the files that KVM creates in the kvm debugfs
657 self._fields = self.get_available_fields()
def get_available_fields(self):
    """Returns a list of available fields.

    The fields are the names of the files located directly in the
    KVM debugfs directory.

    """
    _dirpath, _dirnames, filenames = walkdir(PATH_DEBUGFS_KVM)
    return filenames
674 def fields(self, fields):
675 self._fields = fields
686 vms = walkdir(PATH_DEBUGFS_KVM)[1]
690 self.paths = filter(lambda x: "{}-".format(pid) in x, vms)
697 """Returns a dict with format:'file name / field -> current value'."""
700 # If no debugfs filtering support is available, then don't read.
704 for path in self.paths:
705 for field in self._fields:
706 results[field] = results.get(field, 0) \
707 + self.read_field(field, path)
711 def read_field(self, field, path):
712 """Returns the value of a single field from a specific VM."""
714 return int(open(os.path.join(PATH_DEBUGFS_KVM,
722 """Manages the data providers and the data they provide.
724 It is used to set filters on the provider's data and collect all
728 def __init__(self, providers, pid, fields=None):
729 self.providers = providers
730 self._pid_filter = pid
731 self._fields_filter = fields
733 self.update_provider_pid()
734 self.update_provider_filters()
736 def update_provider_filters(self):
737 """Propagates fields filters to providers."""
739 if not self._fields_filter:
741 return re.match(self._fields_filter, key) is not None
743 # As we reset the counters when updating the fields we can
744 # also clear the cache of old values.
746 for provider in self.providers:
747 provider_fields = [key for key in provider.get_available_fields()
749 provider.fields = provider_fields
def update_provider_pid(self):
    """Hands the current pid filter to every provider."""
    pid = self._pid_filter
    for provider in self.providers:
        provider.pid = pid
757 def fields_filter(self):
758 return self._fields_filter
@fields_filter.setter
def fields_filter(self, fields_filter):
    """Stores the new fields filter, then propagates it to the providers."""
    # The filter has to be stored first, update_provider_filters()
    # reads it back from self._fields_filter.
    self._fields_filter = fields_filter
    self.update_provider_filters()
766 def pid_filter(self):
767 return self._pid_filter
770 def pid_filter(self, pid):
771 self._pid_filter = pid
773 self.update_provider_pid()
776 """Returns a dict with field -> (value, delta to last value) of all
778 for provider in self.providers:
779 new = provider.read()
780 for key in provider.fields:
781 oldval = self.values.get(key, (0, 0))
782 newval = new.get(key, 0)
784 if oldval is not None:
785 newdelta = newval - oldval[0]
786 self.values[key] = (newval, newdelta)
793 """Instruments curses to draw a nice text ui."""
794 def __init__(self, stats):
797 self.drilldown = False
798 self.update_drilldown()
801 """Initialises curses for later use. Based on curses.wrapper
802 implementation from the Python standard library."""
803 self.screen = curses.initscr()
807 # The try/catch works around a minor bit of
808 # over-conscientiousness in the curses module, the error
809 # return from C start_color() is ignorable.
815 curses.use_default_colors()
818 def __exit__(self, *exception):
819 """Resets the terminal to its normal state. Based on curses.wrapper
820 implementation from the Python standard library."""
822 self.screen.keypad(0)
def update_drilldown(self):
    """Toggles the filter that only allows fields without an opening
    parenthesis.

    If no fields filter is set, that filter is installed. If exactly
    that filter is set, it is removed again. Any other filter is left
    untouched.

    """
    no_parenthesis = r'^[^\(]*$'
    current = self.stats.fields_filter

    if not current:
        self.stats.fields_filter = no_parenthesis
        return

    if current == no_parenthesis:
        self.stats.fields_filter = None
def update_pid(self, pid):
    """Sets pid as the pid filter of the stats object."""
    self.stats.pid_filter = pid
839 def refresh(self, sleeptime):
840 """Refreshes on-screen data."""
842 if self.stats.pid_filter > 0:
843 self.screen.addstr(0, 0, 'kvm statistics - pid {0}'
844 .format(self.stats.pid_filter),
847 self.screen.addstr(0, 0, 'kvm statistics - summary', curses.A_BOLD)
848 self.screen.addstr(2, 1, 'Event')
849 self.screen.addstr(2, 1 + LABEL_WIDTH + NUMBER_WIDTH -
850 len('Total'), 'Total')
851 self.screen.addstr(2, 1 + LABEL_WIDTH + NUMBER_WIDTH + 8 -
852 len('Current'), 'Current')
854 stats = self.stats.get()
857 return (-stats[x][1], -stats[x][0])
859 return (0, -stats[x][0])
860 for key in sorted(stats.keys(), key=sortkey):
862 if row >= self.screen.getmaxyx()[0]:
865 if not values[0] and not values[1]:
868 self.screen.addstr(row, col, key)
870 self.screen.addstr(row, col, '%10d' % (values[0],))
872 if values[1] is not None:
873 self.screen.addstr(row, col, '%8d' % (values[1] / sleeptime,))
875 self.screen.refresh()
877 def show_filter_selection(self):
878 """Draws filter selection mask.
880 Asks for a valid regex and sets the fields filter accordingly.
885 self.screen.addstr(0, 0,
886 "Show statistics for events matching a regex.",
888 self.screen.addstr(2, 0,
890 .format(self.stats.fields_filter))
891 self.screen.addstr(3, 0, "New regex: ")
893 regex = self.screen.getstr()
899 self.stats.fields_filter = regex
904 def show_vm_selection(self):
905 """Draws PID selection mask.
907 Asks for a pid until a valid pid or 0 has been entered.
912 self.screen.addstr(0, 0,
913 'Show statistics for specific pid.',
915 self.screen.addstr(1, 0,
916 'This might limit the shown data to the trace '
920 self.screen.addstr(3, 0, "Pid [0 or pid]: ")
921 pid = self.screen.getstr()
931 if not os.path.isdir(os.path.join('/proc/', str(pid))):
940 def show_stats(self):
941 """Refreshes the screen and processes user input."""
944 self.refresh(sleeptime)
945 curses.halfdelay(int(sleeptime * 10))
948 char = self.screen.getkey()
950 self.drilldown = not self.drilldown
951 self.update_drilldown()
955 self.show_filter_selection()
957 self.show_vm_selection()
958 except KeyboardInterrupt:
964 """Prints statistics in a key, value format."""
968 for key in sorted(s.keys()):
970 print '%-42s%10d%10d' % (key, values[0], values[1])
973 """Prints statistics as reiterating key block, multiple value blocks."""
974 keys = sorted(stats.get().iterkeys())
982 print ' %9d' % s[k][1],
988 if line % banner_repeat == 0:
994 """Returns processed program arguments."""
995 description_text = """
996 This script displays various statistics about VMs running under KVM.
997 The statistics are gathered from the KVM debugfs entries and / or the
998 currently available perf traces.
1000 The monitoring takes additional cpu cycles and might affect the VM's
1005 /sys/kernel/debug/kvm
1006 /sys/kernel/debug/tracing/events/*
1008 - /proc/sys/kernel/perf_event_paranoid < 1 if user has no
1009 CAP_SYS_ADMIN and perf events are used.
1010 - CAP_SYS_RESOURCE if the hard limit is not high enough to allow
1011 the large number of files that are possibly opened.
1014 class PlainHelpFormatter(optparse.IndentedHelpFormatter):
1015 def format_description(self, description):
1017 return description + "\n"
1021 optparser = optparse.OptionParser(description=description_text,
1022 formatter=PlainHelpFormatter())
1023 optparser.add_option('-1', '--once', '--batch',
1024 action='store_true',
1027 help='run in batch mode for one second',
1029 optparser.add_option('-l', '--log',
1030 action='store_true',
1033 help='run in logging mode (like vmstat)',
1035 optparser.add_option('-t', '--tracepoints',
1036 action='store_true',
1039 help='retrieve statistics from tracepoints',
1041 optparser.add_option('-d', '--debugfs',
1042 action='store_true',
1045 help='retrieve statistics from debugfs',
1047 optparser.add_option('-f', '--fields',
1051 help='fields to display (regex)',
1053 optparser.add_option('-p', '--pid',
1058 help='restrict statistics to pid',
1060 (options, _) = optparser.parse_args(sys.argv)
1063 def get_providers(options):
1064 """Returns a list of data providers depending on the passed options."""
1067 if options.tracepoints:
1068 providers.append(TracepointProvider())
1070 providers.append(DebugfsProvider())
1071 if len(providers) == 0:
1072 providers.append(TracepointProvider())
1076 def check_access(options):
1077 """Exits if the current user can't access all needed directories."""
1078 if not os.path.exists('/sys/kernel/debug'):
1079 sys.stderr.write('Please enable CONFIG_DEBUG_FS in your kernel.')
1082 if not os.path.exists(PATH_DEBUGFS_KVM):
1083 sys.stderr.write("Please make sure, that debugfs is mounted and "
1084 "readable by the current user:\n"
1085 "('mount -t debugfs debugfs /sys/kernel/debug')\n"
1086 "Also ensure, that the kvm modules are loaded.\n")
1089 if not os.path.exists(PATH_DEBUGFS_TRACING) and (options.tracepoints
1090 or not options.debugfs):
1091 sys.stderr.write("Please enable CONFIG_TRACING in your kernel "
1092 "when using the option -t (default).\n"
1093 "If it is enabled, make {0} readable by the "
1095 .format(PATH_DEBUGFS_TRACING))
1096 if options.tracepoints:
1099 sys.stderr.write("Falling back to debugfs statistics!\n")
1100 options.debugfs = True
1106 options = get_options()
1107 options = check_access(options)
1109 if (options.pid > 0 and
1110 not os.path.isdir(os.path.join('/proc/',
1111 str(options.pid)))):
1112 sys.stderr.write('Did you use a (unsupported) tid instead of a pid?\n')
1113 sys.exit('Specified pid does not exist.')
1115 providers = get_providers(options)
1116 stats = Stats(providers, options.pid, fields=options.fields)
1120 elif not options.once:
1121 with Tui(stats) as tui:
1126 if __name__ == "__main__":