kernel/irq/timings.c

   1 /*
   2  * linux/kernel/irq/timings.c
   3  *
   4  * Copyright (C) 2016, Linaro Ltd - Daniel Lezcano <daniel.lezcano@linaro.org>
   5  *
   6  * This program is free software; you can redistribute it and/or modify
   7  * it under the terms of the GNU General Public License version 2 as
   8  * published by the Free Software Foundation.
   9  *
  10  */
  11 #include <linux/kernel.h>
  12 #include <linux/percpu.h>
  13 #include <linux/slab.h>
  14 #include <linux/static_key.h>
  15 #include <linux/interrupt.h>
  16 #include <linux/idr.h>
  17 #include <linux/irq.h>
  18 #include <linux/math64.h>
  19
  20 #include <trace/events/irq.h>
  21
  22 #include "internals.h"
  23
  24 DEFINE_STATIC_KEY_FALSE(irq_timing_enabled);
  25
  26 DEFINE_PER_CPU(struct irq_timings, irq_timings);
  27
  28 struct irqt_stat {
  29         u64     next_evt;
  30         u64     last_ts;
  31         u64     variance;
  32         u32     avg;
  33         u32     nr_samples;
  34         int     anomalies;
  35         int     valid;
  36 };
  37
  38 static DEFINE_IDR(irqt_stats);
  39
  40 void irq_timings_enable(void)
  41 {
  42         static_branch_enable(&irq_timing_enabled);
  43 }
  44
  45 void irq_timings_disable(void)
  46 {
  47         static_branch_disable(&irq_timing_enabled);
  48 }
  49
  50 /**
  51  * irqs_update - update the irq timing statistics with a new timestamp
  52  *
  53  * @irqs: an irqt_stat struct pointer
  54  * @ts: the new timestamp
  55  *
  56  * The statistics are computed online, in other words, the code is
  57  * designed to compute the statistics on a stream of values rather
  58  * than doing multiple passes on the values to compute the average,
  59  * then the variance. The integer division introduces a loss of
  60  * precision but with an acceptable error margin regarding the results
  61  * we would have with the double floating precision: we are dealing
  62  * with nanosec, so big numbers, consequently the mantisse is
  63  * negligeable, especially when converting the time in usec
  64  * afterwards.
  65  *
  66  * The computation happens at idle time. When the CPU is not idle, the
  67  * interrupts' timestamps are stored in the circular buffer, when the
  68  * CPU goes idle and this routine is called, all the buffer's values
  69  * are injected in the statistical model continuying to extend the
  70  * statistics from the previous busy-idle cycle.
  71  *
  72  * The observations showed a device will trigger a burst of periodic
  73  * interrupts followed by one or two peaks of longer time, for
  74  * instance when a SD card device flushes its cache, then the periodic
  75  * intervals occur again. A one second inactivity period resets the
  76  * stats, that gives us the certitude the statistical values won't
  77  * exceed 1x10^9, thus the computation won't overflow.
  78  *
  79  * Basically, the purpose of the algorithm is to watch the periodic
  80  * interrupts and eliminate the peaks.
  81  *
  82  * An interrupt is considered periodically stable if the interval of
  83  * its occurences follow the normal distribution, thus the values
  84  * comply with:
  85  *
  86  *      avg - 3 x stddev < value < avg + 3 x stddev
  87  *
  88  * Which can be simplified to:
  89  *
  90  *      -3 x stddev < value - avg < 3 x stddev
  91  *
  92  *      abs(value - avg) < 3 x stddev
  93  *
  94  * In order to save a costly square root computation, we use the
  95  * variance. For the record, stddev = sqrt(variance). The equation
  96  * above becomes:
  97  *
  98  *      abs(value - avg) < 3 x sqrt(variance)
  99  *
 100  * And finally we square it:
 101  *
 102  *      (value - avg) ^ 2 < (3 x sqrt(variance)) ^ 2
 103  *
 104  *      (value - avg) x (value - avg) < 9 x variance
 105  *
 106  * Statistically speaking, any values out of this interval is
 107  * considered as an anomaly and is discarded. However, a normal
 108  * distribution appears when the number of samples is 30 (it is the
 109  * rule of thumb in statistics, cf. "30 samples" on Internet). When
 110  * there are three consecutive anomalies, the statistics are resetted.
 111  *
 112  */
 113 static void irqs_update(struct irqt_stat *irqs, u64 ts)
 114 {
 115         u64 old_ts = irqs->last_ts;
 116         u64 variance = 0;
 117         u64 interval;
 118         s64 diff;
 119
 120         /*
 121          * The timestamps are absolute time values, we need to compute
 122          * the timing interval between two interrupts.
 123          */
 124         irqs->last_ts = ts;
 125
 126         /*
 127          * The interval type is u64 in order to deal with the same
 128          * type in our computation, that prevent mindfuck issues with
 129          * overflow, sign and division.
 130          */
 131         interval = ts - old_ts;
 132
 133         /*
 134          * The interrupt triggered more than one second apart, that
 135          * ends the sequence as predictible for our purpose. In this
 136          * case, assume we have the beginning of a sequence and the
 137          * timestamp is the first value. As it is impossible to
 138          * predict anything at this point, return.
 139          *
 140          * Note the first timestamp of the sequence will always fall
 141          * in this test because the old_ts is zero. That is what we
 142          * want as we need another timestamp to compute an interval.
 143          */
 144         if (interval >= NSEC_PER_SEC) {
 145                 memset(irqs, 0, sizeof(*irqs));
 146                 irqs->last_ts = ts;
 147                 return;
 148         }
 149
 150         /*
 151          * Pre-compute the delta with the average as the result is
 152          * used several times in this function.
 153          */
 154         diff = interval - irqs->avg;
 155
 156         /*
 157          * Increment the number of samples.
 158          */
 159         irqs->nr_samples++;
 160
 161         /*
 162          * Online variance divided by the number of elements if there
 163          * is more than one sample.  Normally the formula is division
 164          * by nr_samples - 1 but we assume the number of element will be
 165          * more than 32 and dividing by 32 instead of 31 is enough
 166          * precise.
 167          */
 168         if (likely(irqs->nr_samples > 1))
 169                 variance = irqs->variance >> IRQ_TIMINGS_SHIFT;
 170
 171         /*
 172          * The rule of thumb in statistics for the normal distribution
 173          * is having at least 30 samples in order to have the model to
 174          * apply. Values outside the interval are considered as an
 175          * anomaly.
 176          */
 177         if ((irqs->nr_samples >= 30) && ((diff * diff) > (9 * variance))) {
 178                 /*
 179                  * After three consecutive anomalies, we reset the
 180                  * stats as it is no longer stable enough.
 181                  */
 182                 if (irqs->anomalies++ >= 3) {
 183                         memset(irqs, 0, sizeof(*irqs));
 184                         irqs->last_ts = ts;
 185                         return;
 186                 }
 187         } else {
 188                 /*
 189                  * The anomalies must be consecutives, so at this
 190                  * point, we reset the anomalies counter.
 191                  */
 192                 irqs->anomalies = 0;
 193         }
 194
 195         /*
 196          * The interrupt is considered stable enough to try to predict
 197          * the next event on it.
 198          */
 199         irqs->valid = 1;
 200
 201         /*
 202          * Online average algorithm:
 203          *
 204          *  new_average = average + ((value - average) / count)
 205          *
 206          * The variance computation depends on the new average
 207          * to be computed here first.
 208          *
 209          */
 210         irqs->avg = irqs->avg + (diff >> IRQ_TIMINGS_SHIFT);
 211
 212         /*
 213          * Online variance algorithm:
 214          *
 215          *  new_variance = variance + (value - average) x (value - new_average)
 216          *
 217          * Warning: irqs->avg is updated with the line above, hence
 218          * 'interval - irqs->avg' is no longer equal to 'diff'
 219          */
 220         irqs->variance = irqs->variance + (diff * (interval - irqs->avg));
 221
 222         /*
 223          * Update the next event
 224          */
 225         irqs->next_evt = ts + irqs->avg;
 226 }
 227
 228 /**
 229  * irq_timings_next_event - Return when the next event is supposed to arrive
 230  *
 231  * During the last busy cycle, the number of interrupts is incremented
 232  * and stored in the irq_timings structure. This information is
 233  * necessary to:
 234  *
 235  * - know if the index in the table wrapped up:
 236  *
 237  *      If more than the array size interrupts happened during the
 238  *      last busy/idle cycle, the index wrapped up and we have to
 239  *      begin with the next element in the array which is the last one
 240  *      in the sequence, otherwise it is a the index 0.
 241  *
 242  * - have an indication of the interrupts activity on this CPU
 243  *   (eg. irq/sec)
 244  *
 245  * The values are 'consumed' after inserting in the statistical model,
 246  * thus the count is reinitialized.
 247  *
 248  * The array of values **must** be browsed in the time direction, the
 249  * timestamp must increase between an element and the next one.
 250  *
 251  * Returns a nanosec time based estimation of the earliest interrupt,
 252  * U64_MAX otherwise.
 253  */
 254 u64 irq_timings_next_event(u64 now)
 255 {
 256         struct irq_timings *irqts = this_cpu_ptr(&irq_timings);
 257         struct irqt_stat *irqs;
 258         struct irqt_stat __percpu *s;
 259         u64 ts, next_evt = U64_MAX;
 260         int i, irq = 0;
 261
 262         /*
 263          * This function must be called with the local irq disabled in
 264          * order to prevent the timings circular buffer to be updated
 265          * while we are reading it.
 266          */
 267         WARN_ON_ONCE(!irqs_disabled());
 268
 269         /*
 270          * Number of elements in the circular buffer: If it happens it
 271          * was flushed before, then the number of elements could be
 272          * smaller than IRQ_TIMINGS_SIZE, so the count is used,
 273          * otherwise the array size is used as we wrapped. The index
 274          * begins from zero when we did not wrap. That could be done
 275          * in a nicer way with the proper circular array structure
 276          * type but with the cost of extra computation in the
 277          * interrupt handler hot path. We choose efficiency.
 278          *
 279          * Inject measured irq/timestamp to the statistical model
 280          * while decrementing the counter because we consume the data
 281          * from our circular buffer.
 282          */
 283         for (i = irqts->count & IRQ_TIMINGS_MASK,
 284                      irqts->count = min(IRQ_TIMINGS_SIZE, irqts->count);
 285              irqts->count > 0; irqts->count--, i = (i + 1) & IRQ_TIMINGS_MASK) {
 286
 287                 irq = irq_timing_decode(irqts->values[i], &ts);
 288
 289                 s = idr_find(&irqt_stats, irq);
 290                 if (s) {
 291                         irqs = this_cpu_ptr(s);
 292                         irqs_update(irqs, ts);
 293                 }
 294         }
 295
 296         /*
 297          * Look in the list of interrupts' statistics, the earliest
 298          * next event.
 299          */
 300         idr_for_each_entry(&irqt_stats, s, i) {
 301
 302                 irqs = this_cpu_ptr(s);
 303
 304                 if (!irqs->valid)
 305                         continue;
 306
 307                 if (irqs->next_evt <= now) {
 308                         irq = i;
 309                         next_evt = now;
 310
 311                         /*
 312                          * This interrupt mustn't use in the future
 313                          * until new events occur and update the
 314                          * statistics.
 315                          */
 316                         irqs->valid = 0;
 317                         break;
 318                 }
 319
 320                 if (irqs->next_evt < next_evt) {
 321                         irq = i;
 322                         next_evt = irqs->next_evt;
 323                 }
 324         }
 325
 326         return next_evt;
 327 }
 328
 329 void irq_timings_free(int irq)
 330 {
 331         struct irqt_stat __percpu *s;
 332
 333         s = idr_find(&irqt_stats, irq);
 334         if (s) {
 335                 free_percpu(s);
 336                 idr_remove(&irqt_stats, irq);
 337         }
 338 }
 339
 340 int irq_timings_alloc(int irq)
 341 {
 342         struct irqt_stat __percpu *s;
 343         int id;
 344
 345         /*
 346          * Some platforms can have the same private interrupt per cpu,
 347          * so this function may be be called several times with the
 348          * same interrupt number. Just bail out in case the per cpu
 349          * stat structure is already allocated.
 350          */
 351         s = idr_find(&irqt_stats, irq);
 352         if (s)
 353                 return 0;
 354
 355         s = alloc_percpu(*s);
 356         if (!s)
 357                 return -ENOMEM;
 358
 359         idr_preload(GFP_KERNEL);
 360         id = idr_alloc(&irqt_stats, s, irq, irq + 1, GFP_NOWAIT);
 361         idr_preload_end();
 362
 363         if (id < 0) {
 364                 free_percpu(s);
 365                 return id;
 366         }
 367
 368         return 0;
 369 }