GNU Linux-libre 4.14.259-gnu1
[releases.git] / drivers / md / dm-stats.c
1 // SPDX-License-Identifier: GPL-2.0
2 #include <linux/errno.h>
3 #include <linux/numa.h>
4 #include <linux/slab.h>
5 #include <linux/rculist.h>
6 #include <linux/threads.h>
7 #include <linux/preempt.h>
8 #include <linux/irqflags.h>
9 #include <linux/vmalloc.h>
10 #include <linux/mm.h>
11 #include <linux/module.h>
12 #include <linux/device-mapper.h>
13
14 #include "dm-core.h"
15 #include "dm-stats.h"
16
/* Message prefix for this file; presumably picked up by the DM logging macros (DMCRIT etc.) - confirm in device-mapper.h. */
17 #define DM_MSG_PREFIX "stats"
18
/*
 * Set to 1 by dm_stats_delete() right before it queues a region for freeing
 * with call_rcu(). The code that reads the flag is outside this excerpt.
 */
19 static int dm_stat_need_rcu_barrier;
20
21 /*
22  * Using 64-bit values to avoid overflow (which is a
23  * problem that block/genhd.c's IO accounting has).
24  */
/*
 * Counters for one area of a region, kept once per CPU.  The two-element
 * arrays are indexed by READ / WRITE.
 */
25 struct dm_stat_percpu {
26         unsigned long long sectors[2];          /* sectors of completed I/O */
27         unsigned long long ios[2];              /* number of completed I/Os */
28         unsigned long long merges[2];           /* sum of stats_aux->merged over completed I/Os */
29         unsigned long long ticks[2];            /* summed I/O durations (jiffies, or ns with STAT_PRECISE_TIMESTAMPS) */
30         unsigned long long io_ticks[2];         /* time during which I/O of that direction was in flight */
31         unsigned long long io_ticks_total;      /* time during which any I/O was in flight */
32         unsigned long long time_in_queue;       /* elapsed time multiplied by the in-flight count */
33         unsigned long long *histogram;          /* n_histogram_entries + 1 buckets; stays NULL without a histogram */
34 };
35
/* State of one area that is shared by all CPUs. */
36 struct dm_stat_shared {
37         atomic_t in_flight[2];                  /* I/Os started but not yet completed, per READ / WRITE */
38         unsigned long long stamp;               /* time of the last dm_stat_round() update */
39         struct dm_stat_percpu tmp;              /* scratch totals summed over all CPUs for print/clear */
40 };
41
42 struct dm_stat {
43         struct list_head list_entry;
44         int id;
45         unsigned stat_flags;
46         size_t n_entries;
47         sector_t start;
48         sector_t end;
49         sector_t step;
50         unsigned n_histogram_entries;
51         unsigned long long *histogram_boundaries;
52         const char *program_id;
53         const char *aux_data;
54         struct rcu_head rcu_head;
55         size_t shared_alloc_size;
56         size_t percpu_alloc_size;
57         size_t histogram_alloc_size;
58         struct dm_stat_percpu *stat_percpu[NR_CPUS];
59         struct dm_stat_shared stat_shared[0];
60 };
61
/*
 * Bit in dm_stat.stat_flags: when set, times are taken from ktime_get() in
 * nanoseconds instead of jiffies.
 */
62 #define STAT_PRECISE_TIMESTAMPS         1
63
/*
 * Per-CPU record of the most recently started I/O; dm_stats_account_io()
 * reads it to compute the "merged" flag of the next I/O.
 */
64 struct dm_stats_last_position {
65         sector_t last_sector;   /* end sector of that I/O; ULLONG_MAX until the first one */
66         unsigned last_rw;       /* its bi_rw value; UINT_MAX until the first one */
67 };
68
69 /*
70  * A typo on the command line could possibly make the kernel run out of memory
71  * and crash. To prevent the crash we account all used memory. We fail if we
72  * exhaust 1/4 of all memory or 1/2 of vmalloc space.
73  */
/* Accounted memory may not exceed totalram_pages / DM_STATS_MEMORY_FACTOR pages ... */
74 #define DM_STATS_MEMORY_FACTOR          4
/* ... nor (VMALLOC_END - VMALLOC_START) / DM_STATS_VMALLOC_FACTOR bytes (CONFIG_MMU only). */
75 #define DM_STATS_VMALLOC_FACTOR         2
76
/* Protects shared_memory_amount. */
77 static DEFINE_SPINLOCK(shared_memory_lock);
78
/* Bytes currently charged through claim_shared_memory() and not yet released. */
79 static unsigned long shared_memory_amount;
80
81 static bool __check_shared_memory(size_t alloc_size)
82 {
83         size_t a;
84
85         a = shared_memory_amount + alloc_size;
86         if (a < shared_memory_amount)
87                 return false;
88         if (a >> PAGE_SHIFT > totalram_pages / DM_STATS_MEMORY_FACTOR)
89                 return false;
90 #ifdef CONFIG_MMU
91         if (a > (VMALLOC_END - VMALLOC_START) / DM_STATS_VMALLOC_FACTOR)
92                 return false;
93 #endif
94         return true;
95 }
96
97 static bool check_shared_memory(size_t alloc_size)
98 {
99         bool ret;
100
101         spin_lock_irq(&shared_memory_lock);
102
103         ret = __check_shared_memory(alloc_size);
104
105         spin_unlock_irq(&shared_memory_lock);
106
107         return ret;
108 }
109
110 static bool claim_shared_memory(size_t alloc_size)
111 {
112         spin_lock_irq(&shared_memory_lock);
113
114         if (!__check_shared_memory(alloc_size)) {
115                 spin_unlock_irq(&shared_memory_lock);
116                 return false;
117         }
118
119         shared_memory_amount += alloc_size;
120
121         spin_unlock_irq(&shared_memory_lock);
122
123         return true;
124 }
125
126 static void free_shared_memory(size_t alloc_size)
127 {
128         unsigned long flags;
129
130         spin_lock_irqsave(&shared_memory_lock, flags);
131
132         if (WARN_ON_ONCE(shared_memory_amount < alloc_size)) {
133                 spin_unlock_irqrestore(&shared_memory_lock, flags);
134                 DMCRIT("Memory usage accounting bug.");
135                 return;
136         }
137
138         shared_memory_amount -= alloc_size;
139
140         spin_unlock_irqrestore(&shared_memory_lock, flags);
141 }
142
143 static void *dm_kvzalloc(size_t alloc_size, int node)
144 {
145         void *p;
146
147         if (!claim_shared_memory(alloc_size))
148                 return NULL;
149
150         p = kvzalloc_node(alloc_size, GFP_KERNEL | __GFP_NOMEMALLOC, node);
151         if (p)
152                 return p;
153
154         free_shared_memory(alloc_size);
155
156         return NULL;
157 }
158
/*
 * Counterpart of dm_kvzalloc(): un-charge @alloc_size bytes and free @ptr.
 * A NULL @ptr is ignored and leaves the accounting untouched.
 */
static void dm_kvfree(void *ptr, size_t alloc_size)
{
	if (ptr) {
		free_shared_memory(alloc_size);
		kvfree(ptr);
	}
}
168
169 static void dm_stat_free(struct rcu_head *head)
170 {
171         int cpu;
172         struct dm_stat *s = container_of(head, struct dm_stat, rcu_head);
173
174         kfree(s->histogram_boundaries);
175         kfree(s->program_id);
176         kfree(s->aux_data);
177         for_each_possible_cpu(cpu) {
178                 dm_kvfree(s->stat_percpu[cpu][0].histogram, s->histogram_alloc_size);
179                 dm_kvfree(s->stat_percpu[cpu], s->percpu_alloc_size);
180         }
181         dm_kvfree(s->stat_shared[0].tmp.histogram, s->histogram_alloc_size);
182         dm_kvfree(s, s->shared_alloc_size);
183 }
184
185 static int dm_stat_in_flight(struct dm_stat_shared *shared)
186 {
187         return atomic_read(&shared->in_flight[READ]) +
188                atomic_read(&shared->in_flight[WRITE]);
189 }
190
191 void dm_stats_init(struct dm_stats *stats)
192 {
193         int cpu;
194         struct dm_stats_last_position *last;
195
196         mutex_init(&stats->mutex);
197         INIT_LIST_HEAD(&stats->list);
198         stats->last = alloc_percpu(struct dm_stats_last_position);
199         for_each_possible_cpu(cpu) {
200                 last = per_cpu_ptr(stats->last, cpu);
201                 last->last_sector = (sector_t)ULLONG_MAX;
202                 last->last_rw = UINT_MAX;
203         }
204 }
205
206 void dm_stats_cleanup(struct dm_stats *stats)
207 {
208         size_t ni;
209         struct dm_stat *s;
210         struct dm_stat_shared *shared;
211
212         while (!list_empty(&stats->list)) {
213                 s = container_of(stats->list.next, struct dm_stat, list_entry);
214                 list_del(&s->list_entry);
215                 for (ni = 0; ni < s->n_entries; ni++) {
216                         shared = &s->stat_shared[ni];
217                         if (WARN_ON(dm_stat_in_flight(shared))) {
218                                 DMCRIT("leaked in-flight counter at index %lu "
219                                        "(start %llu, end %llu, step %llu): reads %d, writes %d",
220                                        (unsigned long)ni,
221                                        (unsigned long long)s->start,
222                                        (unsigned long long)s->end,
223                                        (unsigned long long)s->step,
224                                        atomic_read(&shared->in_flight[READ]),
225                                        atomic_read(&shared->in_flight[WRITE]));
226                         }
227                 }
228                 dm_stat_free(&s->rcu_head);
229         }
230         free_percpu(stats->last);
231 }
232
233 static int dm_stats_create(struct dm_stats *stats, sector_t start, sector_t end,
234                            sector_t step, unsigned stat_flags,
235                            unsigned n_histogram_entries,
236                            unsigned long long *histogram_boundaries,
237                            const char *program_id, const char *aux_data,
238                            void (*suspend_callback)(struct mapped_device *),
239                            void (*resume_callback)(struct mapped_device *),
240                            struct mapped_device *md)
241 {
242         struct list_head *l;
243         struct dm_stat *s, *tmp_s;
244         sector_t n_entries;
245         size_t ni;
246         size_t shared_alloc_size;
247         size_t percpu_alloc_size;
248         size_t histogram_alloc_size;
249         struct dm_stat_percpu *p;
250         int cpu;
251         int ret_id;
252         int r;
253
254         if (end < start || !step)
255                 return -EINVAL;
256
257         n_entries = end - start;
258         if (dm_sector_div64(n_entries, step))
259                 n_entries++;
260
261         if (n_entries != (size_t)n_entries || !(size_t)(n_entries + 1))
262                 return -EOVERFLOW;
263
264         shared_alloc_size = sizeof(struct dm_stat) + (size_t)n_entries * sizeof(struct dm_stat_shared);
265         if ((shared_alloc_size - sizeof(struct dm_stat)) / sizeof(struct dm_stat_shared) != n_entries)
266                 return -EOVERFLOW;
267
268         percpu_alloc_size = (size_t)n_entries * sizeof(struct dm_stat_percpu);
269         if (percpu_alloc_size / sizeof(struct dm_stat_percpu) != n_entries)
270                 return -EOVERFLOW;
271
272         histogram_alloc_size = (n_histogram_entries + 1) * (size_t)n_entries * sizeof(unsigned long long);
273         if (histogram_alloc_size / (n_histogram_entries + 1) != (size_t)n_entries * sizeof(unsigned long long))
274                 return -EOVERFLOW;
275
276         if (!check_shared_memory(shared_alloc_size + histogram_alloc_size +
277                                  num_possible_cpus() * (percpu_alloc_size + histogram_alloc_size)))
278                 return -ENOMEM;
279
280         s = dm_kvzalloc(shared_alloc_size, NUMA_NO_NODE);
281         if (!s)
282                 return -ENOMEM;
283
284         s->stat_flags = stat_flags;
285         s->n_entries = n_entries;
286         s->start = start;
287         s->end = end;
288         s->step = step;
289         s->shared_alloc_size = shared_alloc_size;
290         s->percpu_alloc_size = percpu_alloc_size;
291         s->histogram_alloc_size = histogram_alloc_size;
292
293         s->n_histogram_entries = n_histogram_entries;
294         s->histogram_boundaries = kmemdup(histogram_boundaries,
295                                           s->n_histogram_entries * sizeof(unsigned long long), GFP_KERNEL);
296         if (!s->histogram_boundaries) {
297                 r = -ENOMEM;
298                 goto out;
299         }
300
301         s->program_id = kstrdup(program_id, GFP_KERNEL);
302         if (!s->program_id) {
303                 r = -ENOMEM;
304                 goto out;
305         }
306         s->aux_data = kstrdup(aux_data, GFP_KERNEL);
307         if (!s->aux_data) {
308                 r = -ENOMEM;
309                 goto out;
310         }
311
312         for (ni = 0; ni < n_entries; ni++) {
313                 atomic_set(&s->stat_shared[ni].in_flight[READ], 0);
314                 atomic_set(&s->stat_shared[ni].in_flight[WRITE], 0);
315         }
316
317         if (s->n_histogram_entries) {
318                 unsigned long long *hi;
319                 hi = dm_kvzalloc(s->histogram_alloc_size, NUMA_NO_NODE);
320                 if (!hi) {
321                         r = -ENOMEM;
322                         goto out;
323                 }
324                 for (ni = 0; ni < n_entries; ni++) {
325                         s->stat_shared[ni].tmp.histogram = hi;
326                         hi += s->n_histogram_entries + 1;
327                 }
328         }
329
330         for_each_possible_cpu(cpu) {
331                 p = dm_kvzalloc(percpu_alloc_size, cpu_to_node(cpu));
332                 if (!p) {
333                         r = -ENOMEM;
334                         goto out;
335                 }
336                 s->stat_percpu[cpu] = p;
337                 if (s->n_histogram_entries) {
338                         unsigned long long *hi;
339                         hi = dm_kvzalloc(s->histogram_alloc_size, cpu_to_node(cpu));
340                         if (!hi) {
341                                 r = -ENOMEM;
342                                 goto out;
343                         }
344                         for (ni = 0; ni < n_entries; ni++) {
345                                 p[ni].histogram = hi;
346                                 hi += s->n_histogram_entries + 1;
347                         }
348                 }
349         }
350
351         /*
352          * Suspend/resume to make sure there is no i/o in flight,
353          * so that newly created statistics will be exact.
354          *
355          * (note: we couldn't suspend earlier because we must not
356          * allocate memory while suspended)
357          */
358         suspend_callback(md);
359
360         mutex_lock(&stats->mutex);
361         s->id = 0;
362         list_for_each(l, &stats->list) {
363                 tmp_s = container_of(l, struct dm_stat, list_entry);
364                 if (WARN_ON(tmp_s->id < s->id)) {
365                         r = -EINVAL;
366                         goto out_unlock_resume;
367                 }
368                 if (tmp_s->id > s->id)
369                         break;
370                 if (unlikely(s->id == INT_MAX)) {
371                         r = -ENFILE;
372                         goto out_unlock_resume;
373                 }
374                 s->id++;
375         }
376         ret_id = s->id;
377         list_add_tail_rcu(&s->list_entry, l);
378         mutex_unlock(&stats->mutex);
379
380         resume_callback(md);
381
382         return ret_id;
383
384 out_unlock_resume:
385         mutex_unlock(&stats->mutex);
386         resume_callback(md);
387 out:
388         dm_stat_free(&s->rcu_head);
389         return r;
390 }
391
392 static struct dm_stat *__dm_stats_find(struct dm_stats *stats, int id)
393 {
394         struct dm_stat *s;
395
396         list_for_each_entry(s, &stats->list, list_entry) {
397                 if (s->id > id)
398                         break;
399                 if (s->id == id)
400                         return s;
401         }
402
403         return NULL;
404 }
405
406 static int dm_stats_delete(struct dm_stats *stats, int id)
407 {
408         struct dm_stat *s;
409         int cpu;
410
411         mutex_lock(&stats->mutex);
412
413         s = __dm_stats_find(stats, id);
414         if (!s) {
415                 mutex_unlock(&stats->mutex);
416                 return -ENOENT;
417         }
418
419         list_del_rcu(&s->list_entry);
420         mutex_unlock(&stats->mutex);
421
422         /*
423          * vfree can't be called from RCU callback
424          */
425         for_each_possible_cpu(cpu)
426                 if (is_vmalloc_addr(s->stat_percpu) ||
427                     is_vmalloc_addr(s->stat_percpu[cpu][0].histogram))
428                         goto do_sync_free;
429         if (is_vmalloc_addr(s) ||
430             is_vmalloc_addr(s->stat_shared[0].tmp.histogram)) {
431 do_sync_free:
432                 synchronize_rcu_expedited();
433                 dm_stat_free(&s->rcu_head);
434         } else {
435                 ACCESS_ONCE(dm_stat_need_rcu_barrier) = 1;
436                 call_rcu(&s->rcu_head, dm_stat_free);
437         }
438         return 0;
439 }
440
441 static int dm_stats_list(struct dm_stats *stats, const char *program,
442                          char *result, unsigned maxlen)
443 {
444         struct dm_stat *s;
445         sector_t len;
446         unsigned sz = 0;
447
448         /*
449          * Output format:
450          *   <region_id>: <start_sector>+<length> <step> <program_id> <aux_data>
451          */
452
453         mutex_lock(&stats->mutex);
454         list_for_each_entry(s, &stats->list, list_entry) {
455                 if (!program || !strcmp(program, s->program_id)) {
456                         len = s->end - s->start;
457                         DMEMIT("%d: %llu+%llu %llu %s %s", s->id,
458                                 (unsigned long long)s->start,
459                                 (unsigned long long)len,
460                                 (unsigned long long)s->step,
461                                 s->program_id,
462                                 s->aux_data);
463                         if (s->stat_flags & STAT_PRECISE_TIMESTAMPS)
464                                 DMEMIT(" precise_timestamps");
465                         if (s->n_histogram_entries) {
466                                 unsigned i;
467                                 DMEMIT(" histogram:");
468                                 for (i = 0; i < s->n_histogram_entries; i++) {
469                                         if (i)
470                                                 DMEMIT(",");
471                                         DMEMIT("%llu", s->histogram_boundaries[i]);
472                                 }
473                         }
474                         DMEMIT("\n");
475                 }
476         }
477         mutex_unlock(&stats->mutex);
478
479         return 1;
480 }
481
482 static void dm_stat_round(struct dm_stat *s, struct dm_stat_shared *shared,
483                           struct dm_stat_percpu *p)
484 {
485         /*
486          * This is racy, but so is part_round_stats_single.
487          */
488         unsigned long long now, difference;
489         unsigned in_flight_read, in_flight_write;
490
491         if (likely(!(s->stat_flags & STAT_PRECISE_TIMESTAMPS)))
492                 now = jiffies;
493         else
494                 now = ktime_to_ns(ktime_get());
495
496         difference = now - shared->stamp;
497         if (!difference)
498                 return;
499
500         in_flight_read = (unsigned)atomic_read(&shared->in_flight[READ]);
501         in_flight_write = (unsigned)atomic_read(&shared->in_flight[WRITE]);
502         if (in_flight_read)
503                 p->io_ticks[READ] += difference;
504         if (in_flight_write)
505                 p->io_ticks[WRITE] += difference;
506         if (in_flight_read + in_flight_write) {
507                 p->io_ticks_total += difference;
508                 p->time_in_queue += (in_flight_read + in_flight_write) * difference;
509         }
510         shared->stamp = now;
511 }
512
513 static void dm_stat_for_entry(struct dm_stat *s, size_t entry,
514                               int idx, sector_t len,
515                               struct dm_stats_aux *stats_aux, bool end,
516                               unsigned long duration_jiffies)
517 {
518         struct dm_stat_shared *shared = &s->stat_shared[entry];
519         struct dm_stat_percpu *p;
520
521         /*
522          * For strict correctness we should use local_irq_save/restore
523          * instead of preempt_disable/enable.
524          *
525          * preempt_disable/enable is racy if the driver finishes bios
526          * from non-interrupt context as well as from interrupt context
527          * or from more different interrupts.
528          *
529          * On 64-bit architectures the race only results in not counting some
530          * events, so it is acceptable.  On 32-bit architectures the race could
531          * cause the counter going off by 2^32, so we need to do proper locking
532          * there.
533          *
534          * part_stat_lock()/part_stat_unlock() have this race too.
535          */
536 #if BITS_PER_LONG == 32
537         unsigned long flags;
538         local_irq_save(flags);
539 #else
540         preempt_disable();
541 #endif
542         p = &s->stat_percpu[smp_processor_id()][entry];
543
544         if (!end) {
545                 dm_stat_round(s, shared, p);
546                 atomic_inc(&shared->in_flight[idx]);
547         } else {
548                 unsigned long long duration;
549                 dm_stat_round(s, shared, p);
550                 atomic_dec(&shared->in_flight[idx]);
551                 p->sectors[idx] += len;
552                 p->ios[idx] += 1;
553                 p->merges[idx] += stats_aux->merged;
554                 if (!(s->stat_flags & STAT_PRECISE_TIMESTAMPS)) {
555                         p->ticks[idx] += duration_jiffies;
556                         duration = jiffies_to_msecs(duration_jiffies);
557                 } else {
558                         p->ticks[idx] += stats_aux->duration_ns;
559                         duration = stats_aux->duration_ns;
560                 }
561                 if (s->n_histogram_entries) {
562                         unsigned lo = 0, hi = s->n_histogram_entries + 1;
563                         while (lo + 1 < hi) {
564                                 unsigned mid = (lo + hi) / 2;
565                                 if (s->histogram_boundaries[mid - 1] > duration) {
566                                         hi = mid;
567                                 } else {
568                                         lo = mid;
569                                 }
570
571                         }
572                         p->histogram[lo]++;
573                 }
574         }
575
576 #if BITS_PER_LONG == 32
577         local_irq_restore(flags);
578 #else
579         preempt_enable();
580 #endif
581 }
582
583 static void __dm_stat_bio(struct dm_stat *s, int bi_rw,
584                           sector_t bi_sector, sector_t end_sector,
585                           bool end, unsigned long duration_jiffies,
586                           struct dm_stats_aux *stats_aux)
587 {
588         sector_t rel_sector, offset, todo, fragment_len;
589         size_t entry;
590
591         if (end_sector <= s->start || bi_sector >= s->end)
592                 return;
593         if (unlikely(bi_sector < s->start)) {
594                 rel_sector = 0;
595                 todo = end_sector - s->start;
596         } else {
597                 rel_sector = bi_sector - s->start;
598                 todo = end_sector - bi_sector;
599         }
600         if (unlikely(end_sector > s->end))
601                 todo -= (end_sector - s->end);
602
603         offset = dm_sector_div64(rel_sector, s->step);
604         entry = rel_sector;
605         do {
606                 if (WARN_ON_ONCE(entry >= s->n_entries)) {
607                         DMCRIT("Invalid area access in region id %d", s->id);
608                         return;
609                 }
610                 fragment_len = todo;
611                 if (fragment_len > s->step - offset)
612                         fragment_len = s->step - offset;
613                 dm_stat_for_entry(s, entry, bi_rw, fragment_len,
614                                   stats_aux, end, duration_jiffies);
615                 todo -= fragment_len;
616                 entry++;
617                 offset = 0;
618         } while (unlikely(todo != 0));
619 }
620
621 void dm_stats_account_io(struct dm_stats *stats, unsigned long bi_rw,
622                          sector_t bi_sector, unsigned bi_sectors, bool end,
623                          unsigned long duration_jiffies,
624                          struct dm_stats_aux *stats_aux)
625 {
626         struct dm_stat *s;
627         sector_t end_sector;
628         struct dm_stats_last_position *last;
629         bool got_precise_time;
630
631         if (unlikely(!bi_sectors))
632                 return;
633
634         end_sector = bi_sector + bi_sectors;
635
636         if (!end) {
637                 /*
638                  * A race condition can at worst result in the merged flag being
639                  * misrepresented, so we don't have to disable preemption here.
640                  */
641                 last = raw_cpu_ptr(stats->last);
642                 stats_aux->merged =
643                         (bi_sector == (ACCESS_ONCE(last->last_sector) &&
644                                        ((bi_rw == WRITE) ==
645                                         (ACCESS_ONCE(last->last_rw) == WRITE))
646                                        ));
647                 ACCESS_ONCE(last->last_sector) = end_sector;
648                 ACCESS_ONCE(last->last_rw) = bi_rw;
649         }
650
651         rcu_read_lock();
652
653         got_precise_time = false;
654         list_for_each_entry_rcu(s, &stats->list, list_entry) {
655                 if (s->stat_flags & STAT_PRECISE_TIMESTAMPS && !got_precise_time) {
656                         if (!end)
657                                 stats_aux->duration_ns = ktime_to_ns(ktime_get());
658                         else
659                                 stats_aux->duration_ns = ktime_to_ns(ktime_get()) - stats_aux->duration_ns;
660                         got_precise_time = true;
661                 }
662                 __dm_stat_bio(s, bi_rw, bi_sector, end_sector, end, duration_jiffies, stats_aux);
663         }
664
665         rcu_read_unlock();
666 }
667
668 static void __dm_stat_init_temporary_percpu_totals(struct dm_stat_shared *shared,
669                                                    struct dm_stat *s, size_t x)
670 {
671         int cpu;
672         struct dm_stat_percpu *p;
673
674         local_irq_disable();
675         p = &s->stat_percpu[smp_processor_id()][x];
676         dm_stat_round(s, shared, p);
677         local_irq_enable();
678
679         shared->tmp.sectors[READ] = 0;
680         shared->tmp.sectors[WRITE] = 0;
681         shared->tmp.ios[READ] = 0;
682         shared->tmp.ios[WRITE] = 0;
683         shared->tmp.merges[READ] = 0;
684         shared->tmp.merges[WRITE] = 0;
685         shared->tmp.ticks[READ] = 0;
686         shared->tmp.ticks[WRITE] = 0;
687         shared->tmp.io_ticks[READ] = 0;
688         shared->tmp.io_ticks[WRITE] = 0;
689         shared->tmp.io_ticks_total = 0;
690         shared->tmp.time_in_queue = 0;
691
692         if (s->n_histogram_entries)
693                 memset(shared->tmp.histogram, 0, (s->n_histogram_entries + 1) * sizeof(unsigned long long));
694
695         for_each_possible_cpu(cpu) {
696                 p = &s->stat_percpu[cpu][x];
697                 shared->tmp.sectors[READ] += ACCESS_ONCE(p->sectors[READ]);
698                 shared->tmp.sectors[WRITE] += ACCESS_ONCE(p->sectors[WRITE]);
699                 shared->tmp.ios[READ] += ACCESS_ONCE(p->ios[READ]);
700                 shared->tmp.ios[WRITE] += ACCESS_ONCE(p->ios[WRITE]);
701                 shared->tmp.merges[READ] += ACCESS_ONCE(p->merges[READ]);
702                 shared->tmp.merges[WRITE] += ACCESS_ONCE(p->merges[WRITE]);
703                 shared->tmp.ticks[READ] += ACCESS_ONCE(p->ticks[READ]);
704                 shared->tmp.ticks[WRITE] += ACCESS_ONCE(p->ticks[WRITE]);
705                 shared->tmp.io_ticks[READ] += ACCESS_ONCE(p->io_ticks[READ]);
706                 shared->tmp.io_ticks[WRITE] += ACCESS_ONCE(p->io_ticks[WRITE]);
707                 shared->tmp.io_ticks_total += ACCESS_ONCE(p->io_ticks_total);
708                 shared->tmp.time_in_queue += ACCESS_ONCE(p->time_in_queue);
709                 if (s->n_histogram_entries) {
710                         unsigned i;
711                         for (i = 0; i < s->n_histogram_entries + 1; i++)
712                                 shared->tmp.histogram[i] += ACCESS_ONCE(p->histogram[i]);
713                 }
714         }
715 }
716
717 static void __dm_stat_clear(struct dm_stat *s, size_t idx_start, size_t idx_end,
718                             bool init_tmp_percpu_totals)
719 {
720         size_t x;
721         struct dm_stat_shared *shared;
722         struct dm_stat_percpu *p;
723
724         for (x = idx_start; x < idx_end; x++) {
725                 shared = &s->stat_shared[x];
726                 if (init_tmp_percpu_totals)
727                         __dm_stat_init_temporary_percpu_totals(shared, s, x);
728                 local_irq_disable();
729                 p = &s->stat_percpu[smp_processor_id()][x];
730                 p->sectors[READ] -= shared->tmp.sectors[READ];
731                 p->sectors[WRITE] -= shared->tmp.sectors[WRITE];
732                 p->ios[READ] -= shared->tmp.ios[READ];
733                 p->ios[WRITE] -= shared->tmp.ios[WRITE];
734                 p->merges[READ] -= shared->tmp.merges[READ];
735                 p->merges[WRITE] -= shared->tmp.merges[WRITE];
736                 p->ticks[READ] -= shared->tmp.ticks[READ];
737                 p->ticks[WRITE] -= shared->tmp.ticks[WRITE];
738                 p->io_ticks[READ] -= shared->tmp.io_ticks[READ];
739                 p->io_ticks[WRITE] -= shared->tmp.io_ticks[WRITE];
740                 p->io_ticks_total -= shared->tmp.io_ticks_total;
741                 p->time_in_queue -= shared->tmp.time_in_queue;
742                 local_irq_enable();
743                 if (s->n_histogram_entries) {
744                         unsigned i;
745                         for (i = 0; i < s->n_histogram_entries + 1; i++) {
746                                 local_irq_disable();
747                                 p = &s->stat_percpu[smp_processor_id()][x];
748                                 p->histogram[i] -= shared->tmp.histogram[i];
749                                 local_irq_enable();
750                         }
751                 }
752         }
753 }
754
755 static int dm_stats_clear(struct dm_stats *stats, int id)
756 {
757         struct dm_stat *s;
758
759         mutex_lock(&stats->mutex);
760
761         s = __dm_stats_find(stats, id);
762         if (!s) {
763                 mutex_unlock(&stats->mutex);
764                 return -ENOENT;
765         }
766
767         __dm_stat_clear(s, 0, s->n_entries, true);
768
769         mutex_unlock(&stats->mutex);
770
771         return 1;
772 }
773
774 /*
775  * This is like jiffies_to_msec, but works for 64-bit values.
776  */
777 static unsigned long long dm_jiffies_to_msec64(struct dm_stat *s, unsigned long long j)
778 {
779         unsigned long long result;
780         unsigned mult;
781
782         if (s->stat_flags & STAT_PRECISE_TIMESTAMPS)
783                 return j;
784
785         result = 0;
786         if (j)
787                 result = jiffies_to_msecs(j & 0x3fffff);
788         if (j >= 1 << 22) {
789                 mult = jiffies_to_msecs(1 << 22);
790                 result += (unsigned long long)mult * (unsigned long long)jiffies_to_msecs((j >> 22) & 0x3fffff);
791         }
792         if (j >= 1ULL << 44)
793                 result += (unsigned long long)mult * (unsigned long long)mult * (unsigned long long)jiffies_to_msecs(j >> 44);
794
795         return result;
796 }
797
798 static int dm_stats_print(struct dm_stats *stats, int id,
799                           size_t idx_start, size_t idx_len,
800                           bool clear, char *result, unsigned maxlen)
801 {
802         unsigned sz = 0;
803         struct dm_stat *s;
804         size_t x;
805         sector_t start, end, step;
806         size_t idx_end;
807         struct dm_stat_shared *shared;
808
809         /*
810          * Output format:
811          *   <start_sector>+<length> counters
812          */
813
814         mutex_lock(&stats->mutex);
815
816         s = __dm_stats_find(stats, id);
817         if (!s) {
818                 mutex_unlock(&stats->mutex);
819                 return -ENOENT;
820         }
821
822         idx_end = idx_start + idx_len;
823         if (idx_end < idx_start ||
824             idx_end > s->n_entries)
825                 idx_end = s->n_entries;
826
827         if (idx_start > idx_end)
828                 idx_start = idx_end;
829
830         step = s->step;
831         start = s->start + (step * idx_start);
832
833         for (x = idx_start; x < idx_end; x++, start = end) {
834                 shared = &s->stat_shared[x];
835                 end = start + step;
836                 if (unlikely(end > s->end))
837                         end = s->end;
838
839                 __dm_stat_init_temporary_percpu_totals(shared, s, x);
840
841                 DMEMIT("%llu+%llu %llu %llu %llu %llu %llu %llu %llu %llu %d %llu %llu %llu %llu",
842                        (unsigned long long)start,
843                        (unsigned long long)step,
844                        shared->tmp.ios[READ],
845                        shared->tmp.merges[READ],
846                        shared->tmp.sectors[READ],
847                        dm_jiffies_to_msec64(s, shared->tmp.ticks[READ]),
848                        shared->tmp.ios[WRITE],
849                        shared->tmp.merges[WRITE],
850                        shared->tmp.sectors[WRITE],
851                        dm_jiffies_to_msec64(s, shared->tmp.ticks[WRITE]),
852                        dm_stat_in_flight(shared),
853                        dm_jiffies_to_msec64(s, shared->tmp.io_ticks_total),
854                        dm_jiffies_to_msec64(s, shared->tmp.time_in_queue),
855                        dm_jiffies_to_msec64(s, shared->tmp.io_ticks[READ]),
856                        dm_jiffies_to_msec64(s, shared->tmp.io_ticks[WRITE]));
857                 if (s->n_histogram_entries) {
858                         unsigned i;
859                         for (i = 0; i < s->n_histogram_entries + 1; i++) {
860                                 DMEMIT("%s%llu", !i ? " " : ":", shared->tmp.histogram[i]);
861                         }
862                 }
863                 DMEMIT("\n");
864
865                 if (unlikely(sz + 1 >= maxlen))
866                         goto buffer_overflow;
867         }
868
869         if (clear)
870                 __dm_stat_clear(s, idx_start, idx_end, false);
871
872 buffer_overflow:
873         mutex_unlock(&stats->mutex);
874
875         return 1;
876 }
877
878 static int dm_stats_set_aux(struct dm_stats *stats, int id, const char *aux_data)
879 {
880         struct dm_stat *s;
881         const char *new_aux_data;
882
883         mutex_lock(&stats->mutex);
884
885         s = __dm_stats_find(stats, id);
886         if (!s) {
887                 mutex_unlock(&stats->mutex);
888                 return -ENOENT;
889         }
890
891         new_aux_data = kstrdup(aux_data, GFP_KERNEL);
892         if (!new_aux_data) {
893                 mutex_unlock(&stats->mutex);
894                 return -ENOMEM;
895         }
896
897         kfree(s->aux_data);
898         s->aux_data = new_aux_data;
899
900         mutex_unlock(&stats->mutex);
901
902         return 0;
903 }
904
905 static int parse_histogram(const char *h, unsigned *n_histogram_entries,
906                            unsigned long long **histogram_boundaries)
907 {
908         const char *q;
909         unsigned n;
910         unsigned long long last;
911
912         *n_histogram_entries = 1;
913         for (q = h; *q; q++)
914                 if (*q == ',')
915                         (*n_histogram_entries)++;
916
917         *histogram_boundaries = kmalloc(*n_histogram_entries * sizeof(unsigned long long), GFP_KERNEL);
918         if (!*histogram_boundaries)
919                 return -ENOMEM;
920
921         n = 0;
922         last = 0;
923         while (1) {
924                 unsigned long long hi;
925                 int s;
926                 char ch;
927                 s = sscanf(h, "%llu%c", &hi, &ch);
928                 if (!s || (s == 2 && ch != ','))
929                         return -EINVAL;
930                 if (hi <= last)
931                         return -EINVAL;
932                 last = hi;
933                 (*histogram_boundaries)[n] = hi;
934                 if (s == 1)
935                         return 0;
936                 h = strchr(h, ',') + 1;
937                 n++;
938         }
939 }
940
941 static int message_stats_create(struct mapped_device *md,
942                                 unsigned argc, char **argv,
943                                 char *result, unsigned maxlen)
944 {
945         int r;
946         int id;
947         char dummy;
948         unsigned long long start, end, len, step;
949         unsigned divisor;
950         const char *program_id, *aux_data;
951         unsigned stat_flags = 0;
952
953         unsigned n_histogram_entries = 0;
954         unsigned long long *histogram_boundaries = NULL;
955
956         struct dm_arg_set as, as_backup;
957         const char *a;
958         unsigned feature_args;
959
960         /*
961          * Input format:
962          *   <range> <step> [<extra_parameters> <parameters>] [<program_id> [<aux_data>]]
963          */
964
965         if (argc < 3)
966                 goto ret_einval;
967
968         as.argc = argc;
969         as.argv = argv;
970         dm_consume_args(&as, 1);
971
972         a = dm_shift_arg(&as);
973         if (!strcmp(a, "-")) {
974                 start = 0;
975                 len = dm_get_size(md);
976                 if (!len)
977                         len = 1;
978         } else if (sscanf(a, "%llu+%llu%c", &start, &len, &dummy) != 2 ||
979                    start != (sector_t)start || len != (sector_t)len)
980                 goto ret_einval;
981
982         end = start + len;
983         if (start >= end)
984                 goto ret_einval;
985
986         a = dm_shift_arg(&as);
987         if (sscanf(a, "/%u%c", &divisor, &dummy) == 1) {
988                 if (!divisor)
989                         return -EINVAL;
990                 step = end - start;
991                 if (do_div(step, divisor))
992                         step++;
993                 if (!step)
994                         step = 1;
995         } else if (sscanf(a, "%llu%c", &step, &dummy) != 1 ||
996                    step != (sector_t)step || !step)
997                 goto ret_einval;
998
999         as_backup = as;
1000         a = dm_shift_arg(&as);
1001         if (a && sscanf(a, "%u%c", &feature_args, &dummy) == 1) {
1002                 while (feature_args--) {
1003                         a = dm_shift_arg(&as);
1004                         if (!a)
1005                                 goto ret_einval;
1006                         if (!strcasecmp(a, "precise_timestamps"))
1007                                 stat_flags |= STAT_PRECISE_TIMESTAMPS;
1008                         else if (!strncasecmp(a, "histogram:", 10)) {
1009                                 if (n_histogram_entries)
1010                                         goto ret_einval;
1011                                 if ((r = parse_histogram(a + 10, &n_histogram_entries, &histogram_boundaries)))
1012                                         goto ret;
1013                         } else
1014                                 goto ret_einval;
1015                 }
1016         } else {
1017                 as = as_backup;
1018         }
1019
1020         program_id = "-";
1021         aux_data = "-";
1022
1023         a = dm_shift_arg(&as);
1024         if (a)
1025                 program_id = a;
1026
1027         a = dm_shift_arg(&as);
1028         if (a)
1029                 aux_data = a;
1030
1031         if (as.argc)
1032                 goto ret_einval;
1033
1034         /*
1035          * If a buffer overflow happens after we created the region,
1036          * it's too late (the userspace would retry with a larger
1037          * buffer, but the region id that caused the overflow is already
1038          * leaked).  So we must detect buffer overflow in advance.
1039          */
1040         snprintf(result, maxlen, "%d", INT_MAX);
1041         if (dm_message_test_buffer_overflow(result, maxlen)) {
1042                 r = 1;
1043                 goto ret;
1044         }
1045
1046         id = dm_stats_create(dm_get_stats(md), start, end, step, stat_flags,
1047                              n_histogram_entries, histogram_boundaries, program_id, aux_data,
1048                              dm_internal_suspend_fast, dm_internal_resume_fast, md);
1049         if (id < 0) {
1050                 r = id;
1051                 goto ret;
1052         }
1053
1054         snprintf(result, maxlen, "%d", id);
1055
1056         r = 1;
1057         goto ret;
1058
1059 ret_einval:
1060         r = -EINVAL;
1061 ret:
1062         kfree(histogram_boundaries);
1063         return r;
1064 }
1065
/*
 * Handle "@stats_delete <region_id>".
 *
 * Returns -EINVAL unless exactly one argument follows the message name
 * and it parses as a non-negative integer with nothing after it;
 * otherwise returns the result of dm_stats_delete().
 */
static int message_stats_delete(struct mapped_device *md,
				unsigned argc, char **argv)
{
	int region_id;
	char trailing;	/* only filled if junk follows the number */

	if (argc != 2 ||
	    sscanf(argv[1], "%d%c", &region_id, &trailing) != 1 ||
	    region_id < 0)
		return -EINVAL;

	return dm_stats_delete(dm_get_stats(md), region_id);
}
1080
/*
 * Handle "@stats_clear <region_id>".
 *
 * Returns -EINVAL unless exactly one argument follows the message name
 * and it parses as a non-negative integer with nothing after it;
 * otherwise returns the result of dm_stats_clear().
 */
static int message_stats_clear(struct mapped_device *md,
			       unsigned argc, char **argv)
{
	int region_id;
	char trailing;	/* only filled if junk follows the number */

	if (argc != 2 ||
	    sscanf(argv[1], "%d%c", &region_id, &trailing) != 1 ||
	    region_id < 0)
		return -EINVAL;

	return dm_stats_clear(dm_get_stats(md), region_id);
}
1095
1096 static int message_stats_list(struct mapped_device *md,
1097                               unsigned argc, char **argv,
1098                               char *result, unsigned maxlen)
1099 {
1100         int r;
1101         const char *program = NULL;
1102
1103         if (argc < 1 || argc > 2)
1104                 return -EINVAL;
1105
1106         if (argc > 1) {
1107                 program = kstrdup(argv[1], GFP_KERNEL);
1108                 if (!program)
1109                         return -ENOMEM;
1110         }
1111
1112         r = dm_stats_list(dm_get_stats(md), program, result, maxlen);
1113
1114         kfree(program);
1115
1116         return r;
1117 }
1118
/*
 * Handle "@stats_print" and "@stats_print_clear":
 *   <region_id> [<starting_index> <number_of_entries>]
 *
 * Either optional value may be given as "-" to keep its default: index 0
 * for the start and ULONG_MAX for the count.  @clear is passed straight
 * through to dm_stats_print().
 *
 * Returns -EINVAL on a wrong argument count or an unparsable number,
 * otherwise the result of dm_stats_print().
 */
static int message_stats_print(struct mapped_device *md,
			       unsigned argc, char **argv, bool clear,
			       char *result, unsigned maxlen)
{
	unsigned long first = 0;
	unsigned long count = ULONG_MAX;
	int region_id;
	char trailing;	/* only filled if junk follows a number */

	if (!(argc == 2 || argc == 4))
		return -EINVAL;

	if (sscanf(argv[1], "%d%c", &region_id, &trailing) != 1 || region_id < 0)
		return -EINVAL;

	if (argc > 3) {
		if (strcmp(argv[2], "-") != 0) {
			if (sscanf(argv[2], "%lu%c", &first, &trailing) != 1)
				return -EINVAL;
		}
		if (strcmp(argv[3], "-") != 0) {
			if (sscanf(argv[3], "%lu%c", &count, &trailing) != 1)
				return -EINVAL;
		}
	}

	return dm_stats_print(dm_get_stats(md), region_id, first, count, clear,
			      result, maxlen);
}
1145
/*
 * Handle "@stats_set_aux <region_id> <aux_data>".
 *
 * Returns -EINVAL unless exactly two arguments follow the message name
 * and the first parses as a non-negative integer with nothing after it;
 * otherwise returns the result of dm_stats_set_aux().
 */
static int message_stats_set_aux(struct mapped_device *md,
				 unsigned argc, char **argv)
{
	int region_id;
	char trailing;	/* only filled if junk follows the number */

	if (argc != 3 ||
	    sscanf(argv[1], "%d%c", &region_id, &trailing) != 1 ||
	    region_id < 0)
		return -EINVAL;

	return dm_stats_set_aux(dm_get_stats(md), region_id, argv[2]);
}
1160
/*
 * Dispatch a statistics message to its handler.
 *
 * The message name in argv[0] is matched case-insensitively against the
 * "@stats_*" commands below.
 *
 * Returns 2 if argv[0] is not a statistics message, otherwise whatever
 * the handler returned.  A handler result of -EINVAL is additionally
 * logged with DMWARN().
 */
int dm_stats_message(struct mapped_device *md, unsigned argc, char **argv,
		     char *result, unsigned maxlen)
{
	const char *cmd = argv[0];
	int r;

	/* All messages here must start with '@' */
	if (strcasecmp(cmd, "@stats_create") == 0) {
		r = message_stats_create(md, argc, argv, result, maxlen);
	} else if (strcasecmp(cmd, "@stats_delete") == 0) {
		r = message_stats_delete(md, argc, argv);
	} else if (strcasecmp(cmd, "@stats_clear") == 0) {
		r = message_stats_clear(md, argc, argv);
	} else if (strcasecmp(cmd, "@stats_list") == 0) {
		r = message_stats_list(md, argc, argv, result, maxlen);
	} else if (strcasecmp(cmd, "@stats_print") == 0) {
		r = message_stats_print(md, argc, argv, false, result, maxlen);
	} else if (strcasecmp(cmd, "@stats_print_clear") == 0) {
		r = message_stats_print(md, argc, argv, true, result, maxlen);
	} else if (strcasecmp(cmd, "@stats_set_aux") == 0) {
		r = message_stats_set_aux(md, argc, argv);
	} else {
		return 2; /* this wasn't a stats message */
	}

	if (r == -EINVAL)
		DMWARN("Invalid parameters for message %s", argv[0]);

	return r;
}
1189
1190 int __init dm_statistics_init(void)
1191 {
1192         shared_memory_amount = 0;
1193         dm_stat_need_rcu_barrier = 0;
1194         return 0;
1195 }
1196
1197 void dm_statistics_exit(void)
1198 {
1199         if (dm_stat_need_rcu_barrier)
1200                 rcu_barrier();
1201         if (WARN_ON(shared_memory_amount))
1202                 DMCRIT("shared_memory_amount leaked: %lu", shared_memory_amount);
1203 }
1204
1205 module_param_named(stats_current_allocated_bytes, shared_memory_amount, ulong, S_IRUGO);
1206 MODULE_PARM_DESC(stats_current_allocated_bytes, "Memory currently used by statistics");