1 /*
2  * Copyright (C) 2016-2017 Red Hat, Inc. All rights reserved.
3  * Copyright (C) 2016-2017 Milan Broz
4  * Copyright (C) 2016-2017 Mikulas Patocka
5  *
6  * This file is released under the GPL.
7  */
8
9 #include <linux/module.h>
10 #include <linux/device-mapper.h>
11 #include <linux/dm-io.h>
12 #include <linux/vmalloc.h>
13 #include <linux/sort.h>
14 #include <linux/rbtree.h>
15 #include <linux/delay.h>
16 #include <linux/random.h>
17 #include <crypto/hash.h>
18 #include <crypto/skcipher.h>
19 #include <linux/async_tx.h>
20 #include "dm-bufio.h"
21
22 #define DM_MSG_PREFIX "integrity"
23
24 #define DEFAULT_INTERLEAVE_SECTORS      32768
25 #define DEFAULT_JOURNAL_SIZE_FACTOR     7
26 #define DEFAULT_BUFFER_SECTORS          128
27 #define DEFAULT_JOURNAL_WATERMARK       50
28 #define DEFAULT_SYNC_MSEC               10000
29 #define DEFAULT_MAX_JOURNAL_SECTORS     131072
30 #define MIN_LOG2_INTERLEAVE_SECTORS     3
31 #define MAX_LOG2_INTERLEAVE_SECTORS     31
32 #define METADATA_WORKQUEUE_MAX_ACTIVE   16
33
34 /*
35  * Warning - DEBUG_PRINT prints security-sensitive data to the log,
36  * so it should not be enabled in the official kernel
37  */
38 //#define DEBUG_PRINT
39 //#define INTERNAL_VERIFY
40
41 /*
42  * On disk structures
43  */
44
45 #define SB_MAGIC                        "integrt"
46 #define SB_VERSION                      1
47 #define SB_SECTORS                      8
48 #define MAX_SECTORS_PER_BLOCK           8
49
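/*
 * On-disk superblock. It occupies the first SB_SECTORS sectors of the
 * backing device (starting at ic->start) and is read and written whole by
 * sync_rw_sb(); an in-memory copy is kept at ic->sb.
 */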
50 struct superblock {
51         __u8 magic[8];
52         __u8 version;
53         __u8 log2_interleave_sectors;
54         __u16 integrity_tag_size;
55         __u32 journal_sections;
56         __u64 provided_data_sectors;    /* userspace uses this value */
57         __u32 flags;
58         __u8 log2_sectors_per_block;
59 };
60
61 #define SB_FLAG_HAVE_JOURNAL_MAC        0x1
62
63 #define JOURNAL_ENTRY_ROUNDUP           8
64
65 typedef __u64 commit_id_t;
66 #define JOURNAL_MAC_PER_SECTOR          8
67
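/*
 * One journal entry describes one data block. u.sector is the target
 * logical sector, split into 32-bit halves so that 32-bit machines can
 * update them separately; sector_hi doubles as the "unused"/"in progress"
 * marker (see the macros below). last_bytes[] holds one commit_id_t per
 * data sector of the block: the last 8 bytes of each sector, which in the
 * journal are displaced by that sector's commit_id. The integrity tag
 * follows last_bytes[] (see journal_entry_tag()).
 */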
68 struct journal_entry {
69         union {
70                 struct {
71                         __u32 sector_lo;
72                         __u32 sector_hi;
73                 } s;
74                 __u64 sector;
75         } u;
76         commit_id_t last_bytes[0];
77         /* __u8 tag[0]; */
78 };
79
80 #define journal_entry_tag(ic, je)               ((__u8 *)&(je)->last_bytes[(ic)->sectors_per_block])
81
82 #if BITS_PER_LONG == 64
83 #define journal_entry_set_sector(je, x)         do { smp_wmb(); ACCESS_ONCE((je)->u.sector) = cpu_to_le64(x); } while (0)
84 #define journal_entry_get_sector(je)            le64_to_cpu((je)->u.sector)
85 #elif defined(CONFIG_LBDAF)
86 #define journal_entry_set_sector(je, x)         do { (je)->u.s.sector_lo = cpu_to_le32(x); smp_wmb(); ACCESS_ONCE((je)->u.s.sector_hi) = cpu_to_le32((x) >> 32); } while (0)
87 #define journal_entry_get_sector(je)            le64_to_cpu((je)->u.sector)
88 #else
89 #define journal_entry_set_sector(je, x)         do { (je)->u.s.sector_lo = cpu_to_le32(x); smp_wmb(); ACCESS_ONCE((je)->u.s.sector_hi) = cpu_to_le32(0); } while (0)
90 #define journal_entry_get_sector(je)            le32_to_cpu((je)->u.s.sector_lo)
91 #endif
92 #define journal_entry_is_unused(je)             ((je)->u.s.sector_hi == cpu_to_le32(-1))
93 #define journal_entry_set_unused(je)            do { ((je)->u.s.sector_hi = cpu_to_le32(-1)); } while (0)
94 #define journal_entry_is_inprogress(je)         ((je)->u.s.sector_hi == cpu_to_le32(-2))
95 #define journal_entry_set_inprogress(je)        do { ((je)->u.s.sector_hi = cpu_to_le32(-2)); } while (0)
96
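/*
 * Every 512-byte journal sector ends with an 8-byte commit_id. In the
 * first JOURNAL_BLOCK_SECTORS sectors of a section (which hold the journal
 * entries), the 8 bytes before the commit_id carry that sector's slice of
 * the per-section MAC; the full JOURNAL_MAC_SIZE MAC is spread over those
 * sectors by rw_section_mac(). The data blocks of the section follow the
 * entry sectors (see access_journal_data()).
 */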
97 #define JOURNAL_BLOCK_SECTORS           8
98 #define JOURNAL_SECTOR_DATA             ((1 << SECTOR_SHIFT) - sizeof(commit_id_t))
99 #define JOURNAL_MAC_SIZE                (JOURNAL_MAC_PER_SECTOR * JOURNAL_BLOCK_SECTORS)
100
101 struct journal_sector {
102         __u8 entries[JOURNAL_SECTOR_DATA - JOURNAL_MAC_PER_SECTOR];
103         __u8 mac[JOURNAL_MAC_PER_SECTOR];
104         commit_id_t commit_id;
105 };
106
107 #define MAX_TAG_SIZE                    (JOURNAL_SECTOR_DATA - JOURNAL_MAC_PER_SECTOR - offsetof(struct journal_entry, last_bytes[MAX_SECTORS_PER_BLOCK]))
108
109 #define METADATA_PADDING_SECTORS        8
110
111 #define N_COMMIT_IDS                    4
112
113 static unsigned char prev_commit_seq(unsigned char seq)
114 {
115         return (seq + N_COMMIT_IDS - 1) % N_COMMIT_IDS;
116 }
117
118 static unsigned char next_commit_seq(unsigned char seq)
119 {
120         return (seq + 1) % N_COMMIT_IDS;
121 }
122
123 /*
124  * In-memory structures
125  */
126
127 struct journal_node {
128         struct rb_node node;
129         sector_t sector;
130 };
131
132 struct alg_spec {
133         char *alg_string;
134         char *key_string;
135         __u8 *key;
136         unsigned key_size;
137 };
138
139 struct dm_integrity_c {
140         struct dm_dev *dev;
141         unsigned tag_size;
142         __s8 log2_tag_size;
143         sector_t start;
144         mempool_t *journal_io_mempool;
145         struct dm_io_client *io;
146         struct dm_bufio_client *bufio;
147         struct workqueue_struct *metadata_wq;
148         struct superblock *sb;
149         unsigned journal_pages;
150         struct page_list *journal;
151         struct page_list *journal_io;
152         struct page_list *journal_xor;
153
154         struct crypto_skcipher *journal_crypt;
155         struct scatterlist **journal_scatterlist;
156         struct scatterlist **journal_io_scatterlist;
157         struct skcipher_request **sk_requests;
158
159         struct crypto_shash *journal_mac;
160
161         struct journal_node *journal_tree;
162         struct rb_root journal_tree_root;
163
164         sector_t provided_data_sectors;
165
166         unsigned short journal_entry_size;
167         unsigned char journal_entries_per_sector;
168         unsigned char journal_section_entries;
169         unsigned short journal_section_sectors;
170         unsigned journal_sections;
171         unsigned journal_entries;
172         sector_t device_sectors;
173         unsigned initial_sectors;
174         unsigned metadata_run;
175         __s8 log2_metadata_run;
176         __u8 log2_buffer_sectors;
177         __u8 sectors_per_block;
178
179         unsigned char mode;
180         int suspending;
181
182         int failed;
183
184         struct crypto_shash *internal_hash;
185
186         /* these variables are protected by endio_wait.lock */
187         struct rb_root in_progress;
188         wait_queue_head_t endio_wait;
189         struct workqueue_struct *wait_wq;
190         struct workqueue_struct *offload_wq;
191
192         unsigned char commit_seq;
193         commit_id_t commit_ids[N_COMMIT_IDS];
194
195         unsigned committed_section;
196         unsigned n_committed_sections;
197
198         unsigned uncommitted_section;
199         unsigned n_uncommitted_sections;
200
201         unsigned free_section;
202         unsigned char free_section_entry;
203         unsigned free_sectors;
204
205         unsigned free_sectors_threshold;
206
207         struct workqueue_struct *commit_wq;
208         struct work_struct commit_work;
209
210         struct workqueue_struct *writer_wq;
211         struct work_struct writer_work;
212
213         struct bio_list flush_bio_list;
214
215         unsigned long autocommit_jiffies;
216         struct timer_list autocommit_timer;
217         unsigned autocommit_msec;
218
219         wait_queue_head_t copy_to_journal_wait;
220
221         struct completion crypto_backoff;
222
223         bool journal_uptodate;
224         bool just_formatted;
225
226         struct alg_spec internal_hash_alg;
227         struct alg_spec journal_crypt_alg;
228         struct alg_spec journal_mac_alg;
229
230         atomic64_t number_of_mismatches;
231 };
232
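/*
 * A range of logical sectors that is currently being processed. Ranges are
 * kept in the ic->in_progress rb-tree (see add_new_range()) so that bios
 * touching overlapping sectors are serialized.
 */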
233 struct dm_integrity_range {
234         sector_t logical_sector;
235         unsigned n_sectors;
236         struct rb_node node;
237 };
238
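/*
 * Per-bio state, kept in the bio's per-bio-data area. The orig_bi_* fields
 * save the members of the original bio that dm-integrity overrides while
 * the bio is in flight; integrity_end_io() restores them.
 */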
239 struct dm_integrity_io {
240         struct work_struct work;
241
242         struct dm_integrity_c *ic;
243         bool write;
244         bool fua;
245
246         struct dm_integrity_range range;
247
248         sector_t metadata_block;
249         unsigned metadata_offset;
250
251         atomic_t in_flight;
252         blk_status_t bi_status;
253
254         struct completion *completion;
255
256         struct gendisk *orig_bi_disk;
257         u8 orig_bi_partno;
258         bio_end_io_t *orig_bi_end_io;
259         struct bio_integrity_payload *orig_bi_integrity;
260         struct bvec_iter orig_bi_iter;
261 };
262
263 struct journal_completion {
264         struct dm_integrity_c *ic;
265         atomic_t in_flight;
266         struct completion comp;
267 };
268
269 struct journal_io {
270         struct dm_integrity_range range;
271         struct journal_completion *comp;
272 };
273
274 static struct kmem_cache *journal_io_cache;
275
276 #define JOURNAL_IO_MEMPOOL      32
277
278 #ifdef DEBUG_PRINT
279 #define DEBUG_print(x, ...)     printk(KERN_DEBUG x, ##__VA_ARGS__)
280 static void __DEBUG_bytes(__u8 *bytes, size_t len, const char *msg, ...)
281 {
282         va_list args;
283         va_start(args, msg);
284         vprintk(msg, args);
285         va_end(args);
286         if (len)
287                 pr_cont(":");
288         while (len) {
289                 pr_cont(" %02x", *bytes);
290                 bytes++;
291                 len--;
292         }
293         pr_cont("\n");
294 }
295 #define DEBUG_bytes(bytes, len, msg, ...)       __DEBUG_bytes(bytes, len, KERN_DEBUG msg, ##__VA_ARGS__)
296 #else
297 #define DEBUG_print(x, ...)                     do { } while (0)
298 #define DEBUG_bytes(bytes, len, msg, ...)       do { } while (0)
299 #endif
300
301 /*
302  * DM Integrity profile; protection is performed in the layer above (dm-crypt)
303  */
304 static const struct blk_integrity_profile dm_integrity_profile = {
305         .name                   = "DM-DIF-EXT-TAG",
306         .generate_fn            = NULL,
307         .verify_fn              = NULL,
308 };
309
310 static void dm_integrity_map_continue(struct dm_integrity_io *dio, bool from_map);
311 static void integrity_bio_wait(struct work_struct *w);
312 static void dm_integrity_dtr(struct dm_target *ti);
313
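/*
 * Only the first error is stored in ic->failed and logged; -EILSEQ (a
 * checksum mismatch) additionally increments ic->number_of_mismatches.
 */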
314 static void dm_integrity_io_error(struct dm_integrity_c *ic, const char *msg, int err)
315 {
316         if (err == -EILSEQ)
317                 atomic64_inc(&ic->number_of_mismatches);
318         if (!cmpxchg(&ic->failed, 0, err))
319                 DMERR("Error on %s: %d", msg, err);
320 }
321
322 static int dm_integrity_failed(struct dm_integrity_c *ic)
323 {
324         return ACCESS_ONCE(ic->failed);
325 }
326
327 static commit_id_t dm_integrity_commit_id(struct dm_integrity_c *ic, unsigned i,
328                                           unsigned j, unsigned char seq)
329 {
330         /*
331          * Xor the number with section and sector, so that if a piece of
332          * journal is written in the wrong place, it is detected.
333          */
334         return ic->commit_ids[seq] ^ cpu_to_le64(((__u64)i << 32) ^ j);
335 }
336
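/*
 * Data and metadata are interleaved: after the first ic->initial_sectors
 * sectors, the device is divided into "areas" of 2^log2_interleave_sectors
 * data sectors, each preceded by the metadata run that stores its tags.
 * get_area_and_offset() splits a logical sector into (area, offset);
 * get_data_sector() maps (area, offset) back to a device sector, and
 * get_metadata_sector_and_offset() to the dm-bufio block and byte offset
 * of the corresponding tag.
 */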
337 static void get_area_and_offset(struct dm_integrity_c *ic, sector_t data_sector,
338                                 sector_t *area, sector_t *offset)
339 {
340         __u8 log2_interleave_sectors = ic->sb->log2_interleave_sectors;
341
342         *area = data_sector >> log2_interleave_sectors;
343         *offset = (unsigned)data_sector & ((1U << log2_interleave_sectors) - 1);
344 }
345
346 #define sector_to_block(ic, n)                                          \
347 do {                                                                    \
348         BUG_ON((n) & (unsigned)((ic)->sectors_per_block - 1));          \
349         (n) >>= (ic)->sb->log2_sectors_per_block;                       \
350 } while (0)
351
352 static __u64 get_metadata_sector_and_offset(struct dm_integrity_c *ic, sector_t area,
353                                             sector_t offset, unsigned *metadata_offset)
354 {
355         __u64 ms;
356         unsigned mo;
357
358         ms = area << ic->sb->log2_interleave_sectors;
359         if (likely(ic->log2_metadata_run >= 0))
360                 ms += area << ic->log2_metadata_run;
361         else
362                 ms += area * ic->metadata_run;
363         ms >>= ic->log2_buffer_sectors;
364
365         sector_to_block(ic, offset);
366
367         if (likely(ic->log2_tag_size >= 0)) {
368                 ms += offset >> (SECTOR_SHIFT + ic->log2_buffer_sectors - ic->log2_tag_size);
369                 mo = (offset << ic->log2_tag_size) & ((1U << SECTOR_SHIFT << ic->log2_buffer_sectors) - 1);
370         } else {
371                 ms += (__u64)offset * ic->tag_size >> (SECTOR_SHIFT + ic->log2_buffer_sectors);
372                 mo = (offset * ic->tag_size) & ((1U << SECTOR_SHIFT << ic->log2_buffer_sectors) - 1);
373         }
374         *metadata_offset = mo;
375         return ms;
376 }
377
378 static sector_t get_data_sector(struct dm_integrity_c *ic, sector_t area, sector_t offset)
379 {
380         sector_t result;
381
382         result = area << ic->sb->log2_interleave_sectors;
383         if (likely(ic->log2_metadata_run >= 0))
384                 result += (area + 1) << ic->log2_metadata_run;
385         else
386                 result += (area + 1) * ic->metadata_run;
387
388         result += (sector_t)ic->initial_sectors + offset;
389         return result;
390 }
391
392 static void wraparound_section(struct dm_integrity_c *ic, unsigned *sec_ptr)
393 {
394         if (unlikely(*sec_ptr >= ic->journal_sections))
395                 *sec_ptr -= ic->journal_sections;
396 }
397
398 static int sync_rw_sb(struct dm_integrity_c *ic, int op, int op_flags)
399 {
400         struct dm_io_request io_req;
401         struct dm_io_region io_loc;
402
403         io_req.bi_op = op;
404         io_req.bi_op_flags = op_flags;
405         io_req.mem.type = DM_IO_KMEM;
406         io_req.mem.ptr.addr = ic->sb;
407         io_req.notify.fn = NULL;
408         io_req.client = ic->io;
409         io_loc.bdev = ic->dev->bdev;
410         io_loc.sector = ic->start;
411         io_loc.count = SB_SECTORS;
412
413         return dm_io(&io_req, 1, &io_loc, NULL);
414 }
415
416 static void access_journal_check(struct dm_integrity_c *ic, unsigned section, unsigned offset,
417                                  bool e, const char *function)
418 {
419 #if defined(CONFIG_DM_DEBUG) || defined(INTERNAL_VERIFY)
420         unsigned limit = e ? ic->journal_section_entries : ic->journal_section_sectors;
421
422         if (unlikely(section >= ic->journal_sections) ||
423             unlikely(offset >= limit)) {
424                 printk(KERN_CRIT "%s: invalid access at (%u,%u), limit (%u,%u)\n",
425                         function, section, offset, ic->journal_sections, limit);
426                 BUG();
427         }
428 #endif
429 }
430
431 static void page_list_location(struct dm_integrity_c *ic, unsigned section, unsigned offset,
432                                unsigned *pl_index, unsigned *pl_offset)
433 {
434         unsigned sector;
435
436         access_journal_check(ic, section, offset, false, "page_list_location");
437
438         sector = section * ic->journal_section_sectors + offset;
439
440         *pl_index = sector >> (PAGE_SHIFT - SECTOR_SHIFT);
441         *pl_offset = (sector << SECTOR_SHIFT) & (PAGE_SIZE - 1);
442 }
443
444 static struct journal_sector *access_page_list(struct dm_integrity_c *ic, struct page_list *pl,
445                                                unsigned section, unsigned offset, unsigned *n_sectors)
446 {
447         unsigned pl_index, pl_offset;
448         char *va;
449
450         page_list_location(ic, section, offset, &pl_index, &pl_offset);
451
452         if (n_sectors)
453                 *n_sectors = (PAGE_SIZE - pl_offset) >> SECTOR_SHIFT;
454
455         va = lowmem_page_address(pl[pl_index].page);
456
457         return (struct journal_sector *)(va + pl_offset);
458 }
459
460 static struct journal_sector *access_journal(struct dm_integrity_c *ic, unsigned section, unsigned offset)
461 {
462         return access_page_list(ic, ic->journal, section, offset, NULL);
463 }
464
465 static struct journal_entry *access_journal_entry(struct dm_integrity_c *ic, unsigned section, unsigned n)
466 {
467         unsigned rel_sector, offset;
468         struct journal_sector *js;
469
470         access_journal_check(ic, section, n, true, "access_journal_entry");
471
472         rel_sector = n % JOURNAL_BLOCK_SECTORS;
473         offset = n / JOURNAL_BLOCK_SECTORS;
474
475         js = access_journal(ic, section, rel_sector);
476         return (struct journal_entry *)((char *)js + offset * ic->journal_entry_size);
477 }
478
479 static struct journal_sector *access_journal_data(struct dm_integrity_c *ic, unsigned section, unsigned n)
480 {
481         n <<= ic->sb->log2_sectors_per_block;
482
483         n += JOURNAL_BLOCK_SECTORS;
484
485         access_journal_check(ic, section, n, false, "access_journal_data");
486
487         return access_journal(ic, section, n);
488 }
489
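/*
 * Compute the MAC of one journal section: a hash (ic->journal_mac) over
 * the sector numbers of all entries in the section, truncated or
 * zero-padded to JOURNAL_MAC_SIZE. rw_section_mac() writes or verifies it,
 * JOURNAL_MAC_PER_SECTOR bytes per journal sector.
 */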
490 static void section_mac(struct dm_integrity_c *ic, unsigned section, __u8 result[JOURNAL_MAC_SIZE])
491 {
492         SHASH_DESC_ON_STACK(desc, ic->journal_mac);
493         int r;
494         unsigned j, size;
495
496         desc->tfm = ic->journal_mac;
497         desc->flags = 0;
498
499         r = crypto_shash_init(desc);
500         if (unlikely(r)) {
501                 dm_integrity_io_error(ic, "crypto_shash_init", r);
502                 goto err;
503         }
504
505         for (j = 0; j < ic->journal_section_entries; j++) {
506                 struct journal_entry *je = access_journal_entry(ic, section, j);
507                 r = crypto_shash_update(desc, (__u8 *)&je->u.sector, sizeof je->u.sector);
508                 if (unlikely(r)) {
509                         dm_integrity_io_error(ic, "crypto_shash_update", r);
510                         goto err;
511                 }
512         }
513
514         size = crypto_shash_digestsize(ic->journal_mac);
515
516         if (likely(size <= JOURNAL_MAC_SIZE)) {
517                 r = crypto_shash_final(desc, result);
518                 if (unlikely(r)) {
519                         dm_integrity_io_error(ic, "crypto_shash_final", r);
520                         goto err;
521                 }
522                 memset(result + size, 0, JOURNAL_MAC_SIZE - size);
523         } else {
524                 __u8 digest[size];
525                 r = crypto_shash_final(desc, digest);
526                 if (unlikely(r)) {
527                         dm_integrity_io_error(ic, "crypto_shash_final", r);
528                         goto err;
529                 }
530                 memcpy(result, digest, JOURNAL_MAC_SIZE);
531         }
532
533         return;
534 err:
535         memset(result, 0, JOURNAL_MAC_SIZE);
536 }
537
538 static void rw_section_mac(struct dm_integrity_c *ic, unsigned section, bool wr)
539 {
540         __u8 result[JOURNAL_MAC_SIZE];
541         unsigned j;
542
543         if (!ic->journal_mac)
544                 return;
545
546         section_mac(ic, section, result);
547
548         for (j = 0; j < JOURNAL_BLOCK_SECTORS; j++) {
549                 struct journal_sector *js = access_journal(ic, section, j);
550
551                 if (likely(wr))
552                         memcpy(&js->mac, result + (j * JOURNAL_MAC_PER_SECTOR), JOURNAL_MAC_PER_SECTOR);
553                 else {
554                         if (memcmp(&js->mac, result + (j * JOURNAL_MAC_PER_SECTOR), JOURNAL_MAC_PER_SECTOR))
555                                 dm_integrity_io_error(ic, "journal mac", -EILSEQ);
556                 }
557         }
558 }
559
560 static void complete_journal_op(void *context)
561 {
562         struct journal_completion *comp = context;
563         BUG_ON(!atomic_read(&comp->in_flight));
564         if (likely(atomic_dec_and_test(&comp->in_flight)))
565                 complete(&comp->comp);
566 }
567
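/*
 * Journal "encryption" by XOR: ic->journal_xor apparently holds a
 * pre-generated keystream, and async_xor() XORs the journal pages with it
 * into ic->journal_io before writing (and back into ic->journal when
 * reading). Section MACs are written just before each section is XORed.
 */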
568 static void xor_journal(struct dm_integrity_c *ic, bool encrypt, unsigned section,
569                         unsigned n_sections, struct journal_completion *comp)
570 {
571         struct async_submit_ctl submit;
572         size_t n_bytes = (size_t)(n_sections * ic->journal_section_sectors) << SECTOR_SHIFT;
573         unsigned pl_index, pl_offset, section_index;
574         struct page_list *source_pl, *target_pl;
575
576         if (likely(encrypt)) {
577                 source_pl = ic->journal;
578                 target_pl = ic->journal_io;
579         } else {
580                 source_pl = ic->journal_io;
581                 target_pl = ic->journal;
582         }
583
584         page_list_location(ic, section, 0, &pl_index, &pl_offset);
585
586         atomic_add(roundup(pl_offset + n_bytes, PAGE_SIZE) >> PAGE_SHIFT, &comp->in_flight);
587
588         init_async_submit(&submit, ASYNC_TX_XOR_ZERO_DST, NULL, complete_journal_op, comp, NULL);
589
590         section_index = pl_index;
591
592         do {
593                 size_t this_step;
594                 struct page *src_pages[2];
595                 struct page *dst_page;
596
597                 while (unlikely(pl_index == section_index)) {
598                         unsigned dummy;
599                         if (likely(encrypt))
600                                 rw_section_mac(ic, section, true);
601                         section++;
602                         n_sections--;
603                         if (!n_sections)
604                                 break;
605                         page_list_location(ic, section, 0, &section_index, &dummy);
606                 }
607
608                 this_step = min(n_bytes, (size_t)PAGE_SIZE - pl_offset);
609                 dst_page = target_pl[pl_index].page;
610                 src_pages[0] = source_pl[pl_index].page;
611                 src_pages[1] = ic->journal_xor[pl_index].page;
612
613                 async_xor(dst_page, src_pages, pl_offset, 2, this_step, &submit);
614
615                 pl_index++;
616                 pl_offset = 0;
617                 n_bytes -= this_step;
618         } while (n_bytes);
619
620         BUG_ON(n_sections);
621
622         async_tx_issue_pending_all();
623 }
624
625 static void complete_journal_encrypt(struct crypto_async_request *req, int err)
626 {
627         struct journal_completion *comp = req->data;
628         if (unlikely(err)) {
629                 if (likely(err == -EINPROGRESS)) {
630                         complete(&comp->ic->crypto_backoff);
631                         return;
632                 }
633                 dm_integrity_io_error(comp->ic, "asynchronous encrypt", err);
634         }
635         complete_journal_op(comp);
636 }
637
638 static bool do_crypt(bool encrypt, struct skcipher_request *req, struct journal_completion *comp)
639 {
640         int r;
641         skcipher_request_set_callback(req, CRYPTO_TFM_REQ_MAY_BACKLOG,
642                                       complete_journal_encrypt, comp);
643         if (likely(encrypt))
644                 r = crypto_skcipher_encrypt(req);
645         else
646                 r = crypto_skcipher_decrypt(req);
647         if (likely(!r))
648                 return false;
649         if (likely(r == -EINPROGRESS))
650                 return true;
651         if (likely(r == -EBUSY)) {
652                 wait_for_completion(&comp->ic->crypto_backoff);
653                 reinit_completion(&comp->ic->crypto_backoff);
654                 return true;
655         }
656         dm_integrity_io_error(comp->ic, "encrypt", r);
657         return false;
658 }
659
660 static void crypt_journal(struct dm_integrity_c *ic, bool encrypt, unsigned section,
661                           unsigned n_sections, struct journal_completion *comp)
662 {
663         struct scatterlist **source_sg;
664         struct scatterlist **target_sg;
665
666         atomic_add(2, &comp->in_flight);
667
668         if (likely(encrypt)) {
669                 source_sg = ic->journal_scatterlist;
670                 target_sg = ic->journal_io_scatterlist;
671         } else {
672                 source_sg = ic->journal_io_scatterlist;
673                 target_sg = ic->journal_scatterlist;
674         }
675
676         do {
677                 struct skcipher_request *req;
678                 unsigned ivsize;
679                 char *iv;
680
681                 if (likely(encrypt))
682                         rw_section_mac(ic, section, true);
683
684                 req = ic->sk_requests[section];
685                 ivsize = crypto_skcipher_ivsize(ic->journal_crypt);
686                 iv = req->iv;
687
688                 memcpy(iv, iv + ivsize, ivsize);
689
690                 req->src = source_sg[section];
691                 req->dst = target_sg[section];
692
693                 if (unlikely(do_crypt(encrypt, req, comp)))
694                         atomic_inc(&comp->in_flight);
695
696                 section++;
697                 n_sections--;
698         } while (n_sections);
699
700         atomic_dec(&comp->in_flight);
701         complete_journal_op(comp);
702 }
703
704 static void encrypt_journal(struct dm_integrity_c *ic, bool encrypt, unsigned section,
705                             unsigned n_sections, struct journal_completion *comp)
706 {
707         if (ic->journal_xor)
708                 return xor_journal(ic, encrypt, section, n_sections, comp);
709         else
710                 return crypt_journal(ic, encrypt, section, n_sections, comp);
711 }
712
713 static void complete_journal_io(unsigned long error, void *context)
714 {
715         struct journal_completion *comp = context;
716         if (unlikely(error != 0))
717                 dm_integrity_io_error(comp->ic, "writing journal", -EIO);
718         complete_journal_op(comp);
719 }
720
721 static void rw_journal(struct dm_integrity_c *ic, int op, int op_flags, unsigned section,
722                        unsigned n_sections, struct journal_completion *comp)
723 {
724         struct dm_io_request io_req;
725         struct dm_io_region io_loc;
726         unsigned sector, n_sectors, pl_index, pl_offset;
727         int r;
728
729         if (unlikely(dm_integrity_failed(ic))) {
730                 if (comp)
731                         complete_journal_io(-1UL, comp);
732                 return;
733         }
734
735         sector = section * ic->journal_section_sectors;
736         n_sectors = n_sections * ic->journal_section_sectors;
737
738         pl_index = sector >> (PAGE_SHIFT - SECTOR_SHIFT);
739         pl_offset = (sector << SECTOR_SHIFT) & (PAGE_SIZE - 1);
740
741         io_req.bi_op = op;
742         io_req.bi_op_flags = op_flags;
743         io_req.mem.type = DM_IO_PAGE_LIST;
744         if (ic->journal_io)
745                 io_req.mem.ptr.pl = &ic->journal_io[pl_index];
746         else
747                 io_req.mem.ptr.pl = &ic->journal[pl_index];
748         io_req.mem.offset = pl_offset;
749         if (likely(comp != NULL)) {
750                 io_req.notify.fn = complete_journal_io;
751                 io_req.notify.context = comp;
752         } else {
753                 io_req.notify.fn = NULL;
754         }
755         io_req.client = ic->io;
756         io_loc.bdev = ic->dev->bdev;
757         io_loc.sector = ic->start + SB_SECTORS + sector;
758         io_loc.count = n_sectors;
759
760         r = dm_io(&io_req, 1, &io_loc, NULL);
761         if (unlikely(r)) {
762                 dm_integrity_io_error(ic, op == REQ_OP_READ ? "reading journal" : "writing journal", r);
763                 if (comp) {
764                         WARN_ONCE(1, "asynchronous dm_io failed: %d", r);
765                         complete_journal_io(-1UL, comp);
766                 }
767         }
768 }
769
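/*
 * Commit commit_sections journal sections starting at commit_start. If the
 * range wraps past the end of the journal it is split into two writes, and
 * encryption of the wrapped part is overlapped with the I/O of the first
 * part where possible. All journal writes use REQ_FUA so that a finished
 * commit is durable.
 */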
770 static void write_journal(struct dm_integrity_c *ic, unsigned commit_start, unsigned commit_sections)
771 {
772         struct journal_completion io_comp;
773         struct journal_completion crypt_comp_1;
774         struct journal_completion crypt_comp_2;
775         unsigned i;
776
777         io_comp.ic = ic;
778         init_completion(&io_comp.comp);
779
780         if (commit_start + commit_sections <= ic->journal_sections) {
781                 io_comp.in_flight = (atomic_t)ATOMIC_INIT(1);
782                 if (ic->journal_io) {
783                         crypt_comp_1.ic = ic;
784                         init_completion(&crypt_comp_1.comp);
785                         crypt_comp_1.in_flight = (atomic_t)ATOMIC_INIT(0);
786                         encrypt_journal(ic, true, commit_start, commit_sections, &crypt_comp_1);
787                         wait_for_completion_io(&crypt_comp_1.comp);
788                 } else {
789                         for (i = 0; i < commit_sections; i++)
790                                 rw_section_mac(ic, commit_start + i, true);
791                 }
792                 rw_journal(ic, REQ_OP_WRITE, REQ_FUA | REQ_SYNC, commit_start,
793                            commit_sections, &io_comp);
794         } else {
795                 unsigned to_end;
796                 io_comp.in_flight = (atomic_t)ATOMIC_INIT(2);
797                 to_end = ic->journal_sections - commit_start;
798                 if (ic->journal_io) {
799                         crypt_comp_1.ic = ic;
800                         init_completion(&crypt_comp_1.comp);
801                         crypt_comp_1.in_flight = (atomic_t)ATOMIC_INIT(0);
802                         encrypt_journal(ic, true, commit_start, to_end, &crypt_comp_1);
803                         if (try_wait_for_completion(&crypt_comp_1.comp)) {
804                                 rw_journal(ic, REQ_OP_WRITE, REQ_FUA, commit_start, to_end, &io_comp);
805                                 reinit_completion(&crypt_comp_1.comp);
806                                 crypt_comp_1.in_flight = (atomic_t)ATOMIC_INIT(0);
807                                 encrypt_journal(ic, true, 0, commit_sections - to_end, &crypt_comp_1);
808                                 wait_for_completion_io(&crypt_comp_1.comp);
809                         } else {
810                                 crypt_comp_2.ic = ic;
811                                 init_completion(&crypt_comp_2.comp);
812                                 crypt_comp_2.in_flight = (atomic_t)ATOMIC_INIT(0);
813                                 encrypt_journal(ic, true, 0, commit_sections - to_end, &crypt_comp_2);
814                                 wait_for_completion_io(&crypt_comp_1.comp);
815                                 rw_journal(ic, REQ_OP_WRITE, REQ_FUA, commit_start, to_end, &io_comp);
816                                 wait_for_completion_io(&crypt_comp_2.comp);
817                         }
818                 } else {
819                         for (i = 0; i < to_end; i++)
820                                 rw_section_mac(ic, commit_start + i, true);
821                         rw_journal(ic, REQ_OP_WRITE, REQ_FUA, commit_start, to_end, &io_comp);
822                         for (i = 0; i < commit_sections - to_end; i++)
823                                 rw_section_mac(ic, i, true);
824                 }
825                 rw_journal(ic, REQ_OP_WRITE, REQ_FUA, 0, commit_sections - to_end, &io_comp);
826         }
827
828         wait_for_completion_io(&io_comp.comp);
829 }
830
831 static void copy_from_journal(struct dm_integrity_c *ic, unsigned section, unsigned offset,
832                               unsigned n_sectors, sector_t target, io_notify_fn fn, void *data)
833 {
834         struct dm_io_request io_req;
835         struct dm_io_region io_loc;
836         int r;
837         unsigned sector, pl_index, pl_offset;
838
839         BUG_ON((target | n_sectors | offset) & (unsigned)(ic->sectors_per_block - 1));
840
841         if (unlikely(dm_integrity_failed(ic))) {
842                 fn(-1UL, data);
843                 return;
844         }
845
846         sector = section * ic->journal_section_sectors + JOURNAL_BLOCK_SECTORS + offset;
847
848         pl_index = sector >> (PAGE_SHIFT - SECTOR_SHIFT);
849         pl_offset = (sector << SECTOR_SHIFT) & (PAGE_SIZE - 1);
850
851         io_req.bi_op = REQ_OP_WRITE;
852         io_req.bi_op_flags = 0;
853         io_req.mem.type = DM_IO_PAGE_LIST;
854         io_req.mem.ptr.pl = &ic->journal[pl_index];
855         io_req.mem.offset = pl_offset;
856         io_req.notify.fn = fn;
857         io_req.notify.context = data;
858         io_req.client = ic->io;
859         io_loc.bdev = ic->dev->bdev;
860         io_loc.sector = ic->start + target;
861         io_loc.count = n_sectors;
862
863         r = dm_io(&io_req, 1, &io_loc, NULL);
864         if (unlikely(r)) {
865                 WARN_ONCE(1, "asynchronous dm_io failed: %d", r);
866                 fn(-1UL, data);
867         }
868 }
869
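/*
 * Try to insert a new in-progress range into ic->in_progress. Returns false
 * if it overlaps a range that is already being processed; callers wait on
 * ic->endio_wait, which remove_range() wakes up.
 */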
870 static bool add_new_range(struct dm_integrity_c *ic, struct dm_integrity_range *new_range)
871 {
872         struct rb_node **n = &ic->in_progress.rb_node;
873         struct rb_node *parent;
874
875         BUG_ON((new_range->logical_sector | new_range->n_sectors) & (unsigned)(ic->sectors_per_block - 1));
876
877         parent = NULL;
878
879         while (*n) {
880                 struct dm_integrity_range *range = container_of(*n, struct dm_integrity_range, node);
881
882                 parent = *n;
883                 if (new_range->logical_sector + new_range->n_sectors <= range->logical_sector) {
884                         n = &range->node.rb_left;
885                 } else if (new_range->logical_sector >= range->logical_sector + range->n_sectors) {
886                         n = &range->node.rb_right;
887                 } else {
888                         return false;
889                 }
890         }
891
892         rb_link_node(&new_range->node, parent, n);
893         rb_insert_color(&new_range->node, &ic->in_progress);
894
895         return true;
896 }
897
898 static void remove_range_unlocked(struct dm_integrity_c *ic, struct dm_integrity_range *range)
899 {
900         rb_erase(&range->node, &ic->in_progress);
901         wake_up_locked(&ic->endio_wait);
902 }
903
904 static void remove_range(struct dm_integrity_c *ic, struct dm_integrity_range *range)
905 {
906         unsigned long flags;
907
908         spin_lock_irqsave(&ic->endio_wait.lock, flags);
909         remove_range_unlocked(ic, range);
910         spin_unlock_irqrestore(&ic->endio_wait.lock, flags);
911 }
912
913 static void init_journal_node(struct journal_node *node)
914 {
915         RB_CLEAR_NODE(&node->node);
916         node->sector = (sector_t)-1;
917 }
918
919 static void add_journal_node(struct dm_integrity_c *ic, struct journal_node *node, sector_t sector)
920 {
921         struct rb_node **link;
922         struct rb_node *parent;
923
924         node->sector = sector;
925         BUG_ON(!RB_EMPTY_NODE(&node->node));
926
927         link = &ic->journal_tree_root.rb_node;
928         parent = NULL;
929
930         while (*link) {
931                 struct journal_node *j;
932                 parent = *link;
933                 j = container_of(parent, struct journal_node, node);
934                 if (sector < j->sector)
935                         link = &j->node.rb_left;
936                 else
937                         link = &j->node.rb_right;
938         }
939
940         rb_link_node(&node->node, parent, link);
941         rb_insert_color(&node->node, &ic->journal_tree_root);
942 }
943
944 static void remove_journal_node(struct dm_integrity_c *ic, struct journal_node *node)
945 {
946         BUG_ON(RB_EMPTY_NODE(&node->node));
947         rb_erase(&node->node, &ic->journal_tree_root);
948         init_journal_node(node);
949 }
950
951 #define NOT_FOUND       (-1U)
952
953 static unsigned find_journal_node(struct dm_integrity_c *ic, sector_t sector, sector_t *next_sector)
954 {
955         struct rb_node *n = ic->journal_tree_root.rb_node;
956         unsigned found = NOT_FOUND;
957         *next_sector = (sector_t)-1;
958         while (n) {
959                 struct journal_node *j = container_of(n, struct journal_node, node);
960                 if (sector == j->sector) {
961                         found = j - ic->journal_tree;
962                 }
963                 if (sector < j->sector) {
964                         *next_sector = j->sector;
965                         n = j->node.rb_left;
966                 } else {
967                         n = j->node.rb_right;
968                 }
969         }
970
971         return found;
972 }
973
974 static bool test_journal_node(struct dm_integrity_c *ic, unsigned pos, sector_t sector)
975 {
976         struct journal_node *node, *next_node;
977         struct rb_node *next;
978
979         if (unlikely(pos >= ic->journal_entries))
980                 return false;
981         node = &ic->journal_tree[pos];
982         if (unlikely(RB_EMPTY_NODE(&node->node)))
983                 return false;
984         if (unlikely(node->sector != sector))
985                 return false;
986
987         next = rb_next(&node->node);
988         if (unlikely(!next))
989                 return true;
990
991         next_node = container_of(next, struct journal_node, node);
992         return next_node->sector != sector;
993 }
994
995 static bool find_newer_committed_node(struct dm_integrity_c *ic, struct journal_node *node)
996 {
997         struct rb_node *next;
998         struct journal_node *next_node;
999         unsigned next_section;
1000
1001         BUG_ON(RB_EMPTY_NODE(&node->node));
1002
1003         next = rb_next(&node->node);
1004         if (unlikely(!next))
1005                 return false;
1006
1007         next_node = container_of(next, struct journal_node, node);
1008
1009         if (next_node->sector != node->sector)
1010                 return false;
1011
1012         next_section = (unsigned)(next_node - ic->journal_tree) / ic->journal_section_entries;
1013         if (next_section >= ic->committed_section &&
1014             next_section < ic->committed_section + ic->n_committed_sections)
1015                 return true;
1016         if (next_section + ic->journal_sections < ic->committed_section + ic->n_committed_sections)
1017                 return true;
1018
1019         return false;
1020 }
1021
1022 #define TAG_READ        0
1023 #define TAG_WRITE       1
1024 #define TAG_CMP         2
1025
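/*
 * Read, write or compare tag bytes in the metadata area through dm-bufio;
 * *metadata_block and *metadata_offset are advanced as tags are consumed.
 * For TAG_CMP, 0 means all tags matched; a mismatch returns the number of
 * tag bytes still outstanding at the first mismatching byte, which the
 * caller uses to compute the failing sector.
 */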
1026 static int dm_integrity_rw_tag(struct dm_integrity_c *ic, unsigned char *tag, sector_t *metadata_block,
1027                                unsigned *metadata_offset, unsigned total_size, int op)
1028 {
1029         do {
1030                 unsigned char *data, *dp;
1031                 struct dm_buffer *b;
1032                 unsigned to_copy;
1033                 int r;
1034
1035                 r = dm_integrity_failed(ic);
1036                 if (unlikely(r))
1037                         return r;
1038
1039                 data = dm_bufio_read(ic->bufio, *metadata_block, &b);
1040                 if (unlikely(IS_ERR(data)))
1041                         return PTR_ERR(data);
1042
1043                 to_copy = min((1U << SECTOR_SHIFT << ic->log2_buffer_sectors) - *metadata_offset, total_size);
1044                 dp = data + *metadata_offset;
1045                 if (op == TAG_READ) {
1046                         memcpy(tag, dp, to_copy);
1047                 } else if (op == TAG_WRITE) {
1048                         memcpy(dp, tag, to_copy);
1049                         dm_bufio_mark_partial_buffer_dirty(b, *metadata_offset, *metadata_offset + to_copy);
1050                 } else  {
1051                         /* e.g.: op == TAG_CMP */
1052                         if (unlikely(memcmp(dp, tag, to_copy))) {
1053                                 unsigned i;
1054
1055                                 for (i = 0; i < to_copy; i++) {
1056                                         if (dp[i] != tag[i])
1057                                                 break;
1058                                         total_size--;
1059                                 }
1060                                 dm_bufio_release(b);
1061                                 return total_size;
1062                         }
1063                 }
1064                 dm_bufio_release(b);
1065
1066                 tag += to_copy;
1067                 *metadata_offset += to_copy;
1068                 if (unlikely(*metadata_offset == 1U << SECTOR_SHIFT << ic->log2_buffer_sectors)) {
1069                         (*metadata_block)++;
1070                         *metadata_offset = 0;
1071                 }
1072                 total_size -= to_copy;
1073         } while (unlikely(total_size));
1074
1075         return 0;
1076 }
1077
1078 static void dm_integrity_flush_buffers(struct dm_integrity_c *ic)
1079 {
1080         int r;
1081         r = dm_bufio_write_dirty_buffers(ic->bufio);
1082         if (unlikely(r))
1083                 dm_integrity_io_error(ic, "writing tags", r);
1084 }
1085
1086 static void sleep_on_endio_wait(struct dm_integrity_c *ic)
1087 {
1088         DECLARE_WAITQUEUE(wait, current);
1089         __add_wait_queue(&ic->endio_wait, &wait);
1090         __set_current_state(TASK_UNINTERRUPTIBLE);
1091         spin_unlock_irq(&ic->endio_wait.lock);
1092         io_schedule();
1093         spin_lock_irq(&ic->endio_wait.lock);
1094         __remove_wait_queue(&ic->endio_wait, &wait);
1095 }
1096
1097 static void autocommit_fn(unsigned long data)
1098 {
1099         struct dm_integrity_c *ic = (struct dm_integrity_c *)data;
1100
1101         if (likely(!dm_integrity_failed(ic)))
1102                 queue_work(ic->commit_wq, &ic->commit_work);
1103 }
1104
1105 static void schedule_autocommit(struct dm_integrity_c *ic)
1106 {
1107         if (!timer_pending(&ic->autocommit_timer))
1108                 mod_timer(&ic->autocommit_timer, jiffies + ic->autocommit_jiffies);
1109 }
1110
1111 static void submit_flush_bio(struct dm_integrity_c *ic, struct dm_integrity_io *dio)
1112 {
1113         struct bio *bio;
1114         unsigned long flags;
1115
1116         spin_lock_irqsave(&ic->endio_wait.lock, flags);
1117         bio = dm_bio_from_per_bio_data(dio, sizeof(struct dm_integrity_io));
1118         bio_list_add(&ic->flush_bio_list, bio);
1119         spin_unlock_irqrestore(&ic->endio_wait.lock, flags);
1120
1121         queue_work(ic->commit_wq, &ic->commit_work);
1122 }
1123
1124 static void do_endio(struct dm_integrity_c *ic, struct bio *bio)
1125 {
1126         int r = dm_integrity_failed(ic);
1127         if (unlikely(r) && !bio->bi_status)
1128                 bio->bi_status = errno_to_blk_status(r);
1129         bio_endio(bio);
1130 }
1131
1132 static void do_endio_flush(struct dm_integrity_c *ic, struct dm_integrity_io *dio)
1133 {
1134         struct bio *bio = dm_bio_from_per_bio_data(dio, sizeof(struct dm_integrity_io));
1135
1136         if (unlikely(dio->fua) && likely(!bio->bi_status) && likely(!dm_integrity_failed(ic)))
1137                 submit_flush_bio(ic, dio);
1138         else
1139                 do_endio(ic, bio);
1140 }
1141
1142 static void dec_in_flight(struct dm_integrity_io *dio)
1143 {
1144         if (atomic_dec_and_test(&dio->in_flight)) {
1145                 struct dm_integrity_c *ic = dio->ic;
1146                 struct bio *bio;
1147
1148                 remove_range(ic, &dio->range);
1149
1150                 if (unlikely(dio->write))
1151                         schedule_autocommit(ic);
1152
1153                 bio = dm_bio_from_per_bio_data(dio, sizeof(struct dm_integrity_io));
1154
1155                 if (unlikely(dio->bi_status) && !bio->bi_status)
1156                         bio->bi_status = dio->bi_status;
1157                 if (likely(!bio->bi_status) && unlikely(bio_sectors(bio) != dio->range.n_sectors)) {
1158                         dio->range.logical_sector += dio->range.n_sectors;
1159                         bio_advance(bio, dio->range.n_sectors << SECTOR_SHIFT);
1160                         INIT_WORK(&dio->work, integrity_bio_wait);
1161                         queue_work(ic->offload_wq, &dio->work);
1162                         return;
1163                 }
1164                 do_endio_flush(ic, dio);
1165         }
1166 }
1167
1168 static void integrity_end_io(struct bio *bio)
1169 {
1170         struct dm_integrity_io *dio = dm_per_bio_data(bio, sizeof(struct dm_integrity_io));
1171
1172         bio->bi_iter = dio->orig_bi_iter;
1173         bio->bi_disk = dio->orig_bi_disk;
1174         bio->bi_partno = dio->orig_bi_partno;
1175         if (dio->orig_bi_integrity) {
1176                 bio->bi_integrity = dio->orig_bi_integrity;
1177                 bio->bi_opf |= REQ_INTEGRITY;
1178         }
1179         bio->bi_end_io = dio->orig_bi_end_io;
1180
1181         if (dio->completion)
1182                 complete(dio->completion);
1183
1184         dec_in_flight(dio);
1185 }
1186
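/*
 * Compute the internal-hash checksum of one block: hash of the
 * little-endian sector number followed by the block data, zero-padded to
 * ic->tag_size if the digest is shorter.
 */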
1187 static void integrity_sector_checksum(struct dm_integrity_c *ic, sector_t sector,
1188                                       const char *data, char *result)
1189 {
1190         __u64 sector_le = cpu_to_le64(sector);
1191         SHASH_DESC_ON_STACK(req, ic->internal_hash);
1192         int r;
1193         unsigned digest_size;
1194
1195         req->tfm = ic->internal_hash;
1196         req->flags = 0;
1197
1198         r = crypto_shash_init(req);
1199         if (unlikely(r < 0)) {
1200                 dm_integrity_io_error(ic, "crypto_shash_init", r);
1201                 goto failed;
1202         }
1203
1204         r = crypto_shash_update(req, (const __u8 *)&sector_le, sizeof sector_le);
1205         if (unlikely(r < 0)) {
1206                 dm_integrity_io_error(ic, "crypto_shash_update", r);
1207                 goto failed;
1208         }
1209
1210         r = crypto_shash_update(req, data, ic->sectors_per_block << SECTOR_SHIFT);
1211         if (unlikely(r < 0)) {
1212                 dm_integrity_io_error(ic, "crypto_shash_update", r);
1213                 goto failed;
1214         }
1215
1216         r = crypto_shash_final(req, result);
1217         if (unlikely(r < 0)) {
1218                 dm_integrity_io_error(ic, "crypto_shash_final", r);
1219                 goto failed;
1220         }
1221
1222         digest_size = crypto_shash_digestsize(ic->internal_hash);
1223         if (unlikely(digest_size < ic->tag_size))
1224                 memset(result + digest_size, 0, ic->tag_size - digest_size);
1225
1226         return;
1227
1228 failed:
1229         /* this shouldn't happen anyway; the hash functions have no reason to fail */
1230         get_random_bytes(result, ic->tag_size);
1231 }
1232
1233 static void integrity_metadata(struct work_struct *w)
1234 {
1235         struct dm_integrity_io *dio = container_of(w, struct dm_integrity_io, work);
1236         struct dm_integrity_c *ic = dio->ic;
1237
1238         int r;
1239
1240         if (ic->internal_hash) {
1241                 struct bvec_iter iter;
1242                 struct bio_vec bv;
1243                 unsigned digest_size = crypto_shash_digestsize(ic->internal_hash);
1244                 struct bio *bio = dm_bio_from_per_bio_data(dio, sizeof(struct dm_integrity_io));
1245                 char *checksums;
1246                 unsigned extra_space = unlikely(digest_size > ic->tag_size) ? digest_size - ic->tag_size : 0;
1247                 char checksums_onstack[ic->tag_size + extra_space];
1248                 unsigned sectors_to_process = dio->range.n_sectors;
1249                 sector_t sector = dio->range.logical_sector;
1250
1251                 if (unlikely(ic->mode == 'R'))
1252                         goto skip_io;
1253
1254                 checksums = kmalloc((PAGE_SIZE >> SECTOR_SHIFT >> ic->sb->log2_sectors_per_block) * ic->tag_size + extra_space,
1255                                     GFP_NOIO | __GFP_NORETRY | __GFP_NOWARN);
1256                 if (!checksums)
1257                         checksums = checksums_onstack;
1258
1259                 __bio_for_each_segment(bv, bio, iter, dio->orig_bi_iter) {
1260                         unsigned pos;
1261                         char *mem, *checksums_ptr;
1262
1263 again:
1264                         mem = (char *)kmap_atomic(bv.bv_page) + bv.bv_offset;
1265                         pos = 0;
1266                         checksums_ptr = checksums;
1267                         do {
1268                                 integrity_sector_checksum(ic, sector, mem + pos, checksums_ptr);
1269                                 checksums_ptr += ic->tag_size;
1270                                 sectors_to_process -= ic->sectors_per_block;
1271                                 pos += ic->sectors_per_block << SECTOR_SHIFT;
1272                                 sector += ic->sectors_per_block;
1273                         } while (pos < bv.bv_len && sectors_to_process && checksums != checksums_onstack);
1274                         kunmap_atomic(mem);
1275
1276                         r = dm_integrity_rw_tag(ic, checksums, &dio->metadata_block, &dio->metadata_offset,
1277                                                 checksums_ptr - checksums, !dio->write ? TAG_CMP : TAG_WRITE);
1278                         if (unlikely(r)) {
1279                                 if (r > 0) {
1280                                         DMERR_LIMIT("Checksum failed at sector 0x%llx",
1281                                                     (unsigned long long)(sector - ((r + ic->tag_size - 1) / ic->tag_size)));
1282                                         r = -EILSEQ;
1283                                         atomic64_inc(&ic->number_of_mismatches);
1284                                 }
1285                                 if (likely(checksums != checksums_onstack))
1286                                         kfree(checksums);
1287                                 goto error;
1288                         }
1289
1290                         if (!sectors_to_process)
1291                                 break;
1292
1293                         if (unlikely(pos < bv.bv_len)) {
1294                                 bv.bv_offset += pos;
1295                                 bv.bv_len -= pos;
1296                                 goto again;
1297                         }
1298                 }
1299
1300                 if (likely(checksums != checksums_onstack))
1301                         kfree(checksums);
1302         } else {
1303                 struct bio_integrity_payload *bip = dio->orig_bi_integrity;
1304
1305                 if (bip) {
1306                         struct bio_vec biv;
1307                         struct bvec_iter iter;
1308                         unsigned data_to_process = dio->range.n_sectors;
1309                         sector_to_block(ic, data_to_process);
1310                         data_to_process *= ic->tag_size;
1311
1312                         bip_for_each_vec(biv, bip, iter) {
1313                                 unsigned char *tag;
1314                                 unsigned this_len;
1315
1316                                 BUG_ON(PageHighMem(biv.bv_page));
1317                                 tag = lowmem_page_address(biv.bv_page) + biv.bv_offset;
1318                                 this_len = min(biv.bv_len, data_to_process);
1319                                 r = dm_integrity_rw_tag(ic, tag, &dio->metadata_block, &dio->metadata_offset,
1320                                                         this_len, !dio->write ? TAG_READ : TAG_WRITE);
1321                                 if (unlikely(r))
1322                                         goto error;
1323                                 data_to_process -= this_len;
1324                                 if (!data_to_process)
1325                                         break;
1326                         }
1327                 }
1328         }
1329 skip_io:
1330         dec_in_flight(dio);
1331         return;
1332 error:
1333         dio->bi_status = errno_to_blk_status(r);
1334         dec_in_flight(dio);
1335 }
1336
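/*
 * Target map function. Flush bios are queued for the commit worker; other
 * bios are checked for bounds and block alignment, have REQ_FUA stripped
 * (FUA semantics are provided by do_endio_flush() on completion), are
 * remapped into the interleaved data layout and passed to
 * dm_integrity_map_continue().
 */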
1337 static int dm_integrity_map(struct dm_target *ti, struct bio *bio)
1338 {
1339         struct dm_integrity_c *ic = ti->private;
1340         struct dm_integrity_io *dio = dm_per_bio_data(bio, sizeof(struct dm_integrity_io));
1341         struct bio_integrity_payload *bip;
1342
1343         sector_t area, offset;
1344
1345         dio->ic = ic;
1346         dio->bi_status = 0;
1347
1348         if (unlikely(bio->bi_opf & REQ_PREFLUSH)) {
1349                 submit_flush_bio(ic, dio);
1350                 return DM_MAPIO_SUBMITTED;
1351         }
1352
1353         dio->range.logical_sector = dm_target_offset(ti, bio->bi_iter.bi_sector);
1354         dio->write = bio_op(bio) == REQ_OP_WRITE;
1355         dio->fua = dio->write && bio->bi_opf & REQ_FUA;
1356         if (unlikely(dio->fua)) {
1357                 /*
1358                  * Don't pass down the FUA flag because we have to flush
1359                  * the disk cache anyway.
1360                  */
1361                 bio->bi_opf &= ~REQ_FUA;
1362         }
1363         if (unlikely(dio->range.logical_sector + bio_sectors(bio) > ic->provided_data_sectors)) {
1364                 DMERR("Too big sector number: 0x%llx + 0x%x > 0x%llx",
1365                       (unsigned long long)dio->range.logical_sector, bio_sectors(bio),
1366                       (unsigned long long)ic->provided_data_sectors);
1367                 return DM_MAPIO_KILL;
1368         }
1369         if (unlikely((dio->range.logical_sector | bio_sectors(bio)) & (unsigned)(ic->sectors_per_block - 1))) {
1370                 DMERR("Bio not aligned on %u sectors: 0x%llx, 0x%x",
1371                       ic->sectors_per_block,
1372                       (unsigned long long)dio->range.logical_sector, bio_sectors(bio));
1373                 return DM_MAPIO_KILL;
1374         }
1375
1376         if (ic->sectors_per_block > 1) {
1377                 struct bvec_iter iter;
1378                 struct bio_vec bv;
1379                 bio_for_each_segment(bv, bio, iter) {
1380                         if (unlikely(bv.bv_len & ((ic->sectors_per_block << SECTOR_SHIFT) - 1))) {
1381                                 DMERR("Bio vector (%u,%u) is not aligned on %u-sector boundary",
1382                                         bv.bv_offset, bv.bv_len, ic->sectors_per_block);
1383                                 return DM_MAPIO_KILL;
1384                         }
1385                 }
1386         }
1387
1388         bip = bio_integrity(bio);
1389         if (!ic->internal_hash) {
1390                 if (bip) {
1391                         unsigned wanted_tag_size = bio_sectors(bio) >> ic->sb->log2_sectors_per_block;
1392                         if (ic->log2_tag_size >= 0)
1393                                 wanted_tag_size <<= ic->log2_tag_size;
1394                         else
1395                                 wanted_tag_size *= ic->tag_size;
1396                         if (unlikely(wanted_tag_size != bip->bip_iter.bi_size)) {
1397                                 DMERR("Invalid integrity data size %u, expected %u", bip->bip_iter.bi_size, wanted_tag_size);
1398                                 return DM_MAPIO_KILL;
1399                         }
1400                 }
1401         } else {
1402                 if (unlikely(bip != NULL)) {
1403                         DMERR("Unexpected integrity data when using internal hash");
1404                         return DM_MAPIO_KILL;
1405                 }
1406         }
1407
1408         if (unlikely(ic->mode == 'R') && unlikely(dio->write))
1409                 return DM_MAPIO_KILL;
1410
1411         get_area_and_offset(ic, dio->range.logical_sector, &area, &offset);
1412         dio->metadata_block = get_metadata_sector_and_offset(ic, area, offset, &dio->metadata_offset);
1413         bio->bi_iter.bi_sector = get_data_sector(ic, area, offset);
1414
1415         dm_integrity_map_continue(dio, true);
1416         return DM_MAPIO_SUBMITTED;
1417 }
1418
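/*
 * Copy data between a bio and the in-memory journal, starting at the given
 * journal section and entry. On reads, journal data is copied into the bio
 * and the last 8 bytes of every sector are restored from je->last_bytes
 * (they are displaced in the journal by the commit_id); on writes, bio data
 * is staged into the journal instead.
 */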
1419 static bool __journal_read_write(struct dm_integrity_io *dio, struct bio *bio,
1420                                  unsigned journal_section, unsigned journal_entry)
1421 {
1422         struct dm_integrity_c *ic = dio->ic;
1423         sector_t logical_sector;
1424         unsigned n_sectors;
1425
1426         logical_sector = dio->range.logical_sector;
1427         n_sectors = dio->range.n_sectors;
1428         do {
1429                 struct bio_vec bv = bio_iovec(bio);
1430                 char *mem;
1431
1432                 if (unlikely(bv.bv_len >> SECTOR_SHIFT > n_sectors))
1433                         bv.bv_len = n_sectors << SECTOR_SHIFT;
1434                 n_sectors -= bv.bv_len >> SECTOR_SHIFT;
1435                 bio_advance_iter(bio, &bio->bi_iter, bv.bv_len);
1436 retry_kmap:
1437                 mem = kmap_atomic(bv.bv_page);
1438                 if (likely(dio->write))
1439                         flush_dcache_page(bv.bv_page);
1440
1441                 do {
1442                         struct journal_entry *je = access_journal_entry(ic, journal_section, journal_entry);
1443
1444                         if (unlikely(!dio->write)) {
1445                                 struct journal_sector *js;
1446                                 char *mem_ptr;
1447                                 unsigned s;
1448
1449                                 if (unlikely(journal_entry_is_inprogress(je))) {
1450                                         flush_dcache_page(bv.bv_page);
1451                                         kunmap_atomic(mem);
1452
1453                                         __io_wait_event(ic->copy_to_journal_wait, !journal_entry_is_inprogress(je));
1454                                         goto retry_kmap;
1455                                 }
1456                                 smp_rmb();
1457                                 BUG_ON(journal_entry_get_sector(je) != logical_sector);
1458                                 js = access_journal_data(ic, journal_section, journal_entry);
1459                                 mem_ptr = mem + bv.bv_offset;
1460                                 s = 0;
1461                                 do {
1462                                         memcpy(mem_ptr, js, JOURNAL_SECTOR_DATA);
1463                                         *(commit_id_t *)(mem_ptr + JOURNAL_SECTOR_DATA) = je->last_bytes[s];
1464                                         js++;
1465                                         mem_ptr += 1 << SECTOR_SHIFT;
1466                                 } while (++s < ic->sectors_per_block);
1467 #ifdef INTERNAL_VERIFY
1468                                 if (ic->internal_hash) {
1469                                         char checksums_onstack[max(crypto_shash_digestsize(ic->internal_hash), ic->tag_size)];
1470
1471                                         integrity_sector_checksum(ic, logical_sector, mem + bv.bv_offset, checksums_onstack);
1472                                         if (unlikely(memcmp(checksums_onstack, journal_entry_tag(ic, je), ic->tag_size))) {
1473                                                 DMERR_LIMIT("Checksum failed when reading from journal, at sector 0x%llx",
1474                                                             (unsigned long long)logical_sector);
1475                                         }
1476                                 }
1477 #endif
1478                         }
1479
1480                         if (!ic->internal_hash) {
1481                                 struct bio_integrity_payload *bip = bio_integrity(bio);
1482                                 unsigned tag_todo = ic->tag_size;
1483                                 char *tag_ptr = journal_entry_tag(ic, je);
1484
1485                                 if (bip) do {
1486                                         struct bio_vec biv = bvec_iter_bvec(bip->bip_vec, bip->bip_iter);
1487                                         unsigned tag_now = min(biv.bv_len, tag_todo);
1488                                         char *tag_addr;
1489                                         BUG_ON(PageHighMem(biv.bv_page));
1490                                         tag_addr = lowmem_page_address(biv.bv_page) + biv.bv_offset;
1491                                         if (likely(dio->write))
1492                                                 memcpy(tag_ptr, tag_addr, tag_now);
1493                                         else
1494                                                 memcpy(tag_addr, tag_ptr, tag_now);
1495                                         bvec_iter_advance(bip->bip_vec, &bip->bip_iter, tag_now);
1496                                         tag_ptr += tag_now;
1497                                         tag_todo -= tag_now;
1498                                 } while (unlikely(tag_todo)); else {
1499                                         if (likely(dio->write))
1500                                                 memset(tag_ptr, 0, tag_todo);
1501                                 }
1502                         }
1503
1504                         if (likely(dio->write)) {
1505                                 struct journal_sector *js;
1506                                 unsigned s;
1507
1508                                 js = access_journal_data(ic, journal_section, journal_entry);
1509                                 memcpy(js, mem + bv.bv_offset, ic->sectors_per_block << SECTOR_SHIFT);
1510
1511                                 s = 0;
1512                                 do {
1513                                         je->last_bytes[s] = js[s].commit_id;
1514                                 } while (++s < ic->sectors_per_block);
1515
1516                                 if (ic->internal_hash) {
1517                                         unsigned digest_size = crypto_shash_digestsize(ic->internal_hash);
1518                                         if (unlikely(digest_size > ic->tag_size)) {
1519                                                 char checksums_onstack[digest_size];
1520                                                 integrity_sector_checksum(ic, logical_sector, (char *)js, checksums_onstack);
1521                                                 memcpy(journal_entry_tag(ic, je), checksums_onstack, ic->tag_size);
1522                                         } else
1523                                                 integrity_sector_checksum(ic, logical_sector, (char *)js, journal_entry_tag(ic, je));
1524                                 }
1525
1526                                 journal_entry_set_sector(je, logical_sector);
1527                         }
1528                         logical_sector += ic->sectors_per_block;
1529
1530                         journal_entry++;
1531                         if (unlikely(journal_entry == ic->journal_section_entries)) {
1532                                 journal_entry = 0;
1533                                 journal_section++;
1534                                 wraparound_section(ic, &journal_section);
1535                         }
1536
1537                         bv.bv_offset += ic->sectors_per_block << SECTOR_SHIFT;
1538                 } while (bv.bv_len -= ic->sectors_per_block << SECTOR_SHIFT);
1539
1540                 if (unlikely(!dio->write))
1541                         flush_dcache_page(bv.bv_page);
1542                 kunmap_atomic(mem);
1543         } while (n_sectors);
1544
1545         if (likely(dio->write)) {
1546                 smp_mb();
1547                 if (unlikely(waitqueue_active(&ic->copy_to_journal_wait)))
1548                         wake_up(&ic->copy_to_journal_wait);
1549                 if (ACCESS_ONCE(ic->free_sectors) <= ic->free_sectors_threshold) {
1550                         queue_work(ic->commit_wq, &ic->commit_work);
1551                 } else {
1552                         schedule_autocommit(ic);
1553                 }
1554         } else {
1555                 remove_range(ic, &dio->range);
1556         }
1557
1558         if (unlikely(bio->bi_iter.bi_size)) {
1559                 sector_t area, offset;
1560
1561                 dio->range.logical_sector = logical_sector;
1562                 get_area_and_offset(ic, dio->range.logical_sector, &area, &offset);
1563                 dio->metadata_block = get_metadata_sector_and_offset(ic, area, offset, &dio->metadata_offset);
1564                 return true;
1565         }
1566
1567         return false;
1568 }
1569
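/*
 * Continue processing a bio after dm_integrity_map().  In journal mode,
 * writes are given space in the in-memory journal and reads are serviced from
 * the journal if the sectors are found there; otherwise the bio is redirected
 * to the data device and the tags are verified or written by
 * integrity_metadata().  When called from the map function this must not
 * sleep, so anything that might block is offloaded to a workqueue.
 */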
1570 static void dm_integrity_map_continue(struct dm_integrity_io *dio, bool from_map)
1571 {
1572         struct dm_integrity_c *ic = dio->ic;
1573         struct bio *bio = dm_bio_from_per_bio_data(dio, sizeof(struct dm_integrity_io));
1574         unsigned journal_section, journal_entry;
1575         unsigned journal_read_pos;
1576         struct completion read_comp;
1577         bool need_sync_io = ic->internal_hash && !dio->write;
1578
1579         if (need_sync_io && from_map) {
1580                 INIT_WORK(&dio->work, integrity_bio_wait);
1581                 queue_work(ic->offload_wq, &dio->work);
1582                 return;
1583         }
1584
1585 lock_retry:
1586         spin_lock_irq(&ic->endio_wait.lock);
1587 retry:
1588         if (unlikely(dm_integrity_failed(ic))) {
1589                 spin_unlock_irq(&ic->endio_wait.lock);
1590                 do_endio(ic, bio);
1591                 return;
1592         }
1593         dio->range.n_sectors = bio_sectors(bio);
1594         journal_read_pos = NOT_FOUND;
1595         if (likely(ic->mode == 'J')) {
1596                 if (dio->write) {
1597                         unsigned next_entry, i, pos;
1598                         unsigned ws, we, range_sectors;
1599
1600                         dio->range.n_sectors = min(dio->range.n_sectors,
1601                                                    ic->free_sectors << ic->sb->log2_sectors_per_block);
1602                         if (unlikely(!dio->range.n_sectors))
1603                                 goto sleep;
1604                         range_sectors = dio->range.n_sectors >> ic->sb->log2_sectors_per_block;
1605                         ic->free_sectors -= range_sectors;
1606                         journal_section = ic->free_section;
1607                         journal_entry = ic->free_section_entry;
1608
1609                         next_entry = ic->free_section_entry + range_sectors;
1610                         ic->free_section_entry = next_entry % ic->journal_section_entries;
1611                         ic->free_section += next_entry / ic->journal_section_entries;
1612                         ic->n_uncommitted_sections += next_entry / ic->journal_section_entries;
1613                         wraparound_section(ic, &ic->free_section);
1614
1615                         pos = journal_section * ic->journal_section_entries + journal_entry;
1616                         ws = journal_section;
1617                         we = journal_entry;
1618                         i = 0;
1619                         do {
1620                                 struct journal_entry *je;
1621
1622                                 add_journal_node(ic, &ic->journal_tree[pos], dio->range.logical_sector + i);
1623                                 pos++;
1624                                 if (unlikely(pos >= ic->journal_entries))
1625                                         pos = 0;
1626
1627                                 je = access_journal_entry(ic, ws, we);
1628                                 BUG_ON(!journal_entry_is_unused(je));
1629                                 journal_entry_set_inprogress(je);
1630                                 we++;
1631                                 if (unlikely(we == ic->journal_section_entries)) {
1632                                         we = 0;
1633                                         ws++;
1634                                         wraparound_section(ic, &ws);
1635                                 }
1636                         } while ((i += ic->sectors_per_block) < dio->range.n_sectors);
1637
1638                         spin_unlock_irq(&ic->endio_wait.lock);
1639                         goto journal_read_write;
1640                 } else {
1641                         sector_t next_sector;
1642                         journal_read_pos = find_journal_node(ic, dio->range.logical_sector, &next_sector);
1643                         if (likely(journal_read_pos == NOT_FOUND)) {
1644                                 if (unlikely(dio->range.n_sectors > next_sector - dio->range.logical_sector))
1645                                         dio->range.n_sectors = next_sector - dio->range.logical_sector;
1646                         } else {
1647                                 unsigned i;
1648                                 unsigned jp = journal_read_pos + 1;
1649                                 for (i = ic->sectors_per_block; i < dio->range.n_sectors; i += ic->sectors_per_block, jp++) {
1650                                         if (!test_journal_node(ic, jp, dio->range.logical_sector + i))
1651                                                 break;
1652                                 }
1653                                 dio->range.n_sectors = i;
1654                         }
1655                 }
1656         }
1657         if (unlikely(!add_new_range(ic, &dio->range))) {
1658                 /*
1659                  * We must not sleep in the request routine because it could
1660                  * stall bios on current->bio_list.
1661                  * So, we offload the bio to a workqueue if we have to sleep.
1662                  */
1663 sleep:
1664                 if (from_map) {
1665                         spin_unlock_irq(&ic->endio_wait.lock);
1666                         INIT_WORK(&dio->work, integrity_bio_wait);
1667                         queue_work(ic->wait_wq, &dio->work);
1668                         return;
1669                 } else {
1670                         sleep_on_endio_wait(ic);
1671                         goto retry;
1672                 }
1673         }
1674         spin_unlock_irq(&ic->endio_wait.lock);
1675
1676         if (unlikely(journal_read_pos != NOT_FOUND)) {
1677                 journal_section = journal_read_pos / ic->journal_section_entries;
1678                 journal_entry = journal_read_pos % ic->journal_section_entries;
1679                 goto journal_read_write;
1680         }
1681
1682         dio->in_flight = (atomic_t)ATOMIC_INIT(2);
1683
1684         if (need_sync_io) {
1685                 init_completion(&read_comp);
1686                 dio->completion = &read_comp;
1687         } else
1688                 dio->completion = NULL;
1689
1690         dio->orig_bi_iter = bio->bi_iter;
1691
1692         dio->orig_bi_disk = bio->bi_disk;
1693         dio->orig_bi_partno = bio->bi_partno;
1694         bio_set_dev(bio, ic->dev->bdev);
1695
1696         dio->orig_bi_integrity = bio_integrity(bio);
1697         bio->bi_integrity = NULL;
1698         bio->bi_opf &= ~REQ_INTEGRITY;
1699
1700         dio->orig_bi_end_io = bio->bi_end_io;
1701         bio->bi_end_io = integrity_end_io;
1702
1703         bio->bi_iter.bi_size = dio->range.n_sectors << SECTOR_SHIFT;
1704         bio->bi_iter.bi_sector += ic->start;
1705         generic_make_request(bio);
1706
1707         if (need_sync_io) {
1708                 wait_for_completion_io(&read_comp);
1709                 if (likely(!bio->bi_status))
1710                         integrity_metadata(&dio->work);
1711                 else
1712                         dec_in_flight(dio);
1713
1714         } else {
1715                 INIT_WORK(&dio->work, integrity_metadata);
1716                 queue_work(ic->metadata_wq, &dio->work);
1717         }
1718
1719         return;
1720
1721 journal_read_write:
1722         if (unlikely(__journal_read_write(dio, bio, journal_section, journal_entry)))
1723                 goto lock_retry;
1724
1725         do_endio_flush(ic, dio);
1726 }
1727
1728
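/*
 * Workqueue handler used to resume bio processing in a context where sleeping
 * is allowed.
 */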
1729 static void integrity_bio_wait(struct work_struct *w)
1730 {
1731         struct dm_integrity_io *dio = container_of(w, struct dm_integrity_io, work);
1732
1733         dm_integrity_map_continue(dio, false);
1734 }
1735
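/*
 * If the current free section is only partially used, skip its remaining
 * entries (accounting them as no longer free) so that a commit always covers
 * whole journal sections.
 */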
1736 static void pad_uncommitted(struct dm_integrity_c *ic)
1737 {
1738         if (ic->free_section_entry) {
1739                 ic->free_sectors -= ic->journal_section_entries - ic->free_section_entry;
1740                 ic->free_section_entry = 0;
1741                 ic->free_section++;
1742                 wraparound_section(ic, &ic->free_section);
1743                 ic->n_uncommitted_sections++;
1744         }
1745         WARN_ON(ic->journal_sections * ic->journal_section_entries !=
1746                 (ic->n_uncommitted_sections + ic->n_committed_sections) * ic->journal_section_entries + ic->free_sectors);
1747 }
1748
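/*
 * Commit work: wait for in-progress journal entries to be filled in, stamp
 * every sector of the uncommitted sections with the current commit id, write
 * those sections to the on-disk journal and complete any queued flush bios.
 * The journal writer is kicked if free space is below the watermark.
 */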
1749 static void integrity_commit(struct work_struct *w)
1750 {
1751         struct dm_integrity_c *ic = container_of(w, struct dm_integrity_c, commit_work);
1752         unsigned commit_start, commit_sections;
1753         unsigned i, j, n;
1754         struct bio *flushes;
1755
1756         del_timer(&ic->autocommit_timer);
1757
1758         spin_lock_irq(&ic->endio_wait.lock);
1759         flushes = bio_list_get(&ic->flush_bio_list);
1760         if (unlikely(ic->mode != 'J')) {
1761                 spin_unlock_irq(&ic->endio_wait.lock);
1762                 dm_integrity_flush_buffers(ic);
1763                 goto release_flush_bios;
1764         }
1765
1766         pad_uncommitted(ic);
1767         commit_start = ic->uncommitted_section;
1768         commit_sections = ic->n_uncommitted_sections;
1769         spin_unlock_irq(&ic->endio_wait.lock);
1770
1771         if (!commit_sections)
1772                 goto release_flush_bios;
1773
1774         i = commit_start;
1775         for (n = 0; n < commit_sections; n++) {
1776                 for (j = 0; j < ic->journal_section_entries; j++) {
1777                         struct journal_entry *je;
1778                         je = access_journal_entry(ic, i, j);
1779                         io_wait_event(ic->copy_to_journal_wait, !journal_entry_is_inprogress(je));
1780                 }
1781                 for (j = 0; j < ic->journal_section_sectors; j++) {
1782                         struct journal_sector *js;
1783                         js = access_journal(ic, i, j);
1784                         js->commit_id = dm_integrity_commit_id(ic, i, j, ic->commit_seq);
1785                 }
1786                 i++;
1787                 if (unlikely(i >= ic->journal_sections))
1788                         ic->commit_seq = next_commit_seq(ic->commit_seq);
1789                 wraparound_section(ic, &i);
1790         }
1791         smp_rmb();
1792
1793         write_journal(ic, commit_start, commit_sections);
1794
1795         spin_lock_irq(&ic->endio_wait.lock);
1796         ic->uncommitted_section += commit_sections;
1797         wraparound_section(ic, &ic->uncommitted_section);
1798         ic->n_uncommitted_sections -= commit_sections;
1799         ic->n_committed_sections += commit_sections;
1800         spin_unlock_irq(&ic->endio_wait.lock);
1801
1802         if (ACCESS_ONCE(ic->free_sectors) <= ic->free_sectors_threshold)
1803                 queue_work(ic->writer_wq, &ic->writer_work);
1804
1805 release_flush_bios:
1806         while (flushes) {
1807                 struct bio *next = flushes->bi_next;
1808                 flushes->bi_next = NULL;
1809                 do_endio(ic, flushes);
1810                 flushes = next;
1811         }
1812 }
1813
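/*
 * dm-io completion callback for one chunk copied from the journal to its
 * final location: release the locked range, free the I/O descriptor and drop
 * a reference on the journal write operation.
 */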
1814 static void complete_copy_from_journal(unsigned long error, void *context)
1815 {
1816         struct journal_io *io = context;
1817         struct journal_completion *comp = io->comp;
1818         struct dm_integrity_c *ic = comp->ic;
1819         remove_range(ic, &io->range);
1820         mempool_free(io, ic->journal_io_mempool);
1821         if (unlikely(error != 0))
1822                 dm_integrity_io_error(ic, "copying from journal", -EIO);
1823         complete_journal_op(comp);
1824 }
1825
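/*
 * The last bytes of each data sector in the journal are overlaid by the
 * commit id on disk; put the original data bytes (saved in the journal entry)
 * back before the sectors are copied to their final location.
 */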
1826 static void restore_last_bytes(struct dm_integrity_c *ic, struct journal_sector *js,
1827                                struct journal_entry *je)
1828 {
1829         unsigned s = 0;
1830         do {
1831                 js->commit_id = je->last_bytes[s];
1832                 js++;
1833         } while (++s < ic->sectors_per_block);
1834 }
1835
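/*
 * Copy 'write_sections' committed journal sections, starting at 'write_start',
 * to their final location and write the corresponding tags to the metadata
 * area.  Runs of consecutive sectors are merged into a single copy operation.
 * Used both by the journal writer and by journal replay after an unclean
 * shutdown (from_replay).
 */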
1836 static void do_journal_write(struct dm_integrity_c *ic, unsigned write_start,
1837                              unsigned write_sections, bool from_replay)
1838 {
1839         unsigned i, j, n;
1840         struct journal_completion comp;
1841         struct blk_plug plug;
1842
1843         blk_start_plug(&plug);
1844
1845         comp.ic = ic;
1846         comp.in_flight = (atomic_t)ATOMIC_INIT(1);
1847         init_completion(&comp.comp);
1848
1849         i = write_start;
1850         for (n = 0; n < write_sections; n++, i++, wraparound_section(ic, &i)) {
1851 #ifndef INTERNAL_VERIFY
1852                 if (unlikely(from_replay))
1853 #endif
1854                         rw_section_mac(ic, i, false);
1855                 for (j = 0; j < ic->journal_section_entries; j++) {
1856                         struct journal_entry *je = access_journal_entry(ic, i, j);
1857                         sector_t sec, area, offset;
1858                         unsigned k, l, next_loop;
1859                         sector_t metadata_block;
1860                         unsigned metadata_offset;
1861                         struct journal_io *io;
1862
1863                         if (journal_entry_is_unused(je))
1864                                 continue;
1865                         BUG_ON(unlikely(journal_entry_is_inprogress(je)) && !from_replay);
1866                         sec = journal_entry_get_sector(je);
1867                         if (unlikely(from_replay)) {
1868                                 if (unlikely(sec & (unsigned)(ic->sectors_per_block - 1))) {
1869                                         dm_integrity_io_error(ic, "invalid sector in journal", -EIO);
1870                                         sec &= ~(sector_t)(ic->sectors_per_block - 1);
1871                                 }
1872                         }
1873                         get_area_and_offset(ic, sec, &area, &offset);
1874                         restore_last_bytes(ic, access_journal_data(ic, i, j), je);
1875                         for (k = j + 1; k < ic->journal_section_entries; k++) {
1876                                 struct journal_entry *je2 = access_journal_entry(ic, i, k);
1877                                 sector_t sec2, area2, offset2;
1878                                 if (journal_entry_is_unused(je2))
1879                                         break;
1880                                 BUG_ON(unlikely(journal_entry_is_inprogress(je2)) && !from_replay);
1881                                 sec2 = journal_entry_get_sector(je2);
1882                                 get_area_and_offset(ic, sec2, &area2, &offset2);
1883                                 if (area2 != area || offset2 != offset + ((k - j) << ic->sb->log2_sectors_per_block))
1884                                         break;
1885                                 restore_last_bytes(ic, access_journal_data(ic, i, k), je2);
1886                         }
1887                         next_loop = k - 1;
1888
1889                         io = mempool_alloc(ic->journal_io_mempool, GFP_NOIO);
1890                         io->comp = &comp;
1891                         io->range.logical_sector = sec;
1892                         io->range.n_sectors = (k - j) << ic->sb->log2_sectors_per_block;
1893
1894                         spin_lock_irq(&ic->endio_wait.lock);
1895                         while (unlikely(!add_new_range(ic, &io->range)))
1896                                 sleep_on_endio_wait(ic);
1897
1898                         if (likely(!from_replay)) {
1899                                 struct journal_node *section_node = &ic->journal_tree[i * ic->journal_section_entries];
1900
1901                                 /* don't write if there is a newer committed sector */
1902                                 while (j < k && find_newer_committed_node(ic, &section_node[j])) {
1903                                         struct journal_entry *je2 = access_journal_entry(ic, i, j);
1904
1905                                         journal_entry_set_unused(je2);
1906                                         remove_journal_node(ic, &section_node[j]);
1907                                         j++;
1908                                         sec += ic->sectors_per_block;
1909                                         offset += ic->sectors_per_block;
1910                                 }
1911                                 while (j < k && find_newer_committed_node(ic, &section_node[k - 1])) {
1912                                         struct journal_entry *je2 = access_journal_entry(ic, i, k - 1);
1913
1914                                         journal_entry_set_unused(je2);
1915                                         remove_journal_node(ic, &section_node[k - 1]);
1916                                         k--;
1917                                 }
1918                                 if (j == k) {
1919                                         remove_range_unlocked(ic, &io->range);
1920                                         spin_unlock_irq(&ic->endio_wait.lock);
1921                                         mempool_free(io, ic->journal_io_mempool);
1922                                         goto skip_io;
1923                                 }
1924                                 for (l = j; l < k; l++) {
1925                                         remove_journal_node(ic, &section_node[l]);
1926                                 }
1927                         }
1928                         spin_unlock_irq(&ic->endio_wait.lock);
1929
1930                         metadata_block = get_metadata_sector_and_offset(ic, area, offset, &metadata_offset);
1931                         for (l = j; l < k; l++) {
1932                                 int r;
1933                                 struct journal_entry *je2 = access_journal_entry(ic, i, l);
1934
1935                                 if (
1936 #ifndef INTERNAL_VERIFY
1937                                     unlikely(from_replay) &&
1938 #endif
1939                                     ic->internal_hash) {
1940                                         char test_tag[max(crypto_shash_digestsize(ic->internal_hash), ic->tag_size)];
1941
1942                                         integrity_sector_checksum(ic, sec + ((l - j) << ic->sb->log2_sectors_per_block),
1943                                                                   (char *)access_journal_data(ic, i, l), test_tag);
1944                                         if (unlikely(memcmp(test_tag, journal_entry_tag(ic, je2), ic->tag_size)))
1945                                                 dm_integrity_io_error(ic, "tag mismatch when replaying journal", -EILSEQ);
1946                                 }
1947
1948                                 journal_entry_set_unused(je2);
1949                                 r = dm_integrity_rw_tag(ic, journal_entry_tag(ic, je2), &metadata_block, &metadata_offset,
1950                                                         ic->tag_size, TAG_WRITE);
1951                                 if (unlikely(r)) {
1952                                         dm_integrity_io_error(ic, "reading tags", r);
1953                                 }
1954                         }
1955
1956                         atomic_inc(&comp.in_flight);
1957                         copy_from_journal(ic, i, j << ic->sb->log2_sectors_per_block,
1958                                           (k - j) << ic->sb->log2_sectors_per_block,
1959                                           get_data_sector(ic, area, offset),
1960                                           complete_copy_from_journal, io);
1961 skip_io:
1962                         j = next_loop;
1963                 }
1964         }
1965
1966         dm_bufio_write_dirty_buffers_async(ic->bufio);
1967
1968         blk_finish_plug(&plug);
1969
1970         complete_journal_op(&comp);
1971         wait_for_completion_io(&comp.comp);
1972
1973         dm_integrity_flush_buffers(ic);
1974 }
1975
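/*
 * Writer work: move committed journal sections to their final location and
 * return the freed entries to the free pool, waking up writers that were
 * waiting for journal space.
 */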
1976 static void integrity_writer(struct work_struct *w)
1977 {
1978         struct dm_integrity_c *ic = container_of(w, struct dm_integrity_c, writer_work);
1979         unsigned write_start, write_sections;
1980
1981         unsigned prev_free_sectors;
1982
1983         /* not strictly needed, but bailing out while suspending leaves committed data in the journal and exercises the replay code */
1984         if (ACCESS_ONCE(ic->suspending))
1985                 return;
1986
1987         spin_lock_irq(&ic->endio_wait.lock);
1988         write_start = ic->committed_section;
1989         write_sections = ic->n_committed_sections;
1990         spin_unlock_irq(&ic->endio_wait.lock);
1991
1992         if (!write_sections)
1993                 return;
1994
1995         do_journal_write(ic, write_start, write_sections, false);
1996
1997         spin_lock_irq(&ic->endio_wait.lock);
1998
1999         ic->committed_section += write_sections;
2000         wraparound_section(ic, &ic->committed_section);
2001         ic->n_committed_sections -= write_sections;
2002
2003         prev_free_sectors = ic->free_sectors;
2004         ic->free_sectors += write_sections * ic->journal_section_entries;
2005         if (unlikely(!prev_free_sectors))
2006                 wake_up_locked(&ic->endio_wait);
2007
2008         spin_unlock_irq(&ic->endio_wait.lock);
2009 }
2010
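/*
 * Erase n_sections journal sections starting at start_section: zero the data,
 * stamp each sector with a commit id derived from commit_seq, mark all
 * entries unused and write the result to the on-disk journal.
 */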
2011 static void init_journal(struct dm_integrity_c *ic, unsigned start_section,
2012                          unsigned n_sections, unsigned char commit_seq)
2013 {
2014         unsigned i, j, n;
2015
2016         if (!n_sections)
2017                 return;
2018
2019         for (n = 0; n < n_sections; n++) {
2020                 i = start_section + n;
2021                 wraparound_section(ic, &i);
2022                 for (j = 0; j < ic->journal_section_sectors; j++) {
2023                         struct journal_sector *js = access_journal(ic, i, j);
2024                         memset(&js->entries, 0, JOURNAL_SECTOR_DATA);
2025                         js->commit_id = dm_integrity_commit_id(ic, i, j, commit_seq);
2026                 }
2027                 for (j = 0; j < ic->journal_section_entries; j++) {
2028                         struct journal_entry *je = access_journal_entry(ic, i, j);
2029                         journal_entry_set_unused(je);
2030                 }
2031         }
2032
2033         write_journal(ic, start_section, n_sections);
2034 }
2035
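/*
 * Map an on-disk commit id at position (i, j) back to its commit sequence
 * number; report an integrity error if it matches none of the N_COMMIT_IDS
 * possible values.
 */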
2036 static int find_commit_seq(struct dm_integrity_c *ic, unsigned i, unsigned j, commit_id_t id)
2037 {
2038         unsigned char k;
2039         for (k = 0; k < N_COMMIT_IDS; k++) {
2040                 if (dm_integrity_commit_id(ic, i, j, k) == id)
2041                         return k;
2042         }
2043         dm_integrity_io_error(ic, "journal commit id", -EIO);
2044         return -EIO;
2045 }
2046
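/*
 * Called on activation: read (and, if configured, decrypt) the on-disk
 * journal, work out which commit sequence was last being written, replay the
 * consistently committed sections and reinitialize the journal if it cannot
 * be trusted.  Finally reset the in-memory journal state so that new writes
 * start from a clean journal.
 */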
2047 static void replay_journal(struct dm_integrity_c *ic)
2048 {
2049         unsigned i, j;
2050         bool used_commit_ids[N_COMMIT_IDS];
2051         unsigned max_commit_id_sections[N_COMMIT_IDS];
2052         unsigned write_start, write_sections;
2053         unsigned continue_section;
2054         bool journal_empty;
2055         unsigned char unused, last_used, want_commit_seq;
2056
2057         if (ic->mode == 'R')
2058                 return;
2059
2060         if (ic->journal_uptodate)
2061                 return;
2062
2063         last_used = 0;
2064         write_start = 0;
2065
2066         if (!ic->just_formatted) {
2067                 DEBUG_print("reading journal\n");
2068                 rw_journal(ic, REQ_OP_READ, 0, 0, ic->journal_sections, NULL);
2069                 if (ic->journal_io)
2070                         DEBUG_bytes(lowmem_page_address(ic->journal_io[0].page), 64, "read journal");
2071                 if (ic->journal_io) {
2072                         struct journal_completion crypt_comp;
2073                         crypt_comp.ic = ic;
2074                         init_completion(&crypt_comp.comp);
2075                         crypt_comp.in_flight = (atomic_t)ATOMIC_INIT(0);
2076                         encrypt_journal(ic, false, 0, ic->journal_sections, &crypt_comp);
2077                         wait_for_completion(&crypt_comp.comp);
2078                 }
2079                 DEBUG_bytes(lowmem_page_address(ic->journal[0].page), 64, "decrypted journal");
2080         }
2081
2082         if (dm_integrity_failed(ic))
2083                 goto clear_journal;
2084
2085         journal_empty = true;
2086         memset(used_commit_ids, 0, sizeof used_commit_ids);
2087         memset(max_commit_id_sections, 0, sizeof max_commit_id_sections);
2088         for (i = 0; i < ic->journal_sections; i++) {
2089                 for (j = 0; j < ic->journal_section_sectors; j++) {
2090                         int k;
2091                         struct journal_sector *js = access_journal(ic, i, j);
2092                         k = find_commit_seq(ic, i, j, js->commit_id);
2093                         if (k < 0)
2094                                 goto clear_journal;
2095                         used_commit_ids[k] = true;
2096                         max_commit_id_sections[k] = i;
2097                 }
2098                 if (journal_empty) {
2099                         for (j = 0; j < ic->journal_section_entries; j++) {
2100                                 struct journal_entry *je = access_journal_entry(ic, i, j);
2101                                 if (!journal_entry_is_unused(je)) {
2102                                         journal_empty = false;
2103                                         break;
2104                                 }
2105                         }
2106                 }
2107         }
2108
2109         if (!used_commit_ids[N_COMMIT_IDS - 1]) {
2110                 unused = N_COMMIT_IDS - 1;
2111                 while (unused && !used_commit_ids[unused - 1])
2112                         unused--;
2113         } else {
2114                 for (unused = 0; unused < N_COMMIT_IDS; unused++)
2115                         if (!used_commit_ids[unused])
2116                                 break;
2117                 if (unused == N_COMMIT_IDS) {
2118                         dm_integrity_io_error(ic, "journal commit ids", -EIO);
2119                         goto clear_journal;
2120                 }
2121         }
2122         DEBUG_print("first unused commit seq %d [%d,%d,%d,%d]\n",
2123                     unused, used_commit_ids[0], used_commit_ids[1],
2124                     used_commit_ids[2], used_commit_ids[3]);
2125
2126         last_used = prev_commit_seq(unused);
2127         want_commit_seq = prev_commit_seq(last_used);
2128
2129         if (!used_commit_ids[want_commit_seq] && used_commit_ids[prev_commit_seq(want_commit_seq)])
2130                 journal_empty = true;
2131
2132         write_start = max_commit_id_sections[last_used] + 1;
2133         if (unlikely(write_start >= ic->journal_sections))
2134                 want_commit_seq = next_commit_seq(want_commit_seq);
2135         wraparound_section(ic, &write_start);
2136
2137         i = write_start;
2138         for (write_sections = 0; write_sections < ic->journal_sections; write_sections++) {
2139                 for (j = 0; j < ic->journal_section_sectors; j++) {
2140                         struct journal_sector *js = access_journal(ic, i, j);
2141
2142                         if (js->commit_id != dm_integrity_commit_id(ic, i, j, want_commit_seq)) {
2143                                 /*
2144                                  * This could be caused by a crash during writing.
2145                                  * We won't replay the inconsistent part of the
2146                                  * journal.
2147                                  */
2148                                 DEBUG_print("commit id mismatch at position (%u, %u): %d != %d\n",
2149                                             i, j, find_commit_seq(ic, i, j, js->commit_id), want_commit_seq);
2150                                 goto brk;
2151                         }
2152                 }
2153                 i++;
2154                 if (unlikely(i >= ic->journal_sections))
2155                         want_commit_seq = next_commit_seq(want_commit_seq);
2156                 wraparound_section(ic, &i);
2157         }
2158 brk:
2159
2160         if (!journal_empty) {
2161                 DEBUG_print("replaying %u sections, starting at %u, commit seq %d\n",
2162                             write_sections, write_start, want_commit_seq);
2163                 do_journal_write(ic, write_start, write_sections, true);
2164         }
2165
2166         if (write_sections == ic->journal_sections && (ic->mode == 'J' || journal_empty)) {
2167                 continue_section = write_start;
2168                 ic->commit_seq = want_commit_seq;
2169                 DEBUG_print("continuing from section %u, commit seq %d\n", write_start, ic->commit_seq);
2170         } else {
2171                 unsigned s;
2172                 unsigned char erase_seq;
2173 clear_journal:
2174                 DEBUG_print("clearing journal\n");
2175
2176                 erase_seq = prev_commit_seq(prev_commit_seq(last_used));
2177                 s = write_start;
2178                 init_journal(ic, s, 1, erase_seq);
2179                 s++;
2180                 wraparound_section(ic, &s);
2181                 if (ic->journal_sections >= 2) {
2182                         init_journal(ic, s, ic->journal_sections - 2, erase_seq);
2183                         s += ic->journal_sections - 2;
2184                         wraparound_section(ic, &s);
2185                         init_journal(ic, s, 1, erase_seq);
2186                 }
2187
2188                 continue_section = 0;
2189                 ic->commit_seq = next_commit_seq(erase_seq);
2190         }
2191
2192         ic->committed_section = continue_section;
2193         ic->n_committed_sections = 0;
2194
2195         ic->uncommitted_section = continue_section;
2196         ic->n_uncommitted_sections = 0;
2197
2198         ic->free_section = continue_section;
2199         ic->free_section_entry = 0;
2200         ic->free_sectors = ic->journal_entries;
2201
2202         ic->journal_tree_root = RB_ROOT;
2203         for (i = 0; i < ic->journal_entries; i++)
2204                 init_journal_node(&ic->journal_tree[i]);
2205 }
2206
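/*
 * Suspend: stop the autocommit timer, force a final commit and, in journal
 * mode, drain the writer and flush the metadata buffers.  The in-memory
 * journal stays valid, so a plain resume does not need to replay it.
 */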
2207 static void dm_integrity_postsuspend(struct dm_target *ti)
2208 {
2209         struct dm_integrity_c *ic = (struct dm_integrity_c *)ti->private;
2210
2211         del_timer_sync(&ic->autocommit_timer);
2212
2213         WRITE_ONCE(ic->suspending, 1);
2214
2215         queue_work(ic->commit_wq, &ic->commit_work);
2216         drain_workqueue(ic->commit_wq);
2217
2218         if (ic->mode == 'J') {
2219                 drain_workqueue(ic->writer_wq);
2220                 dm_integrity_flush_buffers(ic);
2221         }
2222
2223         WRITE_ONCE(ic->suspending, 0);
2224
2225         BUG_ON(!RB_EMPTY_ROOT(&ic->in_progress));
2226
2227         ic->journal_uptodate = true;
2228 }
2229
2230 static void dm_integrity_resume(struct dm_target *ti)
2231 {
2232         struct dm_integrity_c *ic = (struct dm_integrity_c *)ti->private;
2233
2234         replay_journal(ic);
2235 }
2236
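/*
 * Report the number of detected mismatches for STATUSTYPE_INFO and the full
 * constructor table (including optional arguments) for STATUSTYPE_TABLE.
 */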
2237 static void dm_integrity_status(struct dm_target *ti, status_type_t type,
2238                                 unsigned status_flags, char *result, unsigned maxlen)
2239 {
2240         struct dm_integrity_c *ic = (struct dm_integrity_c *)ti->private;
2241         unsigned arg_count;
2242         size_t sz = 0;
2243
2244         switch (type) {
2245         case STATUSTYPE_INFO:
2246                 DMEMIT("%llu", (unsigned long long)atomic64_read(&ic->number_of_mismatches));
2247                 break;
2248
2249         case STATUSTYPE_TABLE: {
2250                 __u64 watermark_percentage = (__u64)(ic->journal_entries - ic->free_sectors_threshold) * 100;
2251                 watermark_percentage += ic->journal_entries / 2;
2252                 do_div(watermark_percentage, ic->journal_entries);
2253                 arg_count = 5;
2254                 arg_count += ic->sectors_per_block != 1;
2255                 arg_count += !!ic->internal_hash_alg.alg_string;
2256                 arg_count += !!ic->journal_crypt_alg.alg_string;
2257                 arg_count += !!ic->journal_mac_alg.alg_string;
2258                 DMEMIT("%s %llu %u %c %u", ic->dev->name, (unsigned long long)ic->start,
2259                        ic->tag_size, ic->mode, arg_count);
2260                 DMEMIT(" journal_sectors:%u", ic->initial_sectors - SB_SECTORS);
2261                 DMEMIT(" interleave_sectors:%u", 1U << ic->sb->log2_interleave_sectors);
2262                 DMEMIT(" buffer_sectors:%u", 1U << ic->log2_buffer_sectors);
2263                 DMEMIT(" journal_watermark:%u", (unsigned)watermark_percentage);
2264                 DMEMIT(" commit_time:%u", ic->autocommit_msec);
2265                 if (ic->sectors_per_block != 1)
2266                         DMEMIT(" block_size:%u", ic->sectors_per_block << SECTOR_SHIFT);
2267
2268 #define EMIT_ALG(a, n)                                                  \
2269                 do {                                                    \
2270                         if (ic->a.alg_string) {                         \
2271                                 DMEMIT(" %s:%s", n, ic->a.alg_string);  \
2272                                 if (ic->a.key_string)                   \
2273                                         DMEMIT(":%s", ic->a.key_string);\
2274                         }                                               \
2275                 } while (0)
2276                 EMIT_ALG(internal_hash_alg, "internal_hash");
2277                 EMIT_ALG(journal_crypt_alg, "journal_crypt");
2278                 EMIT_ALG(journal_mac_alg, "journal_mac");
2279                 break;
2280         }
2281         }
2282 }
2283
2284 static int dm_integrity_iterate_devices(struct dm_target *ti,
2285                                         iterate_devices_callout_fn fn, void *data)
2286 {
2287         struct dm_integrity_c *ic = ti->private;
2288
2289         return fn(ti, ic->dev, ic->start + ic->initial_sectors + ic->metadata_run, ti->len, data);
2290 }
2291
2292 static void dm_integrity_io_hints(struct dm_target *ti, struct queue_limits *limits)
2293 {
2294         struct dm_integrity_c *ic = ti->private;
2295
2296         if (ic->sectors_per_block > 1) {
2297                 limits->logical_block_size = ic->sectors_per_block << SECTOR_SHIFT;
2298                 limits->physical_block_size = ic->sectors_per_block << SECTOR_SHIFT;
2299                 blk_limits_io_min(limits, ic->sectors_per_block << SECTOR_SHIFT);
2300         }
2301 }
2302
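/*
 * Derive the journal geometry from the superblock: how many journal entries
 * fit into one journal sector (after reserving room for the optional
 * per-sector MAC), how many entries and sectors make up a section, and how
 * many entries the whole journal holds.
 */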
2303 static void calculate_journal_section_size(struct dm_integrity_c *ic)
2304 {
2305         unsigned sector_space = JOURNAL_SECTOR_DATA;
2306
2307         ic->journal_sections = le32_to_cpu(ic->sb->journal_sections);
2308         ic->journal_entry_size = roundup(offsetof(struct journal_entry, last_bytes[ic->sectors_per_block]) + ic->tag_size,
2309                                          JOURNAL_ENTRY_ROUNDUP);
2310
2311         if (ic->sb->flags & cpu_to_le32(SB_FLAG_HAVE_JOURNAL_MAC))
2312                 sector_space -= JOURNAL_MAC_PER_SECTOR;
2313         ic->journal_entries_per_sector = sector_space / ic->journal_entry_size;
2314         ic->journal_section_entries = ic->journal_entries_per_sector * JOURNAL_BLOCK_SECTORS;
2315         ic->journal_section_sectors = (ic->journal_section_entries << ic->sb->log2_sectors_per_block) + JOURNAL_BLOCK_SECTORS;
2316         ic->journal_entries = ic->journal_section_entries * ic->journal_sections;
2317 }
2318
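/*
 * Compute the on-disk layout (superblock, journal, interleaved metadata and
 * data areas) and verify that provided_data_sectors together with its
 * metadata still fits on the underlying device.
 */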
2319 static int calculate_device_limits(struct dm_integrity_c *ic)
2320 {
2321         __u64 initial_sectors;
2322         sector_t last_sector, last_area, last_offset;
2323
2324         calculate_journal_section_size(ic);
2325         initial_sectors = SB_SECTORS + (__u64)ic->journal_section_sectors * ic->journal_sections;
2326         if (initial_sectors + METADATA_PADDING_SECTORS >= ic->device_sectors || initial_sectors > UINT_MAX)
2327                 return -EINVAL;
2328         ic->initial_sectors = initial_sectors;
2329
2330         ic->metadata_run = roundup((__u64)ic->tag_size << (ic->sb->log2_interleave_sectors - ic->sb->log2_sectors_per_block),
2331                                    (__u64)(1 << SECTOR_SHIFT << METADATA_PADDING_SECTORS)) >> SECTOR_SHIFT;
2332         if (!(ic->metadata_run & (ic->metadata_run - 1)))
2333                 ic->log2_metadata_run = __ffs(ic->metadata_run);
2334         else
2335                 ic->log2_metadata_run = -1;
2336
2337         get_area_and_offset(ic, ic->provided_data_sectors - 1, &last_area, &last_offset);
2338         last_sector = get_data_sector(ic, last_area, last_offset);
2339
2340         if (ic->start + last_sector < last_sector || ic->start + last_sector >= ic->device_sectors)
2341                 return -EINVAL;
2342
2343         return 0;
2344 }
2345
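/*
 * Format a fresh superblock and find the largest provided_data_sectors that
 * fits on the device by trying to set each bit from the top down and keeping
 * it only if calculate_device_limits() still succeeds.
 */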
2346 static int initialize_superblock(struct dm_integrity_c *ic, unsigned journal_sectors, unsigned interleave_sectors)
2347 {
2348         unsigned journal_sections;
2349         int test_bit;
2350
2351         memset(ic->sb, 0, SB_SECTORS << SECTOR_SHIFT);
2352         memcpy(ic->sb->magic, SB_MAGIC, 8);
2353         ic->sb->version = SB_VERSION;
2354         ic->sb->integrity_tag_size = cpu_to_le16(ic->tag_size);
2355         ic->sb->log2_sectors_per_block = __ffs(ic->sectors_per_block);
2356         if (ic->journal_mac_alg.alg_string)
2357                 ic->sb->flags |= cpu_to_le32(SB_FLAG_HAVE_JOURNAL_MAC);
2358
2359         calculate_journal_section_size(ic);
2360         journal_sections = journal_sectors / ic->journal_section_sectors;
2361         if (!journal_sections)
2362                 journal_sections = 1;
2363         ic->sb->journal_sections = cpu_to_le32(journal_sections);
2364
2365         if (!interleave_sectors)
2366                 interleave_sectors = DEFAULT_INTERLEAVE_SECTORS;
2367         ic->sb->log2_interleave_sectors = __fls(interleave_sectors);
2368         ic->sb->log2_interleave_sectors = max((__u8)MIN_LOG2_INTERLEAVE_SECTORS, ic->sb->log2_interleave_sectors);
2369         ic->sb->log2_interleave_sectors = min((__u8)MAX_LOG2_INTERLEAVE_SECTORS, ic->sb->log2_interleave_sectors);
2370
2371         ic->provided_data_sectors = 0;
2372         for (test_bit = fls64(ic->device_sectors) - 1; test_bit >= 3; test_bit--) {
2373                 __u64 prev_data_sectors = ic->provided_data_sectors;
2374
2375                 ic->provided_data_sectors |= (sector_t)1 << test_bit;
2376                 if (calculate_device_limits(ic))
2377                         ic->provided_data_sectors = prev_data_sectors;
2378         }
2379
2380         if (!ic->provided_data_sectors)
2381                 return -EINVAL;
2382
2383         ic->sb->provided_data_sectors = cpu_to_le64(ic->provided_data_sectors);
2384
2385         return 0;
2386 }
2387
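/*
 * Register an integrity profile on the mapped device's gendisk so that upper
 * layers can attach integrity (tag) payloads to bios.
 */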
2388 static void dm_integrity_set(struct dm_target *ti, struct dm_integrity_c *ic)
2389 {
2390         struct gendisk *disk = dm_disk(dm_table_get_md(ti->table));
2391         struct blk_integrity bi;
2392
2393         memset(&bi, 0, sizeof(bi));
2394         bi.profile = &dm_integrity_profile;
2395         bi.tuple_size = ic->tag_size;
2396         bi.tag_size = bi.tuple_size;
2397         bi.interval_exp = ic->sb->log2_sectors_per_block + SECTOR_SHIFT;
2398
2399         blk_integrity_register(disk, &bi);
2400         blk_queue_max_integrity_segments(disk->queue, UINT_MAX);
2401 }
2402
2403 static void dm_integrity_free_page_list(struct dm_integrity_c *ic, struct page_list *pl)
2404 {
2405         unsigned i;
2406
2407         if (!pl)
2408                 return;
2409         for (i = 0; i < ic->journal_pages; i++)
2410                 if (pl[i].page)
2411                         __free_page(pl[i].page);
2412         kvfree(pl);
2413 }
2414
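/*
 * Allocate the page_list array and one page per journal page, chaining the
 * entries together so the list can be handed to dm-io.
 */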
2415 static struct page_list *dm_integrity_alloc_page_list(struct dm_integrity_c *ic)
2416 {
2417         size_t page_list_desc_size = ic->journal_pages * sizeof(struct page_list);
2418         struct page_list *pl;
2419         unsigned i;
2420
2421         pl = kvmalloc(page_list_desc_size, GFP_KERNEL | __GFP_ZERO);
2422         if (!pl)
2423                 return NULL;
2424
2425         for (i = 0; i < ic->journal_pages; i++) {
2426                 pl[i].page = alloc_page(GFP_KERNEL);
2427                 if (!pl[i].page) {
2428                         dm_integrity_free_page_list(ic, pl);
2429                         return NULL;
2430                 }
2431                 if (i)
2432                         pl[i - 1].next = &pl[i];
2433         }
2434
2435         return pl;
2436 }
2437
2438 static void dm_integrity_free_journal_scatterlist(struct dm_integrity_c *ic, struct scatterlist **sl)
2439 {
2440         unsigned i;
2441         for (i = 0; i < ic->journal_sections; i++)
2442                 kvfree(sl[i]);
2443         kvfree(sl);
2444 }
2445
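/*
 * Build one scatterlist per journal section, describing where that section
 * lives in the given journal page list; used when the journal is encrypted
 * with the crypto API.
 */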
2446 static struct scatterlist **dm_integrity_alloc_journal_scatterlist(struct dm_integrity_c *ic, struct page_list *pl)
2447 {
2448         struct scatterlist **sl;
2449         unsigned i;
2450
2451         sl = kvmalloc(ic->journal_sections * sizeof(struct scatterlist *), GFP_KERNEL | __GFP_ZERO);
2452         if (!sl)
2453                 return NULL;
2454
2455         for (i = 0; i < ic->journal_sections; i++) {
2456                 struct scatterlist *s;
2457                 unsigned start_index, start_offset;
2458                 unsigned end_index, end_offset;
2459                 unsigned n_pages;
2460                 unsigned idx;
2461
2462                 page_list_location(ic, i, 0, &start_index, &start_offset);
2463                 page_list_location(ic, i, ic->journal_section_sectors - 1, &end_index, &end_offset);
2464
2465                 n_pages = (end_index - start_index + 1);
2466
2467                 s = kvmalloc(n_pages * sizeof(struct scatterlist), GFP_KERNEL);
2468                 if (!s) {
2469                         dm_integrity_free_journal_scatterlist(ic, sl);
2470                         return NULL;
2471                 }
2472
2473                 sg_init_table(s, n_pages);
2474                 for (idx = start_index; idx <= end_index; idx++) {
2475                         char *va = lowmem_page_address(pl[idx].page);
2476                         unsigned start = 0, end = PAGE_SIZE;
2477                         if (idx == start_index)
2478                                 start = start_offset;
2479                         if (idx == end_index)
2480                                 end = end_offset + (1 << SECTOR_SHIFT);
2481                         sg_set_buf(&s[idx - start_index], va + start, end - start);
2482                 }
2483
2484                 sl[i] = s;
2485         }
2486
2487         return sl;
2488 }
2489
2490 static void free_alg(struct alg_spec *a)
2491 {
2492         kzfree(a->alg_string);
2493         kzfree(a->key);
2494         memset(a, 0, sizeof *a);
2495 }
2496
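/*
 * Parse an option of the form "name:algorithm[:hexkey]": duplicate the
 * algorithm string and, if a key is present, decode it from hex into a->key.
 */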
2497 static int get_alg_and_key(const char *arg, struct alg_spec *a, char **error, char *error_inval)
2498 {
2499         char *k;
2500
2501         free_alg(a);
2502
2503         a->alg_string = kstrdup(strchr(arg, ':') + 1, GFP_KERNEL);
2504         if (!a->alg_string)
2505                 goto nomem;
2506
2507         k = strchr(a->alg_string, ':');
2508         if (k) {
2509                 *k = 0;
2510                 a->key_string = k + 1;
2511                 if (strlen(a->key_string) & 1)
2512                         goto inval;
2513
2514                 a->key_size = strlen(a->key_string) / 2;
2515                 a->key = kmalloc(a->key_size, GFP_KERNEL);
2516                 if (!a->key)
2517                         goto nomem;
2518                 if (hex2bin(a->key, a->key_string, a->key_size))
2519                         goto inval;
2520         }
2521
2522         return 0;
2523 inval:
2524         *error = error_inval;
2525         return -EINVAL;
2526 nomem:
2527         *error = "Out of memory for an argument";
2528         return -ENOMEM;
2529 }
2530
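/*
 * Allocate the synchronous hash named in the alg_spec and set its key if one
 * was supplied; keyed algorithms without a key are rejected.
 */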
2531 static int get_mac(struct crypto_shash **hash, struct alg_spec *a, char **error,
2532                    char *error_alg, char *error_key)
2533 {
2534         int r;
2535
2536         if (a->alg_string) {
2537                 *hash = crypto_alloc_shash(a->alg_string, 0, CRYPTO_ALG_ASYNC);
2538                 if (IS_ERR(*hash)) {
2539                         *error = error_alg;
2540                         r = PTR_ERR(*hash);
2541                         *hash = NULL;
2542                         return r;
2543                 }
2544
2545                 if (a->key) {
2546                         r = crypto_shash_setkey(*hash, a->key, a->key_size);
2547                         if (r) {
2548                                 *error = error_key;
2549                                 return r;
2550                         }
2551                 } else if (crypto_shash_get_flags(*hash) & CRYPTO_TFM_NEED_KEY) {
2552                         *error = error_key;
2553                         return -ENOKEY;
2554                 }
2555         }
2556
2557         return 0;
2558 }
2559
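/*
 * Allocate the in-memory journal and, when journal encryption is configured,
 * set up either a precomputed xor pad (for ciphers with a block size of one)
 * or the per-section scatterlists and IVs used to encrypt journal sections
 * in place.
 */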
2560 static int create_journal(struct dm_integrity_c *ic, char **error)
2561 {
2562         int r = 0;
2563         unsigned i;
2564         __u64 journal_pages, journal_desc_size, journal_tree_size;
2565         unsigned char *crypt_data = NULL, *crypt_iv = NULL;
2566         struct skcipher_request *req = NULL;
2567
2568         ic->commit_ids[0] = cpu_to_le64(0x1111111111111111ULL);
2569         ic->commit_ids[1] = cpu_to_le64(0x2222222222222222ULL);
2570         ic->commit_ids[2] = cpu_to_le64(0x3333333333333333ULL);
2571         ic->commit_ids[3] = cpu_to_le64(0x4444444444444444ULL);
2572
2573         journal_pages = roundup((__u64)ic->journal_sections * ic->journal_section_sectors,
2574                                 PAGE_SIZE >> SECTOR_SHIFT) >> (PAGE_SHIFT - SECTOR_SHIFT);
2575         journal_desc_size = journal_pages * sizeof(struct page_list);
2576         if (journal_pages >= totalram_pages - totalhigh_pages || journal_desc_size > ULONG_MAX) {
2577                 *error = "Journal doesn't fit into memory";
2578                 r = -ENOMEM;
2579                 goto bad;
2580         }
2581         ic->journal_pages = journal_pages;
2582
2583         ic->journal = dm_integrity_alloc_page_list(ic);
2584         if (!ic->journal) {
2585                 *error = "Could not allocate memory for journal";
2586                 r = -ENOMEM;
2587                 goto bad;
2588         }
2589         if (ic->journal_crypt_alg.alg_string) {
2590                 unsigned ivsize, blocksize;
2591                 struct journal_completion comp;
2592
2593                 comp.ic = ic;
2594                 ic->journal_crypt = crypto_alloc_skcipher(ic->journal_crypt_alg.alg_string, 0, 0);
2595                 if (IS_ERR(ic->journal_crypt)) {
2596                         *error = "Invalid journal cipher";
2597                         r = PTR_ERR(ic->journal_crypt);
2598                         ic->journal_crypt = NULL;
2599                         goto bad;
2600                 }
2601                 ivsize = crypto_skcipher_ivsize(ic->journal_crypt);
2602                 blocksize = crypto_skcipher_blocksize(ic->journal_crypt);
2603
2604                 if (ic->journal_crypt_alg.key) {
2605                         r = crypto_skcipher_setkey(ic->journal_crypt, ic->journal_crypt_alg.key,
2606                                                    ic->journal_crypt_alg.key_size);
2607                         if (r) {
2608                                 *error = "Error setting encryption key";
2609                                 goto bad;
2610                         }
2611                 }
2612                 DEBUG_print("cipher %s, block size %u iv size %u\n",
2613                             ic->journal_crypt_alg.alg_string, blocksize, ivsize);
2614
2615                 ic->journal_io = dm_integrity_alloc_page_list(ic);
2616                 if (!ic->journal_io) {
2617                         *error = "Could not allocate memory for journal io";
2618                         r = -ENOMEM;
2619                         goto bad;
2620                 }
2621
2622                 if (blocksize == 1) {
2623                         struct scatterlist *sg;
2624
2625                         req = skcipher_request_alloc(ic->journal_crypt, GFP_KERNEL);
2626                         if (!req) {
2627                                 *error = "Could not allocate crypt request";
2628                                 r = -ENOMEM;
2629                                 goto bad;
2630                         }
2631
2632                         crypt_iv = kmalloc(ivsize, GFP_KERNEL);
2633                         if (!crypt_iv) {
2634                                 *error = "Could not allocate iv";
2635                                 r = -ENOMEM;
2636                                 goto bad;
2637                         }
2638
2639                         ic->journal_xor = dm_integrity_alloc_page_list(ic);
2640                         if (!ic->journal_xor) {
2641                                 *error = "Could not allocate memory for journal xor";
2642                                 r = -ENOMEM;
2643                                 goto bad;
2644                         }
2645
2646                         sg = kvmalloc((ic->journal_pages + 1) * sizeof(struct scatterlist), GFP_KERNEL);
2647                         if (!sg) {
2648                                 *error = "Unable to allocate sg list";
2649                                 r = -ENOMEM;
2650                                 goto bad;
2651                         }
2652                         sg_init_table(sg, ic->journal_pages + 1);
2653                         for (i = 0; i < ic->journal_pages; i++) {
2654                                 char *va = lowmem_page_address(ic->journal_xor[i].page);
2655                                 clear_page(va);
2656                                 sg_set_buf(&sg[i], va, PAGE_SIZE);
2657                         }
2658                         sg_set_buf(&sg[i], &ic->commit_ids, sizeof ic->commit_ids);
2659                         memset(crypt_iv, 0x00, ivsize);
2660
2661                         skcipher_request_set_crypt(req, sg, sg, PAGE_SIZE * ic->journal_pages + sizeof ic->commit_ids, crypt_iv);
2662                         init_completion(&comp.comp);
2663                         comp.in_flight = (atomic_t)ATOMIC_INIT(1);
2664                         if (do_crypt(true, req, &comp))
2665                                 wait_for_completion(&comp.comp);
2666                         kvfree(sg);
2667                         r = dm_integrity_failed(ic);
2668                         if (r) {
2669                                 *error = "Unable to encrypt journal";
2670                                 goto bad;
2671                         }
2672                         DEBUG_bytes(lowmem_page_address(ic->journal_xor[0].page), 64, "xor data");
2673
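			/*
			 * The keystream has been generated; the cipher transform
			 * is no longer needed because journal data is protected
			 * by XOR-ing with journal_xor at I/O time.
			 */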
2674                         crypto_free_skcipher(ic->journal_crypt);
2675                         ic->journal_crypt = NULL;
2676                 } else {
2677                         unsigned crypt_len = roundup(ivsize, blocksize);
2678
2679                         req = skcipher_request_alloc(ic->journal_crypt, GFP_KERNEL);
2680                         if (!req) {
2681                                 *error = "Could not allocate crypt request";
2682                                 r = -ENOMEM;
2683                                 goto bad;
2684                         }
2685
2686                         crypt_iv = kmalloc(ivsize, GFP_KERNEL);
2687                         if (!crypt_iv) {
2688                                 *error = "Could not allocate iv";
2689                                 r = -ENOMEM;
2690                                 goto bad;
2691                         }
2692
2693                         crypt_data = kmalloc(crypt_len, GFP_KERNEL);
2694                         if (!crypt_data) {
2695                                 *error = "Unable to allocate crypt data";
2696                                 r = -ENOMEM;
2697                                 goto bad;
2698                         }
2699
2700                         ic->journal_scatterlist = dm_integrity_alloc_journal_scatterlist(ic, ic->journal);
2701                         if (!ic->journal_scatterlist) {
2702                                 *error = "Unable to allocate sg list";
2703                                 r = -ENOMEM;
2704                                 goto bad;
2705                         }
2706                         ic->journal_io_scatterlist = dm_integrity_alloc_journal_scatterlist(ic, ic->journal_io);
2707                         if (!ic->journal_io_scatterlist) {
2708                                 *error = "Unable to allocate sg list";
2709                                 r = -ENOMEM;
2710                                 goto bad;
2711                         }
2712                         ic->sk_requests = kvmalloc(ic->journal_sections * sizeof(struct skcipher_request *), GFP_KERNEL | __GFP_ZERO);
2713                         if (!ic->sk_requests) {
2714                                 *error = "Unable to allocate sk requests";
2715                                 r = -ENOMEM;
2716                                 goto bad;
2717                         }
2718                         for (i = 0; i < ic->journal_sections; i++) {
2719                                 struct scatterlist sg;
2720                                 struct skcipher_request *section_req;
2721                                 __u32 section_le = cpu_to_le32(i);
2722
2723                                 memset(crypt_iv, 0x00, ivsize);
2724                                 memset(crypt_data, 0x00, crypt_len);
2725                                 memcpy(crypt_data, &section_le, min((size_t)crypt_len, sizeof(section_le)));
2726
2727                                 sg_init_one(&sg, crypt_data, crypt_len);
2728                                 skcipher_request_set_crypt(req, &sg, &sg, crypt_len, crypt_iv);
2729                                 init_completion(&comp.comp);
2730                                 comp.in_flight = (atomic_t)ATOMIC_INIT(1);
2731                                 if (do_crypt(true, req, &comp))
2732                                         wait_for_completion(&comp.comp);
2733
2734                                 r = dm_integrity_failed(ic);
2735                                 if (r) {
2736                                         *error = "Unable to generate iv";
2737                                         goto bad;
2738                                 }
2739
2740                                 section_req = skcipher_request_alloc(ic->journal_crypt, GFP_KERNEL);
2741                                 if (!section_req) {
2742                                         *error = "Unable to allocate crypt request";
2743                                         r = -ENOMEM;
2744                                         goto bad;
2745                                 }
2746                                 section_req->iv = kmalloc(ivsize * 2, GFP_KERNEL);
2747                                 if (!section_req->iv) {
2748                                         skcipher_request_free(section_req);
2749                                         *error = "Unable to allocate iv";
2750                                         r = -ENOMEM;
2751                                         goto bad;
2752                                 }
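				/*
				 * The IV buffer is twice ivsize: the second half keeps a
				 * pristine copy of the per-section IV so it can be copied
				 * back into the first half before each use, since the
				 * crypto operation may update the IV in place.
				 */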
2753                                 memcpy(section_req->iv + ivsize, crypt_data, ivsize);
2754                                 section_req->cryptlen = (size_t)ic->journal_section_sectors << SECTOR_SHIFT;
2755                                 ic->sk_requests[i] = section_req;
2756                                 DEBUG_bytes(crypt_data, ivsize, "iv(%u)", i);
2757                         }
2758                 }
2759         }
2760
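	/*
	 * The keystream generation above may have encrypted the commit id
	 * seeds in place, so two of them could now collide; bump duplicates
	 * until all N_COMMIT_IDS values are distinct.
	 */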
2761         for (i = 0; i < N_COMMIT_IDS; i++) {
2762                 unsigned j;
2763 retest_commit_id:
2764                 for (j = 0; j < i; j++) {
2765                         if (ic->commit_ids[j] == ic->commit_ids[i]) {
2766                                 ic->commit_ids[i] = cpu_to_le64(le64_to_cpu(ic->commit_ids[i]) + 1);
2767                                 goto retest_commit_id;
2768                         }
2769                 }
2770                 DEBUG_print("commit id %u: %016llx\n", i, ic->commit_ids[i]);
2771         }
2772
2773         journal_tree_size = (__u64)ic->journal_entries * sizeof(struct journal_node);
2774         if (journal_tree_size > ULONG_MAX) {
2775                 *error = "Journal doesn't fit into memory";
2776                 r = -ENOMEM;
2777                 goto bad;
2778         }
2779         ic->journal_tree = kvmalloc(journal_tree_size, GFP_KERNEL);
2780         if (!ic->journal_tree) {
2781                 *error = "Could not allocate memory for journal tree";
2782                 r = -ENOMEM;
2783         }
2784 bad:
2785         kfree(crypt_data);
2786         kfree(crypt_iv);
2787         skcipher_request_free(req);
2788
2789         return r;
2790 }
2791
2792 /*
2793  * Construct an integrity mapping
2794  *
2795  * Arguments:
2796  *      device
2797  *      offset from the start of the device
2798  *      tag size
2799  *      D - direct writes, J - journal writes, R - recovery mode
2800  *      number of optional arguments
2801  *      optional arguments:
2802  *              journal_sectors
2803  *              interleave_sectors
2804  *              buffer_sectors
2805  *              journal_watermark
2806  *              commit_time
2807  *              internal_hash
2808  *              journal_crypt
2809  *              journal_mac
2810  *              block_size
2811  */
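/*
 * Illustrative example only (device name and sizes are placeholders, not
 * taken from this file): a journaled mapping with 4096-byte blocks and a
 * tag size derived from an internal crc32c hash might be loaded with a
 * table line such as
 *      0 <provided_data_sectors> integrity /dev/sdb 0 - J 2 block_size:4096 internal_hash:crc32c
 */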
2812 static int dm_integrity_ctr(struct dm_target *ti, unsigned argc, char **argv)
2813 {
2814         struct dm_integrity_c *ic;
2815         char dummy;
2816         int r;
2817         unsigned extra_args;
2818         struct dm_arg_set as;
2819         static const struct dm_arg _args[] = {
2820                 {0, 9, "Invalid number of feature args"},
2821         };
2822         unsigned journal_sectors, interleave_sectors, buffer_sectors, journal_watermark, sync_msec;
2823         bool should_write_sb;
2824         __u64 threshold;
2825         unsigned long long start;
2826
2827 #define DIRECT_ARGUMENTS        4
2828
2829         if (argc <= DIRECT_ARGUMENTS) {
2830                 ti->error = "Invalid argument count";
2831                 return -EINVAL;
2832         }
2833
2834         ic = kzalloc(sizeof(struct dm_integrity_c), GFP_KERNEL);
2835         if (!ic) {
2836                 ti->error = "Cannot allocate integrity context";
2837                 return -ENOMEM;
2838         }
2839         ti->private = ic;
2840         ti->per_io_data_size = sizeof(struct dm_integrity_io);
2841
2842         ic->in_progress = RB_ROOT;
2843         init_waitqueue_head(&ic->endio_wait);
2844         bio_list_init(&ic->flush_bio_list);
2845         init_waitqueue_head(&ic->copy_to_journal_wait);
2846         init_completion(&ic->crypto_backoff);
2847         atomic64_set(&ic->number_of_mismatches, 0);
2848
2849         r = dm_get_device(ti, argv[0], dm_table_get_mode(ti->table), &ic->dev);
2850         if (r) {
2851                 ti->error = "Device lookup failed";
2852                 goto bad;
2853         }
2854
2855         if (sscanf(argv[1], "%llu%c", &start, &dummy) != 1 || start != (sector_t)start) {
2856                 ti->error = "Invalid starting offset";
2857                 r = -EINVAL;
2858                 goto bad;
2859         }
2860         ic->start = start;
2861
2862         if (strcmp(argv[2], "-")) {
2863                 if (sscanf(argv[2], "%u%c", &ic->tag_size, &dummy) != 1 || !ic->tag_size) {
2864                         ti->error = "Invalid tag size";
2865                         r = -EINVAL;
2866                         goto bad;
2867                 }
2868         }
2869
2870         if (!strcmp(argv[3], "J") || !strcmp(argv[3], "D") || !strcmp(argv[3], "R"))
2871                 ic->mode = argv[3][0];
2872         else {
2873                 ti->error = "Invalid mode (expecting J, D, R)";
2874                 r = -EINVAL;
2875                 goto bad;
2876         }
2877
2878         ic->device_sectors = i_size_read(ic->dev->bdev->bd_inode) >> SECTOR_SHIFT;
2879         journal_sectors = min((sector_t)DEFAULT_MAX_JOURNAL_SECTORS,
2880                         ic->device_sectors >> DEFAULT_JOURNAL_SIZE_FACTOR);
2881         interleave_sectors = DEFAULT_INTERLEAVE_SECTORS;
2882         buffer_sectors = DEFAULT_BUFFER_SECTORS;
2883         journal_watermark = DEFAULT_JOURNAL_WATERMARK;
2884         sync_msec = DEFAULT_SYNC_MSEC;
2885         ic->sectors_per_block = 1;
2886
2887         as.argc = argc - DIRECT_ARGUMENTS;
2888         as.argv = argv + DIRECT_ARGUMENTS;
2889         r = dm_read_arg_group(_args, &as, &extra_args, &ti->error);
2890         if (r)
2891                 goto bad;
2892
2893         while (extra_args--) {
2894                 const char *opt_string;
2895                 unsigned val;
2896                 opt_string = dm_shift_arg(&as);
2897                 if (!opt_string) {
2898                         r = -EINVAL;
2899                         ti->error = "Not enough feature arguments";
2900                         goto bad;
2901                 }
2902                 if (sscanf(opt_string, "journal_sectors:%u%c", &val, &dummy) == 1)
2903                         journal_sectors = val;
2904                 else if (sscanf(opt_string, "interleave_sectors:%u%c", &val, &dummy) == 1)
2905                         interleave_sectors = val;
2906                 else if (sscanf(opt_string, "buffer_sectors:%u%c", &val, &dummy) == 1)
2907                         buffer_sectors = val;
2908                 else if (sscanf(opt_string, "journal_watermark:%u%c", &val, &dummy) == 1 && val <= 100)
2909                         journal_watermark = val;
2910                 else if (sscanf(opt_string, "commit_time:%u%c", &val, &dummy) == 1)
2911                         sync_msec = val;
2912                 else if (sscanf(opt_string, "block_size:%u%c", &val, &dummy) == 1) {
2913                         if (val < 1 << SECTOR_SHIFT ||
2914                             val > MAX_SECTORS_PER_BLOCK << SECTOR_SHIFT ||
2915                             (val & (val - 1))) {
2916                                 r = -EINVAL;
2917                                 ti->error = "Invalid block_size argument";
2918                                 goto bad;
2919                         }
2920                         ic->sectors_per_block = val >> SECTOR_SHIFT;
2921                 } else if (!strncmp(opt_string, "internal_hash:", strlen("internal_hash:"))) {
2922                         r = get_alg_and_key(opt_string, &ic->internal_hash_alg, &ti->error,
2923                                             "Invalid internal_hash argument");
2924                         if (r)
2925                                 goto bad;
2926                 } else if (!strncmp(opt_string, "journal_crypt:", strlen("journal_crypt:"))) {
2927                         r = get_alg_and_key(opt_string, &ic->journal_crypt_alg, &ti->error,
2928                                             "Invalid journal_crypt argument");
2929                         if (r)
2930                                 goto bad;
2931                 } else if (!strncmp(opt_string, "journal_mac:", strlen("journal_mac:"))) {
2932                         r = get_alg_and_key(opt_string, &ic->journal_mac_alg, &ti->error,
2933                                             "Invalid journal_mac argument");
2934                         if (r)
2935                                 goto bad;
2936                 } else {
2937                         r = -EINVAL;
2938                         ti->error = "Invalid argument";
2939                         goto bad;
2940                 }
2941         }
2942
2943         r = get_mac(&ic->internal_hash, &ic->internal_hash_alg, &ti->error,
2944                     "Invalid internal hash", "Error setting internal hash key");
2945         if (r)
2946                 goto bad;
2947
2948         r = get_mac(&ic->journal_mac, &ic->journal_mac_alg, &ti->error,
2949                     "Invalid journal mac", "Error setting journal mac key");
2950         if (r)
2951                 goto bad;
2952
2953         if (!ic->tag_size) {
2954                 if (!ic->internal_hash) {
2955                         ti->error = "Unknown tag size";
2956                         r = -EINVAL;
2957                         goto bad;
2958                 }
2959                 ic->tag_size = crypto_shash_digestsize(ic->internal_hash);
2960         }
2961         if (ic->tag_size > MAX_TAG_SIZE) {
2962                 ti->error = "Too big tag size";
2963                 r = -EINVAL;
2964                 goto bad;
2965         }
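	/*
	 * For power-of-two tag sizes, remember log2 so tag offsets can be
	 * computed with shifts; -1 selects the generic multiply path.
	 */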
2966         if (!(ic->tag_size & (ic->tag_size - 1)))
2967                 ic->log2_tag_size = __ffs(ic->tag_size);
2968         else
2969                 ic->log2_tag_size = -1;
2970
2971         ic->autocommit_jiffies = msecs_to_jiffies(sync_msec);
2972         ic->autocommit_msec = sync_msec;
2973         setup_timer(&ic->autocommit_timer, autocommit_fn, (unsigned long)ic);
2974
2975         ic->io = dm_io_client_create();
2976         if (IS_ERR(ic->io)) {
2977                 r = PTR_ERR(ic->io);
2978                 ic->io = NULL;
2979                 ti->error = "Cannot allocate dm io";
2980                 goto bad;
2981         }
2982
2983         ic->journal_io_mempool = mempool_create_slab_pool(JOURNAL_IO_MEMPOOL, journal_io_cache);
2984         if (!ic->journal_io_mempool) {
2985                 r = -ENOMEM;
2986                 ti->error = "Cannot allocate mempool";
2987                 goto bad;
2988         }
2989
2990         ic->metadata_wq = alloc_workqueue("dm-integrity-metadata",
2991                                           WQ_MEM_RECLAIM, METADATA_WORKQUEUE_MAX_ACTIVE);
2992         if (!ic->metadata_wq) {
2993                 ti->error = "Cannot allocate workqueue";
2994                 r = -ENOMEM;
2995                 goto bad;
2996         }
2997
2998         /*
2999          * If this workqueue were percpu, it would cause bio reordering
3000          * and reduced performance.
3001          */
3002         ic->wait_wq = alloc_workqueue("dm-integrity-wait", WQ_MEM_RECLAIM | WQ_UNBOUND, 1);
3003         if (!ic->wait_wq) {
3004                 ti->error = "Cannot allocate workqueue";
3005                 r = -ENOMEM;
3006                 goto bad;
3007         }
3008
3009         ic->offload_wq = alloc_workqueue("dm-integrity-offload", WQ_MEM_RECLAIM,
3010                                           METADATA_WORKQUEUE_MAX_ACTIVE);
3011         if (!ic->offload_wq) {
3012                 ti->error = "Cannot allocate workqueue";
3013                 r = -ENOMEM;
3014                 goto bad;
3015         }
3016
3017         ic->commit_wq = alloc_workqueue("dm-integrity-commit", WQ_MEM_RECLAIM, 1);
3018         if (!ic->commit_wq) {
3019                 ti->error = "Cannot allocate workqueue";
3020                 r = -ENOMEM;
3021                 goto bad;
3022         }
3023         INIT_WORK(&ic->commit_work, integrity_commit);
3024
3025         if (ic->mode == 'J') {
3026                 ic->writer_wq = alloc_workqueue("dm-integrity-writer", WQ_MEM_RECLAIM, 1);
3027                 if (!ic->writer_wq) {
3028                         ti->error = "Cannot allocate workqueue";
3029                         r = -ENOMEM;
3030                         goto bad;
3031                 }
3032                 INIT_WORK(&ic->writer_work, integrity_writer);
3033         }
3034
3035         ic->sb = alloc_pages_exact(SB_SECTORS << SECTOR_SHIFT, GFP_KERNEL);
3036         if (!ic->sb) {
3037                 r = -ENOMEM;
3038                 ti->error = "Cannot allocate superblock area";
3039                 goto bad;
3040         }
3041
3042         r = sync_rw_sb(ic, REQ_OP_READ, 0);
3043         if (r) {
3044                 ti->error = "Error reading superblock";
3045                 goto bad;
3046         }
3047         should_write_sb = false;
3048         if (memcmp(ic->sb->magic, SB_MAGIC, 8)) {
3049                 if (ic->mode != 'R') {
3050                         if (memchr_inv(ic->sb, 0, SB_SECTORS << SECTOR_SHIFT)) {
3051                                 r = -EINVAL;
3052                                 ti->error = "The device is not initialized";
3053                                 goto bad;
3054                         }
3055                 }
3056
3057                 r = initialize_superblock(ic, journal_sectors, interleave_sectors);
3058                 if (r) {
3059                         ti->error = "Could not initialize superblock";
3060                         goto bad;
3061                 }
3062                 if (ic->mode != 'R')
3063                         should_write_sb = true;
3064         }
3065
3066         if (ic->sb->version != SB_VERSION) {
3067                 r = -EINVAL;
3068                 ti->error = "Unknown version";
3069                 goto bad;
3070         }
3071         if (le16_to_cpu(ic->sb->integrity_tag_size) != ic->tag_size) {
3072                 r = -EINVAL;
3073                 ti->error = "Tag size doesn't match the information in superblock";
3074                 goto bad;
3075         }
3076         if (ic->sb->log2_sectors_per_block != __ffs(ic->sectors_per_block)) {
3077                 r = -EINVAL;
3078                 ti->error = "Block size doesn't match the information in superblock";
3079                 goto bad;
3080         }
3081         if (!le32_to_cpu(ic->sb->journal_sections)) {
3082                 r = -EINVAL;
3083                 ti->error = "Corrupted superblock, journal_sections is 0";
3084                 goto bad;
3085         }
3086         /* make sure that ti->max_io_len doesn't overflow */
3087         if (ic->sb->log2_interleave_sectors < MIN_LOG2_INTERLEAVE_SECTORS ||
3088             ic->sb->log2_interleave_sectors > MAX_LOG2_INTERLEAVE_SECTORS) {
3089                 r = -EINVAL;
3090                 ti->error = "Invalid interleave_sectors in the superblock";
3091                 goto bad;
3092         }
3093         ic->provided_data_sectors = le64_to_cpu(ic->sb->provided_data_sectors);
3094         if (ic->provided_data_sectors != le64_to_cpu(ic->sb->provided_data_sectors)) {
3095                 /* test for overflow */
3096                 r = -EINVAL;
3097                 ti->error = "The superblock has 64-bit device size, but the kernel was compiled with 32-bit sectors";
3098                 goto bad;
3099         }
3100         if (!!(ic->sb->flags & cpu_to_le32(SB_FLAG_HAVE_JOURNAL_MAC)) != !!ic->journal_mac_alg.alg_string) {
3101                 r = -EINVAL;
3102                 ti->error = "Journal mac mismatch";
3103                 goto bad;
3104         }
3105         r = calculate_device_limits(ic);
3106         if (r) {
3107                 ti->error = "The device is too small";
3108                 goto bad;
3109         }
3110         if (ti->len > ic->provided_data_sectors) {
3111                 r = -EINVAL;
3112                 ti->error = "Not enough provided sectors for requested mapping size";
3113                 goto bad;
3114         }
3115
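	/*
	 * Use the largest power of two not exceeding the requested
	 * buffer_sectors, further limited by the alignment of the metadata
	 * run and by a 2^31-byte ceiling per buffer.
	 */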
3116         if (!buffer_sectors)
3117                 buffer_sectors = 1;
3118         ic->log2_buffer_sectors = min3((int)__fls(buffer_sectors), (int)__ffs(ic->metadata_run), 31 - SECTOR_SHIFT);
3119
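	/*
	 * Convert the journal watermark percentage into an absolute
	 * free-space threshold: once free journal space drops below it,
	 * the journal is "watermark percent full" and a commit is queued.
	 * The +50 rounds the division to the nearest integer.
	 */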
3120         threshold = (__u64)ic->journal_entries * (100 - journal_watermark);
3121         threshold += 50;
3122         do_div(threshold, 100);
3123         ic->free_sectors_threshold = threshold;
3124
3125         DEBUG_print("initialized:\n");
3126         DEBUG_print("   integrity_tag_size %u\n", le16_to_cpu(ic->sb->integrity_tag_size));
3127         DEBUG_print("   journal_entry_size %u\n", ic->journal_entry_size);
3128         DEBUG_print("   journal_entries_per_sector %u\n", ic->journal_entries_per_sector);
3129         DEBUG_print("   journal_section_entries %u\n", ic->journal_section_entries);
3130         DEBUG_print("   journal_section_sectors %u\n", ic->journal_section_sectors);
3131         DEBUG_print("   journal_sections %u\n", (unsigned)le32_to_cpu(ic->sb->journal_sections));
3132         DEBUG_print("   journal_entries %u\n", ic->journal_entries);
3133         DEBUG_print("   log2_interleave_sectors %d\n", ic->sb->log2_interleave_sectors);
3134         DEBUG_print("   device_sectors 0x%llx\n", (unsigned long long)ic->device_sectors);
3135         DEBUG_print("   initial_sectors 0x%x\n", ic->initial_sectors);
3136         DEBUG_print("   metadata_run 0x%x\n", ic->metadata_run);
3137         DEBUG_print("   log2_metadata_run %d\n", ic->log2_metadata_run);
3138         DEBUG_print("   provided_data_sectors 0x%llx (%llu)\n", (unsigned long long)ic->provided_data_sectors,
3139                     (unsigned long long)ic->provided_data_sectors);
3140         DEBUG_print("   log2_buffer_sectors %u\n", ic->log2_buffer_sectors);
3141
3142         ic->bufio = dm_bufio_client_create(ic->dev->bdev, 1U << (SECTOR_SHIFT + ic->log2_buffer_sectors),
3143                                            1, 0, NULL, NULL);
3144         if (IS_ERR(ic->bufio)) {
3145                 r = PTR_ERR(ic->bufio);
3146                 ti->error = "Cannot initialize dm-bufio";
3147                 ic->bufio = NULL;
3148                 goto bad;
3149         }
3150         dm_bufio_set_sector_offset(ic->bufio, ic->start + ic->initial_sectors);
3151
3152         if (ic->mode != 'R') {
3153                 r = create_journal(ic, &ti->error);
3154                 if (r)
3155                         goto bad;
3156         }
3157
3158         if (should_write_sb) {
3159                 init_journal(ic, 0, ic->journal_sections, 0);
3160                 r = dm_integrity_failed(ic);
3161                 if (unlikely(r)) {
3162                         ti->error = "Error initializing journal";
3163                         goto bad;
3164                 }
3165                 r = sync_rw_sb(ic, REQ_OP_WRITE, REQ_FUA);
3166                 if (r) {
3167                         ti->error = "Error initializing superblock";
3168                         goto bad;
3169                 }
3170                 ic->just_formatted = true;
3171         }
3172
3173         r = dm_set_target_max_io_len(ti, 1U << ic->sb->log2_interleave_sectors);
3174         if (r)
3175                 goto bad;
3176
3177         if (!ic->internal_hash)
3178                 dm_integrity_set(ti, ic);
3179
3180         ti->num_flush_bios = 1;
3181         ti->flush_supported = true;
3182
3183         return 0;
3184 bad:
3185         dm_integrity_dtr(ti);
3186         return r;
3187 }
3188
3189 static void dm_integrity_dtr(struct dm_target *ti)
3190 {
3191         struct dm_integrity_c *ic = ti->private;
3192
3193         BUG_ON(!RB_EMPTY_ROOT(&ic->in_progress));
3194
3195         if (ic->metadata_wq)
3196                 destroy_workqueue(ic->metadata_wq);
3197         if (ic->wait_wq)
3198                 destroy_workqueue(ic->wait_wq);
3199         if (ic->offload_wq)
3200                 destroy_workqueue(ic->offload_wq);
3201         if (ic->commit_wq)
3202                 destroy_workqueue(ic->commit_wq);
3203         if (ic->writer_wq)
3204                 destroy_workqueue(ic->writer_wq);
3205         if (ic->bufio)
3206                 dm_bufio_client_destroy(ic->bufio);
3207         mempool_destroy(ic->journal_io_mempool);
3208         if (ic->io)
3209                 dm_io_client_destroy(ic->io);
3210         if (ic->dev)
3211                 dm_put_device(ti, ic->dev);
3212         dm_integrity_free_page_list(ic, ic->journal);
3213         dm_integrity_free_page_list(ic, ic->journal_io);
3214         dm_integrity_free_page_list(ic, ic->journal_xor);
3215         if (ic->journal_scatterlist)
3216                 dm_integrity_free_journal_scatterlist(ic, ic->journal_scatterlist);
3217         if (ic->journal_io_scatterlist)
3218                 dm_integrity_free_journal_scatterlist(ic, ic->journal_io_scatterlist);
3219         if (ic->sk_requests) {
3220                 unsigned i;
3221
3222                 for (i = 0; i < ic->journal_sections; i++) {
3223                         struct skcipher_request *req = ic->sk_requests[i];
3224                         if (req) {
3225                                 kzfree(req->iv);
3226                                 skcipher_request_free(req);
3227                         }
3228                 }
3229                 kvfree(ic->sk_requests);
3230         }
3231         kvfree(ic->journal_tree);
3232         if (ic->sb)
3233                 free_pages_exact(ic->sb, SB_SECTORS << SECTOR_SHIFT);
3234
3235         if (ic->internal_hash)
3236                 crypto_free_shash(ic->internal_hash);
3237         free_alg(&ic->internal_hash_alg);
3238
3239         if (ic->journal_crypt)
3240                 crypto_free_skcipher(ic->journal_crypt);
3241         free_alg(&ic->journal_crypt_alg);
3242
3243         if (ic->journal_mac)
3244                 crypto_free_shash(ic->journal_mac);
3245         free_alg(&ic->journal_mac_alg);
3246
3247         kfree(ic);
3248 }
3249
3250 static struct target_type integrity_target = {
3251         .name                   = "integrity",
3252         .version                = {1, 1, 0},
3253         .module                 = THIS_MODULE,
3254         .features               = DM_TARGET_SINGLETON | DM_TARGET_INTEGRITY,
3255         .ctr                    = dm_integrity_ctr,
3256         .dtr                    = dm_integrity_dtr,
3257         .map                    = dm_integrity_map,
3258         .postsuspend            = dm_integrity_postsuspend,
3259         .resume                 = dm_integrity_resume,
3260         .status                 = dm_integrity_status,
3261         .iterate_devices        = dm_integrity_iterate_devices,
3262         .io_hints               = dm_integrity_io_hints,
3263 };
3264
3265 int __init dm_integrity_init(void)
3266 {
3267         int r;
3268
3269         journal_io_cache = kmem_cache_create("integrity_journal_io",
3270                                              sizeof(struct journal_io), 0, 0, NULL);
3271         if (!journal_io_cache) {
3272                 DMERR("can't allocate journal io cache");
3273                 return -ENOMEM;
3274         }
3275
3276         r = dm_register_target(&integrity_target);
3277
3278         if (r < 0)
3279                 DMERR("register failed %d", r);
3280
3281         return r;
3282 }
3283
3284 void dm_integrity_exit(void)
3285 {
3286         dm_unregister_target(&integrity_target);
3287         kmem_cache_destroy(journal_io_cache);
3288 }
3289
3290 module_init(dm_integrity_init);
3291 module_exit(dm_integrity_exit);
3292
3293 MODULE_AUTHOR("Milan Broz");
3294 MODULE_AUTHOR("Mikulas Patocka");
3295 MODULE_DESCRIPTION(DM_NAME " target for integrity tags extension");
3296 MODULE_LICENSE("GPL");