drivers/md/dm.c

   1 /*
   2  * Copyright (C) 2001, 2002 Sistina Software (UK) Limited.
   3  * Copyright (C) 2004-2008 Red Hat, Inc. All rights reserved.
   4  *
   5  * This file is released under the GPL.
   6  */
   7
   8 #include "dm-core.h"
   9 #include "dm-rq.h"
  10 #include "dm-uevent.h"
  11
  12 #include <linux/init.h>
  13 #include <linux/module.h>
  14 #include <linux/mutex.h>
  15 #include <linux/sched/mm.h>
  16 #include <linux/sched/signal.h>
  17 #include <linux/blkpg.h>
  18 #include <linux/bio.h>
  19 #include <linux/mempool.h>
  20 #include <linux/dax.h>
  21 #include <linux/slab.h>
  22 #include <linux/idr.h>
  23 #include <linux/uio.h>
  24 #include <linux/hdreg.h>
  25 #include <linux/delay.h>
  26 #include <linux/wait.h>
  27 #include <linux/pr.h>
  28 #include <linux/refcount.h>
  29
  30 #define DM_MSG_PREFIX "core"
  31
  32 /*
  33  * Cookies are numeric values sent with CHANGE and REMOVE
  34  * uevents while resuming, removing or renaming the device.
  35  */
  36 #define DM_COOKIE_ENV_VAR_NAME "DM_COOKIE"
  37 #define DM_COOKIE_LENGTH 24
  38
  39 static const char *_name = DM_NAME;
  40
  41 static unsigned int major = 0;
  42 static unsigned int _major = 0;
  43
  44 static DEFINE_IDR(_minor_idr);
  45
  46 static DEFINE_SPINLOCK(_minor_lock);
  47
  48 static void do_deferred_remove(struct work_struct *w);
  49
  50 static DECLARE_WORK(deferred_remove_work, do_deferred_remove);
  51
  52 static struct workqueue_struct *deferred_remove_workqueue;
  53
  54 atomic_t dm_global_event_nr = ATOMIC_INIT(0);
  55 DECLARE_WAIT_QUEUE_HEAD(dm_global_eventq);
  56
  57 void dm_issue_global_event(void)
  58 {
  59         atomic_inc(&dm_global_event_nr);
  60         wake_up(&dm_global_eventq);
  61 }
  62
  63 /*
  64  * One of these is allocated (on-stack) per original bio.
  65  */
  66 struct clone_info {
  67         struct dm_table *map;
  68         struct bio *bio;
  69         struct dm_io *io;
  70         sector_t sector;
  71         unsigned sector_count;
  72 };
  73
  74 /*
  75  * One of these is allocated per clone bio.
  76  */
  77 #define DM_TIO_MAGIC 7282014
  78 struct dm_target_io {
  79         unsigned magic;
  80         struct dm_io *io;
  81         struct dm_target *ti;
  82         unsigned target_bio_nr;
  83         unsigned *len_ptr;
  84         bool inside_dm_io;
  85         struct bio clone;
  86 };
  87
  88 /*
  89  * One of these is allocated per original bio.
  90  * It contains the first clone used for that original.
  91  */
  92 #define DM_IO_MAGIC 5191977
  93 struct dm_io {
  94         unsigned magic;
  95         struct mapped_device *md;
  96         blk_status_t status;
  97         atomic_t io_count;
  98         struct bio *orig_bio;
  99         unsigned long start_time;
 100         spinlock_t endio_lock;
 101         struct dm_stats_aux stats_aux;
 102         /* last member of dm_target_io is 'struct bio' */
 103         struct dm_target_io tio;
 104 };
 105
 106 void *dm_per_bio_data(struct bio *bio, size_t data_size)
 107 {
 108         struct dm_target_io *tio = container_of(bio, struct dm_target_io, clone);
 109         if (!tio->inside_dm_io)
 110                 return (char *)bio - offsetof(struct dm_target_io, clone) - data_size;
 111         return (char *)bio - offsetof(struct dm_target_io, clone) - offsetof(struct dm_io, tio) - data_size;
 112 }
 113 EXPORT_SYMBOL_GPL(dm_per_bio_data);
 114
 115 struct bio *dm_bio_from_per_bio_data(void *data, size_t data_size)
 116 {
 117         struct dm_io *io = (struct dm_io *)((char *)data + data_size);
 118         if (io->magic == DM_IO_MAGIC)
 119                 return (struct bio *)((char *)io + offsetof(struct dm_io, tio) + offsetof(struct dm_target_io, clone));
 120         BUG_ON(io->magic != DM_TIO_MAGIC);
 121         return (struct bio *)((char *)io + offsetof(struct dm_target_io, clone));
 122 }
 123 EXPORT_SYMBOL_GPL(dm_bio_from_per_bio_data);
 124
 125 unsigned dm_bio_get_target_bio_nr(const struct bio *bio)
 126 {
 127         return container_of(bio, struct dm_target_io, clone)->target_bio_nr;
 128 }
 129 EXPORT_SYMBOL_GPL(dm_bio_get_target_bio_nr);
 130
 131 #define MINOR_ALLOCED ((void *)-1)
 132
 133 /*
 134  * Bits for the md->flags field.
 135  */
 136 #define DMF_BLOCK_IO_FOR_SUSPEND 0
 137 #define DMF_SUSPENDED 1
 138 #define DMF_FROZEN 2
 139 #define DMF_FREEING 3
 140 #define DMF_DELETING 4
 141 #define DMF_NOFLUSH_SUSPENDING 5
 142 #define DMF_DEFERRED_REMOVE 6
 143 #define DMF_SUSPENDED_INTERNALLY 7
 144 #define DMF_POST_SUSPENDING 8
 145
 146 #define DM_NUMA_NODE NUMA_NO_NODE
 147 static int dm_numa_node = DM_NUMA_NODE;
 148
 149 #define DEFAULT_SWAP_BIOS       (8 * 1048576 / PAGE_SIZE)
 150 static int swap_bios = DEFAULT_SWAP_BIOS;
 151 static int get_swap_bios(void)
 152 {
 153         int latch = READ_ONCE(swap_bios);
 154         if (unlikely(latch <= 0))
 155                 latch = DEFAULT_SWAP_BIOS;
 156         return latch;
 157 }
 158
 159 /*
 160  * For mempools pre-allocation at the table loading time.
 161  */
 162 struct dm_md_mempools {
 163         struct bio_set bs;
 164         struct bio_set io_bs;
 165 };
 166
 167 struct table_device {
 168         struct list_head list;
 169         refcount_t count;
 170         struct dm_dev dm_dev;
 171 };
 172
 173 /*
 174  * Bio-based DM's mempools' reserved IOs set by the user.
 175  */
 176 #define RESERVED_BIO_BASED_IOS          16
 177 static unsigned reserved_bio_based_ios = RESERVED_BIO_BASED_IOS;
 178
 179 static int __dm_get_module_param_int(int *module_param, int min, int max)
 180 {
 181         int param = READ_ONCE(*module_param);
 182         int modified_param = 0;
 183         bool modified = true;
 184
 185         if (param < min)
 186                 modified_param = min;
 187         else if (param > max)
 188                 modified_param = max;
 189         else
 190                 modified = false;
 191
 192         if (modified) {
 193                 (void)cmpxchg(module_param, param, modified_param);
 194                 param = modified_param;
 195         }
 196
 197         return param;
 198 }
 199
 200 unsigned __dm_get_module_param(unsigned *module_param,
 201                                unsigned def, unsigned max)
 202 {
 203         unsigned param = READ_ONCE(*module_param);
 204         unsigned modified_param = 0;
 205
 206         if (!param)
 207                 modified_param = def;
 208         else if (param > max)
 209                 modified_param = max;
 210
 211         if (modified_param) {
 212                 (void)cmpxchg(module_param, param, modified_param);
 213                 param = modified_param;
 214         }
 215
 216         return param;
 217 }
 218
 219 unsigned dm_get_reserved_bio_based_ios(void)
 220 {
 221         return __dm_get_module_param(&reserved_bio_based_ios,
 222                                      RESERVED_BIO_BASED_IOS, DM_RESERVED_MAX_IOS);
 223 }
 224 EXPORT_SYMBOL_GPL(dm_get_reserved_bio_based_ios);
 225
 226 static unsigned dm_get_numa_node(void)
 227 {
 228         return __dm_get_module_param_int(&dm_numa_node,
 229                                          DM_NUMA_NODE, num_online_nodes() - 1);
 230 }
 231
 232 static int __init local_init(void)
 233 {
 234         int r;
 235
 236         r = dm_uevent_init();
 237         if (r)
 238                 return r;
 239
 240         deferred_remove_workqueue = alloc_workqueue("kdmremove", WQ_UNBOUND, 1);
 241         if (!deferred_remove_workqueue) {
 242                 r = -ENOMEM;
 243                 goto out_uevent_exit;
 244         }
 245
 246         _major = major;
 247         r = register_blkdev(_major, _name);
 248         if (r < 0)
 249                 goto out_free_workqueue;
 250
 251         if (!_major)
 252                 _major = r;
 253
 254         return 0;
 255
 256 out_free_workqueue:
 257         destroy_workqueue(deferred_remove_workqueue);
 258 out_uevent_exit:
 259         dm_uevent_exit();
 260
 261         return r;
 262 }
 263
 264 static void local_exit(void)
 265 {
 266         destroy_workqueue(deferred_remove_workqueue);
 267
 268         unregister_blkdev(_major, _name);
 269         dm_uevent_exit();
 270
 271         _major = 0;
 272
 273         DMINFO("cleaned up");
 274 }
 275
 276 static int (*_inits[])(void) __initdata = {
 277         local_init,
 278         dm_target_init,
 279         dm_linear_init,
 280         dm_stripe_init,
 281         dm_io_init,
 282         dm_kcopyd_init,
 283         dm_interface_init,
 284         dm_statistics_init,
 285 };
 286
 287 static void (*_exits[])(void) = {
 288         local_exit,
 289         dm_target_exit,
 290         dm_linear_exit,
 291         dm_stripe_exit,
 292         dm_io_exit,
 293         dm_kcopyd_exit,
 294         dm_interface_exit,
 295         dm_statistics_exit,
 296 };
 297
 298 static int __init dm_init(void)
 299 {
 300         const int count = ARRAY_SIZE(_inits);
 301
 302         int r, i;
 303
 304         for (i = 0; i < count; i++) {
 305                 r = _inits[i]();
 306                 if (r)
 307                         goto bad;
 308         }
 309
 310         return 0;
 311
 312       bad:
 313         while (i--)
 314                 _exits[i]();
 315
 316         return r;
 317 }
 318
 319 static void __exit dm_exit(void)
 320 {
 321         int i = ARRAY_SIZE(_exits);
 322
 323         while (i--)
 324                 _exits[i]();
 325
 326         /*
 327          * Should be empty by this point.
 328          */
 329         idr_destroy(&_minor_idr);
 330 }
 331
 332 /*
 333  * Block device functions
 334  */
 335 int dm_deleting_md(struct mapped_device *md)
 336 {
 337         return test_bit(DMF_DELETING, &md->flags);
 338 }
 339
 340 static int dm_blk_open(struct block_device *bdev, fmode_t mode)
 341 {
 342         struct mapped_device *md;
 343
 344         spin_lock(&_minor_lock);
 345
 346         md = bdev->bd_disk->private_data;
 347         if (!md)
 348                 goto out;
 349
 350         if (test_bit(DMF_FREEING, &md->flags) ||
 351             dm_deleting_md(md)) {
 352                 md = NULL;
 353                 goto out;
 354         }
 355
 356         dm_get(md);
 357         atomic_inc(&md->open_count);
 358 out:
 359         spin_unlock(&_minor_lock);
 360
 361         return md ? 0 : -ENXIO;
 362 }
 363
 364 static void dm_blk_close(struct gendisk *disk, fmode_t mode)
 365 {
 366         struct mapped_device *md;
 367
 368         spin_lock(&_minor_lock);
 369
 370         md = disk->private_data;
 371         if (WARN_ON(!md))
 372                 goto out;
 373
 374         if (atomic_dec_and_test(&md->open_count) &&
 375             (test_bit(DMF_DEFERRED_REMOVE, &md->flags)))
 376                 queue_work(deferred_remove_workqueue, &deferred_remove_work);
 377
 378         dm_put(md);
 379 out:
 380         spin_unlock(&_minor_lock);
 381 }
 382
 383 int dm_open_count(struct mapped_device *md)
 384 {
 385         return atomic_read(&md->open_count);
 386 }
 387
 388 /*
 389  * Guarantees nothing is using the device before it's deleted.
 390  */
 391 int dm_lock_for_deletion(struct mapped_device *md, bool mark_deferred, bool only_deferred)
 392 {
 393         int r = 0;
 394
 395         spin_lock(&_minor_lock);
 396
 397         if (dm_open_count(md)) {
 398                 r = -EBUSY;
 399                 if (mark_deferred)
 400                         set_bit(DMF_DEFERRED_REMOVE, &md->flags);
 401         } else if (only_deferred && !test_bit(DMF_DEFERRED_REMOVE, &md->flags))
 402                 r = -EEXIST;
 403         else
 404                 set_bit(DMF_DELETING, &md->flags);
 405
 406         spin_unlock(&_minor_lock);
 407
 408         return r;
 409 }
 410
 411 int dm_cancel_deferred_remove(struct mapped_device *md)
 412 {
 413         int r = 0;
 414
 415         spin_lock(&_minor_lock);
 416
 417         if (test_bit(DMF_DELETING, &md->flags))
 418                 r = -EBUSY;
 419         else
 420                 clear_bit(DMF_DEFERRED_REMOVE, &md->flags);
 421
 422         spin_unlock(&_minor_lock);
 423
 424         return r;
 425 }
 426
 427 static void do_deferred_remove(struct work_struct *w)
 428 {
 429         dm_deferred_remove();
 430 }
 431
 432 sector_t dm_get_size(struct mapped_device *md)
 433 {
 434         return get_capacity(md->disk);
 435 }
 436
 437 struct request_queue *dm_get_md_queue(struct mapped_device *md)
 438 {
 439         return md->queue;
 440 }
 441
 442 struct dm_stats *dm_get_stats(struct mapped_device *md)
 443 {
 444         return &md->stats;
 445 }
 446
 447 static int dm_blk_getgeo(struct block_device *bdev, struct hd_geometry *geo)
 448 {
 449         struct mapped_device *md = bdev->bd_disk->private_data;
 450
 451         return dm_get_geometry(md, geo);
 452 }
 453
 454 static int dm_blk_report_zones(struct gendisk *disk, sector_t sector,
 455                                struct blk_zone *zones, unsigned int *nr_zones)
 456 {
 457 #ifdef CONFIG_BLK_DEV_ZONED
 458         struct mapped_device *md = disk->private_data;
 459         struct dm_target *tgt;
 460         struct dm_table *map;
 461         int srcu_idx, ret;
 462
 463         if (dm_suspended_md(md))
 464                 return -EAGAIN;
 465
 466         map = dm_get_live_table(md, &srcu_idx);
 467         if (!map) {
 468                 ret = -EIO;
 469                 goto out;
 470         }
 471
 472         tgt = dm_table_find_target(map, sector);
 473         if (!tgt) {
 474                 ret = -EIO;
 475                 goto out;
 476         }
 477
 478         /*
 479          * If we are executing this, we already know that the block device
 480          * is a zoned device and so each target should have support for that
 481          * type of drive. A missing report_zones method means that the target
 482          * driver has a problem.
 483          */
 484         if (WARN_ON(!tgt->type->report_zones)) {
 485                 ret = -EIO;
 486                 goto out;
 487         }
 488
 489         /*
 490          * blkdev_report_zones() will loop and call this again to cover all the
 491          * zones of the target, eventually moving on to the next target.
 492          * So there is no need to loop here trying to fill the entire array
 493          * of zones.
 494          */
 495         ret = tgt->type->report_zones(tgt, sector, zones, nr_zones);
 496
 497 out:
 498         dm_put_live_table(md, srcu_idx);
 499         return ret;
 500 #else
 501         return -ENOTSUPP;
 502 #endif
 503 }
 504
 505 static int dm_prepare_ioctl(struct mapped_device *md, int *srcu_idx,
 506                             struct block_device **bdev)
 507 {
 508         struct dm_target *tgt;
 509         struct dm_table *map;
 510         int r;
 511
 512 retry:
 513         r = -ENOTTY;
 514         map = dm_get_live_table(md, srcu_idx);
 515         if (!map || !dm_table_get_size(map))
 516                 return r;
 517
 518         /* We only support devices that have a single target */
 519         if (dm_table_get_num_targets(map) != 1)
 520                 return r;
 521
 522         tgt = dm_table_get_target(map, 0);
 523         if (!tgt->type->prepare_ioctl)
 524                 return r;
 525
 526         if (dm_suspended_md(md))
 527                 return -EAGAIN;
 528
 529         r = tgt->type->prepare_ioctl(tgt, bdev);
 530         if (r == -ENOTCONN && !fatal_signal_pending(current)) {
 531                 dm_put_live_table(md, *srcu_idx);
 532                 msleep(10);
 533                 goto retry;
 534         }
 535
 536         return r;
 537 }
 538
 539 static void dm_unprepare_ioctl(struct mapped_device *md, int srcu_idx)
 540 {
 541         dm_put_live_table(md, srcu_idx);
 542 }
 543
 544 static int dm_blk_ioctl(struct block_device *bdev, fmode_t mode,
 545                         unsigned int cmd, unsigned long arg)
 546 {
 547         struct mapped_device *md = bdev->bd_disk->private_data;
 548         int r, srcu_idx;
 549
 550         r = dm_prepare_ioctl(md, &srcu_idx, &bdev);
 551         if (r < 0)
 552                 goto out;
 553
 554         if (r > 0) {
 555                 /*
 556                  * Target determined this ioctl is being issued against a
 557                  * subset of the parent bdev; require extra privileges.
 558                  */
 559                 if (!capable(CAP_SYS_RAWIO)) {
 560                         DMDEBUG_LIMIT(
 561         "%s: sending ioctl %x to DM device without required privilege.",
 562                                 current->comm, cmd);
 563                         r = -ENOIOCTLCMD;
 564                         goto out;
 565                 }
 566         }
 567
 568         r =  __blkdev_driver_ioctl(bdev, mode, cmd, arg);
 569 out:
 570         dm_unprepare_ioctl(md, srcu_idx);
 571         return r;
 572 }
 573
 574 static void start_io_acct(struct dm_io *io);
 575
 576 static struct dm_io *alloc_io(struct mapped_device *md, struct bio *bio)
 577 {
 578         struct dm_io *io;
 579         struct dm_target_io *tio;
 580         struct bio *clone;
 581
 582         clone = bio_alloc_bioset(GFP_NOIO, 0, &md->io_bs);
 583         if (!clone)
 584                 return NULL;
 585
 586         tio = container_of(clone, struct dm_target_io, clone);
 587         tio->inside_dm_io = true;
 588         tio->io = NULL;
 589
 590         io = container_of(tio, struct dm_io, tio);
 591         io->magic = DM_IO_MAGIC;
 592         io->status = 0;
 593         atomic_set(&io->io_count, 1);
 594         io->orig_bio = bio;
 595         io->md = md;
 596         spin_lock_init(&io->endio_lock);
 597
 598         start_io_acct(io);
 599
 600         return io;
 601 }
 602
 603 static void free_io(struct mapped_device *md, struct dm_io *io)
 604 {
 605         bio_put(&io->tio.clone);
 606 }
 607
 608 static struct dm_target_io *alloc_tio(struct clone_info *ci, struct dm_target *ti,
 609                                       unsigned target_bio_nr, gfp_t gfp_mask)
 610 {
 611         struct dm_target_io *tio;
 612
 613         if (!ci->io->tio.io) {
 614                 /* the dm_target_io embedded in ci->io is available */
 615                 tio = &ci->io->tio;
 616         } else {
 617                 struct bio *clone = bio_alloc_bioset(gfp_mask, 0, &ci->io->md->bs);
 618                 if (!clone)
 619                         return NULL;
 620
 621                 tio = container_of(clone, struct dm_target_io, clone);
 622                 tio->inside_dm_io = false;
 623         }
 624
 625         tio->magic = DM_TIO_MAGIC;
 626         tio->io = ci->io;
 627         tio->ti = ti;
 628         tio->target_bio_nr = target_bio_nr;
 629
 630         return tio;
 631 }
 632
 633 static void free_tio(struct dm_target_io *tio)
 634 {
 635         if (tio->inside_dm_io)
 636                 return;
 637         bio_put(&tio->clone);
 638 }
 639
 640 static bool md_in_flight_bios(struct mapped_device *md)
 641 {
 642         int cpu;
 643         struct hd_struct *part = &dm_disk(md)->part0;
 644         long sum = 0;
 645
 646         for_each_possible_cpu(cpu) {
 647                 sum += part_stat_local_read_cpu(part, in_flight[0], cpu);
 648                 sum += part_stat_local_read_cpu(part, in_flight[1], cpu);
 649         }
 650
 651         return sum != 0;
 652 }
 653
 654 static bool md_in_flight(struct mapped_device *md)
 655 {
 656         if (queue_is_mq(md->queue))
 657                 return blk_mq_queue_inflight(md->queue);
 658         else
 659                 return md_in_flight_bios(md);
 660 }
 661
 662 static void start_io_acct(struct dm_io *io)
 663 {
 664         struct mapped_device *md = io->md;
 665         struct bio *bio = io->orig_bio;
 666
 667         io->start_time = jiffies;
 668
 669         generic_start_io_acct(md->queue, bio_op(bio), bio_sectors(bio),
 670                               &dm_disk(md)->part0);
 671
 672         if (unlikely(dm_stats_used(&md->stats)))
 673                 dm_stats_account_io(&md->stats, bio_data_dir(bio),
 674                                     bio->bi_iter.bi_sector, bio_sectors(bio),
 675                                     false, 0, &io->stats_aux);
 676 }
 677
 678 static void end_io_acct(struct mapped_device *md, struct bio *bio,
 679                         unsigned long start_time, struct dm_stats_aux *stats_aux)
 680 {
 681         unsigned long duration = jiffies - start_time;
 682
 683         if (unlikely(dm_stats_used(&md->stats)))
 684                 dm_stats_account_io(&md->stats, bio_data_dir(bio),
 685                                     bio->bi_iter.bi_sector, bio_sectors(bio),
 686                                     true, duration, stats_aux);
 687
 688         smp_wmb();
 689
 690         generic_end_io_acct(md->queue, bio_op(bio), &dm_disk(md)->part0,
 691                             start_time);
 692
 693         /* nudge anyone waiting on suspend queue */
 694         if (unlikely(wq_has_sleeper(&md->wait)))
 695                 wake_up(&md->wait);
 696 }
 697
 698 /*
 699  * Add the bio to the list of deferred io.
 700  */
 701 static void queue_io(struct mapped_device *md, struct bio *bio)
 702 {
 703         unsigned long flags;
 704
 705         spin_lock_irqsave(&md->deferred_lock, flags);
 706         bio_list_add(&md->deferred, bio);
 707         spin_unlock_irqrestore(&md->deferred_lock, flags);
 708         queue_work(md->wq, &md->work);
 709 }
 710
 711 /*
 712  * Everyone (including functions in this file), should use this
 713  * function to access the md->map field, and make sure they call
 714  * dm_put_live_table() when finished.
 715  */
 716 struct dm_table *dm_get_live_table(struct mapped_device *md, int *srcu_idx) __acquires(md->io_barrier)
 717 {
 718         *srcu_idx = srcu_read_lock(&md->io_barrier);
 719
 720         return srcu_dereference(md->map, &md->io_barrier);
 721 }
 722
 723 void dm_put_live_table(struct mapped_device *md, int srcu_idx) __releases(md->io_barrier)
 724 {
 725         srcu_read_unlock(&md->io_barrier, srcu_idx);
 726 }
 727
 728 void dm_sync_table(struct mapped_device *md)
 729 {
 730         synchronize_srcu(&md->io_barrier);
 731         synchronize_rcu_expedited();
 732 }
 733
 734 /*
 735  * A fast alternative to dm_get_live_table/dm_put_live_table.
 736  * The caller must not block between these two functions.
 737  */
 738 static struct dm_table *dm_get_live_table_fast(struct mapped_device *md) __acquires(RCU)
 739 {
 740         rcu_read_lock();
 741         return rcu_dereference(md->map);
 742 }
 743
 744 static void dm_put_live_table_fast(struct mapped_device *md) __releases(RCU)
 745 {
 746         rcu_read_unlock();
 747 }
 748
 749 static char *_dm_claim_ptr = "I belong to device-mapper";
 750
 751 /*
 752  * Open a table device so we can use it as a map destination.
 753  */
 754 static int open_table_device(struct table_device *td, dev_t dev,
 755                              struct mapped_device *md)
 756 {
 757         struct block_device *bdev;
 758
 759         int r;
 760
 761         BUG_ON(td->dm_dev.bdev);
 762
 763         bdev = blkdev_get_by_dev(dev, td->dm_dev.mode | FMODE_EXCL, _dm_claim_ptr);
 764         if (IS_ERR(bdev))
 765                 return PTR_ERR(bdev);
 766
 767         r = bd_link_disk_holder(bdev, dm_disk(md));
 768         if (r) {
 769                 blkdev_put(bdev, td->dm_dev.mode | FMODE_EXCL);
 770                 return r;
 771         }
 772
 773         td->dm_dev.bdev = bdev;
 774         td->dm_dev.dax_dev = dax_get_by_host(bdev->bd_disk->disk_name);
 775         return 0;
 776 }
 777
 778 /*
 779  * Close a table device that we've been using.
 780  */
 781 static void close_table_device(struct table_device *td, struct mapped_device *md)
 782 {
 783         if (!td->dm_dev.bdev)
 784                 return;
 785
 786         bd_unlink_disk_holder(td->dm_dev.bdev, dm_disk(md));
 787         blkdev_put(td->dm_dev.bdev, td->dm_dev.mode | FMODE_EXCL);
 788         put_dax(td->dm_dev.dax_dev);
 789         td->dm_dev.bdev = NULL;
 790         td->dm_dev.dax_dev = NULL;
 791 }
 792
 793 static struct table_device *find_table_device(struct list_head *l, dev_t dev,
 794                                               fmode_t mode)
 795 {
 796         struct table_device *td;
 797
 798         list_for_each_entry(td, l, list)
 799                 if (td->dm_dev.bdev->bd_dev == dev && td->dm_dev.mode == mode)
 800                         return td;
 801
 802         return NULL;
 803 }
 804
 805 int dm_get_table_device(struct mapped_device *md, dev_t dev, fmode_t mode,
 806                         struct dm_dev **result)
 807 {
 808         int r;
 809         struct table_device *td;
 810
 811         mutex_lock(&md->table_devices_lock);
 812         td = find_table_device(&md->table_devices, dev, mode);
 813         if (!td) {
 814                 td = kmalloc_node(sizeof(*td), GFP_KERNEL, md->numa_node_id);
 815                 if (!td) {
 816                         mutex_unlock(&md->table_devices_lock);
 817                         return -ENOMEM;
 818                 }
 819
 820                 td->dm_dev.mode = mode;
 821                 td->dm_dev.bdev = NULL;
 822
 823                 if ((r = open_table_device(td, dev, md))) {
 824                         mutex_unlock(&md->table_devices_lock);
 825                         kfree(td);
 826                         return r;
 827                 }
 828
 829                 format_dev_t(td->dm_dev.name, dev);
 830
 831                 refcount_set(&td->count, 1);
 832                 list_add(&td->list, &md->table_devices);
 833         } else {
 834                 refcount_inc(&td->count);
 835         }
 836         mutex_unlock(&md->table_devices_lock);
 837
 838         *result = &td->dm_dev;
 839         return 0;
 840 }
 841 EXPORT_SYMBOL_GPL(dm_get_table_device);
 842
 843 void dm_put_table_device(struct mapped_device *md, struct dm_dev *d)
 844 {
 845         struct table_device *td = container_of(d, struct table_device, dm_dev);
 846
 847         mutex_lock(&md->table_devices_lock);
 848         if (refcount_dec_and_test(&td->count)) {
 849                 close_table_device(td, md);
 850                 list_del(&td->list);
 851                 kfree(td);
 852         }
 853         mutex_unlock(&md->table_devices_lock);
 854 }
 855 EXPORT_SYMBOL(dm_put_table_device);
 856
 857 static void free_table_devices(struct list_head *devices)
 858 {
 859         struct list_head *tmp, *next;
 860
 861         list_for_each_safe(tmp, next, devices) {
 862                 struct table_device *td = list_entry(tmp, struct table_device, list);
 863
 864                 DMWARN("dm_destroy: %s still exists with %d references",
 865                        td->dm_dev.name, refcount_read(&td->count));
 866                 kfree(td);
 867         }
 868 }
 869
 870 /*
 871  * Get the geometry associated with a dm device
 872  */
 873 int dm_get_geometry(struct mapped_device *md, struct hd_geometry *geo)
 874 {
 875         *geo = md->geometry;
 876
 877         return 0;
 878 }
 879
 880 /*
 881  * Set the geometry of a device.
 882  */
 883 int dm_set_geometry(struct mapped_device *md, struct hd_geometry *geo)
 884 {
 885         sector_t sz = (sector_t)geo->cylinders * geo->heads * geo->sectors;
 886
 887         if (geo->start > sz) {
 888                 DMWARN("Start sector is beyond the geometry limits.");
 889                 return -EINVAL;
 890         }
 891
 892         md->geometry = *geo;
 893
 894         return 0;
 895 }
 896
 897 static int __noflush_suspending(struct mapped_device *md)
 898 {
 899         return test_bit(DMF_NOFLUSH_SUSPENDING, &md->flags);
 900 }
 901
 902 /*
 903  * Decrements the number of outstanding ios that a bio has been
 904  * cloned into, completing the original io if necc.
 905  */
 906 static void dec_pending(struct dm_io *io, blk_status_t error)
 907 {
 908         unsigned long flags;
 909         blk_status_t io_error;
 910         struct bio *bio;
 911         struct mapped_device *md = io->md;
 912         unsigned long start_time = 0;
 913         struct dm_stats_aux stats_aux;
 914
 915         /* Push-back supersedes any I/O errors */
 916         if (unlikely(error)) {
 917                 spin_lock_irqsave(&io->endio_lock, flags);
 918                 if (!(io->status == BLK_STS_DM_REQUEUE && __noflush_suspending(md)))
 919                         io->status = error;
 920                 spin_unlock_irqrestore(&io->endio_lock, flags);
 921         }
 922
 923         if (atomic_dec_and_test(&io->io_count)) {
 924                 if (io->status == BLK_STS_DM_REQUEUE) {
 925                         /*
 926                          * Target requested pushing back the I/O.
 927                          */
 928                         spin_lock_irqsave(&md->deferred_lock, flags);
 929                         if (__noflush_suspending(md))
 930                                 /* NOTE early return due to BLK_STS_DM_REQUEUE below */
 931                                 bio_list_add_head(&md->deferred, io->orig_bio);
 932                         else
 933                                 /* noflush suspend was interrupted. */
 934                                 io->status = BLK_STS_IOERR;
 935                         spin_unlock_irqrestore(&md->deferred_lock, flags);
 936                 }
 937
 938                 io_error = io->status;
 939                 bio = io->orig_bio;
 940                 start_time = io->start_time;
 941                 stats_aux = io->stats_aux;
 942                 free_io(md, io);
 943                 end_io_acct(md, bio, start_time, &stats_aux);
 944
 945                 if (io_error == BLK_STS_DM_REQUEUE)
 946                         return;
 947
 948                 if ((bio->bi_opf & REQ_PREFLUSH) && bio->bi_iter.bi_size) {
 949                         /*
 950                          * Preflush done for flush with data, reissue
 951                          * without REQ_PREFLUSH.
 952                          */
 953                         bio->bi_opf &= ~REQ_PREFLUSH;
 954                         queue_io(md, bio);
 955                 } else {
 956                         /* done with normal IO or empty flush */
 957                         if (io_error)
 958                                 bio->bi_status = io_error;
 959                         bio_endio(bio);
 960                 }
 961         }
 962 }
 963
 964 void disable_discard(struct mapped_device *md)
 965 {
 966         struct queue_limits *limits = dm_get_queue_limits(md);
 967
 968         /* device doesn't really support DISCARD, disable it */
 969         limits->max_discard_sectors = 0;
 970         blk_queue_flag_clear(QUEUE_FLAG_DISCARD, md->queue);
 971 }
 972
 973 void disable_write_same(struct mapped_device *md)
 974 {
 975         struct queue_limits *limits = dm_get_queue_limits(md);
 976
 977         /* device doesn't really support WRITE SAME, disable it */
 978         limits->max_write_same_sectors = 0;
 979 }
 980
 981 void disable_write_zeroes(struct mapped_device *md)
 982 {
 983         struct queue_limits *limits = dm_get_queue_limits(md);
 984
 985         /* device doesn't really support WRITE ZEROES, disable it */
 986         limits->max_write_zeroes_sectors = 0;
 987 }
 988
 989 static bool swap_bios_limit(struct dm_target *ti, struct bio *bio)
 990 {
 991         return unlikely((bio->bi_opf & REQ_SWAP) != 0) && unlikely(ti->limit_swap_bios);
 992 }
 993
 994 static void clone_endio(struct bio *bio)
 995 {
 996         blk_status_t error = bio->bi_status;
 997         struct dm_target_io *tio = container_of(bio, struct dm_target_io, clone);
 998         struct dm_io *io = tio->io;
 999         struct mapped_device *md = tio->io->md;
1000         dm_endio_fn endio = tio->ti->type->end_io;
1001
1002         if (unlikely(error == BLK_STS_TARGET)) {
1003                 if (bio_op(bio) == REQ_OP_DISCARD &&
1004                     !bio->bi_disk->queue->limits.max_discard_sectors)
1005                         disable_discard(md);
1006                 else if (bio_op(bio) == REQ_OP_WRITE_SAME &&
1007                          !bio->bi_disk->queue->limits.max_write_same_sectors)
1008                         disable_write_same(md);
1009                 else if (bio_op(bio) == REQ_OP_WRITE_ZEROES &&
1010                          !bio->bi_disk->queue->limits.max_write_zeroes_sectors)
1011                         disable_write_zeroes(md);
1012         }
1013
1014         if (endio) {
1015                 int r = endio(tio->ti, bio, &error);
1016                 switch (r) {
1017                 case DM_ENDIO_REQUEUE:
1018                         error = BLK_STS_DM_REQUEUE;
1019                         /*FALLTHRU*/
1020                 case DM_ENDIO_DONE:
1021                         break;
1022                 case DM_ENDIO_INCOMPLETE:
1023                         /* The target will handle the io */
1024                         return;
1025                 default:
1026                         DMWARN("unimplemented target endio return value: %d", r);
1027                         BUG();
1028                 }
1029         }
1030
1031         if (unlikely(swap_bios_limit(tio->ti, bio))) {
1032                 struct mapped_device *md = io->md;
1033                 up(&md->swap_bios_semaphore);
1034         }
1035
1036         free_tio(tio);
1037         dec_pending(io, error);
1038 }
1039
1040 /*
1041  * Return maximum size of I/O possible at the supplied sector up to the current
1042  * target boundary.
1043  */
1044 static sector_t max_io_len_target_boundary(sector_t sector, struct dm_target *ti)
1045 {
1046         sector_t target_offset = dm_target_offset(ti, sector);
1047
1048         return ti->len - target_offset;
1049 }
1050
1051 static sector_t max_io_len(sector_t sector, struct dm_target *ti)
1052 {
1053         sector_t len = max_io_len_target_boundary(sector, ti);
1054         sector_t offset, max_len;
1055
1056         /*
1057          * Does the target need to split even further?
1058          */
1059         if (ti->max_io_len) {
1060                 offset = dm_target_offset(ti, sector);
1061                 if (unlikely(ti->max_io_len & (ti->max_io_len - 1)))
1062                         max_len = sector_div(offset, ti->max_io_len);
1063                 else
1064                         max_len = offset & (ti->max_io_len - 1);
1065                 max_len = ti->max_io_len - max_len;
1066
1067                 if (len > max_len)
1068                         len = max_len;
1069         }
1070
1071         return len;
1072 }
1073
1074 int dm_set_target_max_io_len(struct dm_target *ti, sector_t len)
1075 {
1076         if (len > UINT_MAX) {
1077                 DMERR("Specified maximum size of target IO (%llu) exceeds limit (%u)",
1078                       (unsigned long long)len, UINT_MAX);
1079                 ti->error = "Maximum size of target IO is too large";
1080                 return -EINVAL;
1081         }
1082
1083         ti->max_io_len = (uint32_t) len;
1084
1085         return 0;
1086 }
1087 EXPORT_SYMBOL_GPL(dm_set_target_max_io_len);
1088
1089 static struct dm_target *dm_dax_get_live_target(struct mapped_device *md,
1090                                                 sector_t sector, int *srcu_idx)
1091         __acquires(md->io_barrier)
1092 {
1093         struct dm_table *map;
1094         struct dm_target *ti;
1095
1096         map = dm_get_live_table(md, srcu_idx);
1097         if (!map)
1098                 return NULL;
1099
1100         ti = dm_table_find_target(map, sector);
1101         if (!ti)
1102                 return NULL;
1103
1104         return ti;
1105 }
1106
1107 static long dm_dax_direct_access(struct dax_device *dax_dev, pgoff_t pgoff,
1108                                  long nr_pages, void **kaddr, pfn_t *pfn)
1109 {
1110         struct mapped_device *md = dax_get_private(dax_dev);
1111         sector_t sector = pgoff * PAGE_SECTORS;
1112         struct dm_target *ti;
1113         long len, ret = -EIO;
1114         int srcu_idx;
1115
1116         ti = dm_dax_get_live_target(md, sector, &srcu_idx);
1117
1118         if (!ti)
1119                 goto out;
1120         if (!ti->type->direct_access)
1121                 goto out;
1122         len = max_io_len(sector, ti) / PAGE_SECTORS;
1123         if (len < 1)
1124                 goto out;
1125         nr_pages = min(len, nr_pages);
1126         ret = ti->type->direct_access(ti, pgoff, nr_pages, kaddr, pfn);
1127
1128  out:
1129         dm_put_live_table(md, srcu_idx);
1130
1131         return ret;
1132 }
1133
1134 static bool dm_dax_supported(struct dax_device *dax_dev, struct block_device *bdev,
1135                 int blocksize, sector_t start, sector_t len)
1136 {
1137         struct mapped_device *md = dax_get_private(dax_dev);
1138         struct dm_table *map;
1139         bool ret = false;
1140         int srcu_idx;
1141
1142         map = dm_get_live_table(md, &srcu_idx);
1143         if (!map)
1144                 goto out;
1145
1146         ret = dm_table_supports_dax(map, device_not_dax_capable, &blocksize);
1147
1148 out:
1149         dm_put_live_table(md, srcu_idx);
1150
1151         return ret;
1152 }
1153
1154 static size_t dm_dax_copy_from_iter(struct dax_device *dax_dev, pgoff_t pgoff,
1155                                     void *addr, size_t bytes, struct iov_iter *i)
1156 {
1157         struct mapped_device *md = dax_get_private(dax_dev);
1158         sector_t sector = pgoff * PAGE_SECTORS;
1159         struct dm_target *ti;
1160         long ret = 0;
1161         int srcu_idx;
1162
1163         ti = dm_dax_get_live_target(md, sector, &srcu_idx);
1164
1165         if (!ti)
1166                 goto out;
1167         if (!ti->type->dax_copy_from_iter) {
1168                 ret = copy_from_iter(addr, bytes, i);
1169                 goto out;
1170         }
1171         ret = ti->type->dax_copy_from_iter(ti, pgoff, addr, bytes, i);
1172  out:
1173         dm_put_live_table(md, srcu_idx);
1174
1175         return ret;
1176 }
1177
1178 static size_t dm_dax_copy_to_iter(struct dax_device *dax_dev, pgoff_t pgoff,
1179                 void *addr, size_t bytes, struct iov_iter *i)
1180 {
1181         struct mapped_device *md = dax_get_private(dax_dev);
1182         sector_t sector = pgoff * PAGE_SECTORS;
1183         struct dm_target *ti;
1184         long ret = 0;
1185         int srcu_idx;
1186
1187         ti = dm_dax_get_live_target(md, sector, &srcu_idx);
1188
1189         if (!ti)
1190                 goto out;
1191         if (!ti->type->dax_copy_to_iter) {
1192                 ret = copy_to_iter(addr, bytes, i);
1193                 goto out;
1194         }
1195         ret = ti->type->dax_copy_to_iter(ti, pgoff, addr, bytes, i);
1196  out:
1197         dm_put_live_table(md, srcu_idx);
1198
1199         return ret;
1200 }
1201
1202 /*
1203  * A target may call dm_accept_partial_bio only from the map routine.  It is
1204  * allowed for all bio types except REQ_PREFLUSH and REQ_OP_ZONE_RESET.
1205  *
1206  * dm_accept_partial_bio informs the dm that the target only wants to process
1207  * additional n_sectors sectors of the bio and the rest of the data should be
1208  * sent in a next bio.
1209  *
1210  * A diagram that explains the arithmetics:
1211  * +--------------------+---------------+-------+
1212  * |         1          |       2       |   3   |
1213  * +--------------------+---------------+-------+
1214  *
1215  * <-------------- *tio->len_ptr --------------->
1216  *                      <------- bi_size ------->
1217  *                      <-- n_sectors -->
1218  *
1219  * Region 1 was already iterated over with bio_advance or similar function.
1220  *      (it may be empty if the target doesn't use bio_advance)
1221  * Region 2 is the remaining bio size that the target wants to process.
1222  *      (it may be empty if region 1 is non-empty, although there is no reason
1223  *       to make it empty)
1224  * The target requires that region 3 is to be sent in the next bio.
1225  *
1226  * If the target wants to receive multiple copies of the bio (via num_*bios, etc),
1227  * the partially processed part (the sum of regions 1+2) must be the same for all
1228  * copies of the bio.
1229  */
1230 void dm_accept_partial_bio(struct bio *bio, unsigned n_sectors)
1231 {
1232         struct dm_target_io *tio = container_of(bio, struct dm_target_io, clone);
1233         unsigned bi_size = bio->bi_iter.bi_size >> SECTOR_SHIFT;
1234         BUG_ON(bio->bi_opf & REQ_PREFLUSH);
1235         BUG_ON(bi_size > *tio->len_ptr);
1236         BUG_ON(n_sectors > bi_size);
1237         *tio->len_ptr -= bi_size - n_sectors;
1238         bio->bi_iter.bi_size = n_sectors << SECTOR_SHIFT;
1239 }
1240 EXPORT_SYMBOL_GPL(dm_accept_partial_bio);
1241
1242 /*
1243  * The zone descriptors obtained with a zone report indicate
1244  * zone positions within the underlying device of the target. The zone
1245  * descriptors must be remapped to match their position within the dm device.
1246  * The caller target should obtain the zones information using
1247  * blkdev_report_zones() to ensure that remapping for partition offset is
1248  * already handled.
1249  */
1250 void dm_remap_zone_report(struct dm_target *ti, sector_t start,
1251                           struct blk_zone *zones, unsigned int *nr_zones)
1252 {
1253 #ifdef CONFIG_BLK_DEV_ZONED
1254         struct blk_zone *zone;
1255         unsigned int nrz = *nr_zones;
1256         int i;
1257
1258         /*
1259          * Remap the start sector and write pointer position of the zones in
1260          * the array. Since we may have obtained from the target underlying
1261          * device more zones that the target size, also adjust the number
1262          * of zones.
1263          */
1264         for (i = 0; i < nrz; i++) {
1265                 zone = zones + i;
1266                 if (zone->start >= start + ti->len) {
1267                         memset(zone, 0, sizeof(struct blk_zone) * (nrz - i));
1268                         break;
1269                 }
1270
1271                 zone->start = zone->start + ti->begin - start;
1272                 if (zone->type == BLK_ZONE_TYPE_CONVENTIONAL)
1273                         continue;
1274
1275                 if (zone->cond == BLK_ZONE_COND_FULL)
1276                         zone->wp = zone->start + zone->len;
1277                 else if (zone->cond == BLK_ZONE_COND_EMPTY)
1278                         zone->wp = zone->start;
1279                 else
1280                         zone->wp = zone->wp + ti->begin - start;
1281         }
1282
1283         *nr_zones = i;
1284 #else /* !CONFIG_BLK_DEV_ZONED */
1285         *nr_zones = 0;
1286 #endif
1287 }
1288 EXPORT_SYMBOL_GPL(dm_remap_zone_report);
1289
1290 static noinline void __set_swap_bios_limit(struct mapped_device *md, int latch)
1291 {
1292         mutex_lock(&md->swap_bios_lock);
1293         while (latch < md->swap_bios) {
1294                 cond_resched();
1295                 down(&md->swap_bios_semaphore);
1296                 md->swap_bios--;
1297         }
1298         while (latch > md->swap_bios) {
1299                 cond_resched();
1300                 up(&md->swap_bios_semaphore);
1301                 md->swap_bios++;
1302         }
1303         mutex_unlock(&md->swap_bios_lock);
1304 }
1305
1306 static blk_qc_t __map_bio(struct dm_target_io *tio)
1307 {
1308         int r;
1309         sector_t sector;
1310         struct bio *clone = &tio->clone;
1311         struct dm_io *io = tio->io;
1312         struct mapped_device *md = io->md;
1313         struct dm_target *ti = tio->ti;
1314         blk_qc_t ret = BLK_QC_T_NONE;
1315
1316         clone->bi_end_io = clone_endio;
1317
1318         /*
1319          * Map the clone.  If r == 0 we don't need to do
1320          * anything, the target has assumed ownership of
1321          * this io.
1322          */
1323         atomic_inc(&io->io_count);
1324         sector = clone->bi_iter.bi_sector;
1325
1326         if (unlikely(swap_bios_limit(ti, clone))) {
1327                 int latch = get_swap_bios();
1328                 if (unlikely(latch != md->swap_bios))
1329                         __set_swap_bios_limit(md, latch);
1330                 down(&md->swap_bios_semaphore);
1331         }
1332
1333         r = ti->type->map(ti, clone);
1334         switch (r) {
1335         case DM_MAPIO_SUBMITTED:
1336                 break;
1337         case DM_MAPIO_REMAPPED:
1338                 /* the bio has been remapped so dispatch it */
1339                 trace_block_bio_remap(clone->bi_disk->queue, clone,
1340                                       bio_dev(io->orig_bio), sector);
1341                 ret = generic_make_request(clone);
1342                 break;
1343         case DM_MAPIO_KILL:
1344                 if (unlikely(swap_bios_limit(ti, clone)))
1345                         up(&md->swap_bios_semaphore);
1346                 free_tio(tio);
1347                 dec_pending(io, BLK_STS_IOERR);
1348                 break;
1349         case DM_MAPIO_REQUEUE:
1350                 if (unlikely(swap_bios_limit(ti, clone)))
1351                         up(&md->swap_bios_semaphore);
1352                 free_tio(tio);
1353                 dec_pending(io, BLK_STS_DM_REQUEUE);
1354                 break;
1355         default:
1356                 DMWARN("unimplemented target map return value: %d", r);
1357                 BUG();
1358         }
1359
1360         return ret;
1361 }
1362
1363 static void bio_setup_sector(struct bio *bio, sector_t sector, unsigned len)
1364 {
1365         bio->bi_iter.bi_sector = sector;
1366         bio->bi_iter.bi_size = to_bytes(len);
1367 }
1368
1369 /*
1370  * Creates a bio that consists of range of complete bvecs.
1371  */
1372 static int clone_bio(struct dm_target_io *tio, struct bio *bio,
1373                      sector_t sector, unsigned len)
1374 {
1375         struct bio *clone = &tio->clone;
1376
1377         __bio_clone_fast(clone, bio);
1378
1379         if (bio_integrity(bio)) {
1380                 int r;
1381
1382                 if (unlikely(!dm_target_has_integrity(tio->ti->type) &&
1383                              !dm_target_passes_integrity(tio->ti->type))) {
1384                         DMWARN("%s: the target %s doesn't support integrity data.",
1385                                 dm_device_name(tio->io->md),
1386                                 tio->ti->type->name);
1387                         return -EIO;
1388                 }
1389
1390                 r = bio_integrity_clone(clone, bio, GFP_NOIO);
1391                 if (r < 0)
1392                         return r;
1393         }
1394
1395         bio_advance(clone, to_bytes(sector - clone->bi_iter.bi_sector));
1396         clone->bi_iter.bi_size = to_bytes(len);
1397
1398         if (bio_integrity(bio))
1399                 bio_integrity_trim(clone);
1400
1401         return 0;
1402 }
1403
1404 static void alloc_multiple_bios(struct bio_list *blist, struct clone_info *ci,
1405                                 struct dm_target *ti, unsigned num_bios)
1406 {
1407         struct dm_target_io *tio;
1408         int try;
1409
1410         if (!num_bios)
1411                 return;
1412
1413         if (num_bios == 1) {
1414                 tio = alloc_tio(ci, ti, 0, GFP_NOIO);
1415                 bio_list_add(blist, &tio->clone);
1416                 return;
1417         }
1418
1419         for (try = 0; try < 2; try++) {
1420                 int bio_nr;
1421                 struct bio *bio;
1422
1423                 if (try)
1424                         mutex_lock(&ci->io->md->table_devices_lock);
1425                 for (bio_nr = 0; bio_nr < num_bios; bio_nr++) {
1426                         tio = alloc_tio(ci, ti, bio_nr, try ? GFP_NOIO : GFP_NOWAIT);
1427                         if (!tio)
1428                                 break;
1429
1430                         bio_list_add(blist, &tio->clone);
1431                 }
1432                 if (try)
1433                         mutex_unlock(&ci->io->md->table_devices_lock);
1434                 if (bio_nr == num_bios)
1435                         return;
1436
1437                 while ((bio = bio_list_pop(blist))) {
1438                         tio = container_of(bio, struct dm_target_io, clone);
1439                         free_tio(tio);
1440                 }
1441         }
1442 }
1443
1444 static blk_qc_t __clone_and_map_simple_bio(struct clone_info *ci,
1445                                            struct dm_target_io *tio, unsigned *len)
1446 {
1447         struct bio *clone = &tio->clone;
1448
1449         tio->len_ptr = len;
1450
1451         __bio_clone_fast(clone, ci->bio);
1452         if (len)
1453                 bio_setup_sector(clone, ci->sector, *len);
1454
1455         return __map_bio(tio);
1456 }
1457
1458 static void __send_duplicate_bios(struct clone_info *ci, struct dm_target *ti,
1459                                   unsigned num_bios, unsigned *len)
1460 {
1461         struct bio_list blist = BIO_EMPTY_LIST;
1462         struct bio *bio;
1463         struct dm_target_io *tio;
1464
1465         alloc_multiple_bios(&blist, ci, ti, num_bios);
1466
1467         while ((bio = bio_list_pop(&blist))) {
1468                 tio = container_of(bio, struct dm_target_io, clone);
1469                 (void) __clone_and_map_simple_bio(ci, tio, len);
1470         }
1471 }
1472
1473 static int __send_empty_flush(struct clone_info *ci)
1474 {
1475         unsigned target_nr = 0;
1476         struct dm_target *ti;
1477
1478         /*
1479          * Empty flush uses a statically initialized bio, as the base for
1480          * cloning.  However, blkg association requires that a bdev is
1481          * associated with a gendisk, which doesn't happen until the bdev is
1482          * opened.  So, blkg association is done at issue time of the flush
1483          * rather than when the device is created in alloc_dev().
1484          */
1485         bio_set_dev(ci->bio, ci->io->md->bdev);
1486
1487         BUG_ON(bio_has_data(ci->bio));
1488         while ((ti = dm_table_get_target(ci->map, target_nr++)))
1489                 __send_duplicate_bios(ci, ti, ti->num_flush_bios, NULL);
1490         return 0;
1491 }
1492
1493 static int __clone_and_map_data_bio(struct clone_info *ci, struct dm_target *ti,
1494                                     sector_t sector, unsigned *len)
1495 {
1496         struct bio *bio = ci->bio;
1497         struct dm_target_io *tio;
1498         int r;
1499
1500         tio = alloc_tio(ci, ti, 0, GFP_NOIO);
1501         tio->len_ptr = len;
1502         r = clone_bio(tio, bio, sector, *len);
1503         if (r < 0) {
1504                 free_tio(tio);
1505                 return r;
1506         }
1507         (void) __map_bio(tio);
1508
1509         return 0;
1510 }
1511
/*
 * Signature shared by the get_num_*_bios() accessors below: return how
 * many clones a target wants for one kind of request.
 * NOTE(review): no user of this typedef is visible in this part of the
 * file - confirm it is still needed.
 */
typedef unsigned (*get_num_bios_fn)(struct dm_target *ti);
1513
1514 static unsigned get_num_discard_bios(struct dm_target *ti)
1515 {
1516         return ti->num_discard_bios;
1517 }
1518
1519 static unsigned get_num_secure_erase_bios(struct dm_target *ti)
1520 {
1521         return ti->num_secure_erase_bios;
1522 }
1523
1524 static unsigned get_num_write_same_bios(struct dm_target *ti)
1525 {
1526         return ti->num_write_same_bios;
1527 }
1528
1529 static unsigned get_num_write_zeroes_bios(struct dm_target *ti)
1530 {
1531         return ti->num_write_zeroes_bios;
1532 }
1533
1534 static int __send_changing_extent_only(struct clone_info *ci, struct dm_target *ti,
1535                                        unsigned num_bios)
1536 {
1537         unsigned len;
1538
1539         /*
1540          * Even though the device advertised support for this type of
1541          * request, that does not mean every target supports it, and
1542          * reconfiguration might also have changed that since the
1543          * check was performed.
1544          */
1545         if (!num_bios)
1546                 return -EOPNOTSUPP;
1547
1548         len = min((sector_t)ci->sector_count, max_io_len_target_boundary(ci->sector, ti));
1549
1550         __send_duplicate_bios(ci, ti, num_bios, &len);
1551
1552         ci->sector += len;
1553         ci->sector_count -= len;
1554
1555         return 0;
1556 }
1557
/* Send the discard in @ci to @ti; see __send_changing_extent_only(). */
static int __send_discard(struct clone_info *ci, struct dm_target *ti)
{
        unsigned num_bios = get_num_discard_bios(ti);

        return __send_changing_extent_only(ci, ti, num_bios);
}
1562
/* Send the secure erase in @ci to @ti; see __send_changing_extent_only(). */
static int __send_secure_erase(struct clone_info *ci, struct dm_target *ti)
{
        unsigned num_bios = get_num_secure_erase_bios(ti);

        return __send_changing_extent_only(ci, ti, num_bios);
}
1567
/* Send the write same in @ci to @ti; see __send_changing_extent_only(). */
static int __send_write_same(struct clone_info *ci, struct dm_target *ti)
{
        unsigned num_bios = get_num_write_same_bios(ti);

        return __send_changing_extent_only(ci, ti, num_bios);
}
1572
/* Send the write zeroes in @ci to @ti; see __send_changing_extent_only(). */
static int __send_write_zeroes(struct clone_info *ci, struct dm_target *ti)
{
        unsigned num_bios = get_num_write_zeroes_bios(ti);

        return __send_changing_extent_only(ci, ti, num_bios);
}
1577
1578 static bool is_abnormal_io(struct bio *bio)
1579 {
1580         bool r = false;
1581
1582         switch (bio_op(bio)) {
1583         case REQ_OP_DISCARD:
1584         case REQ_OP_SECURE_ERASE:
1585         case REQ_OP_WRITE_SAME:
1586         case REQ_OP_WRITE_ZEROES:
1587                 r = true;
1588                 break;
1589         }
1590
1591         return r;
1592 }
1593
1594 static bool __process_abnormal_io(struct clone_info *ci, struct dm_target *ti,
1595                                   int *result)
1596 {
1597         struct bio *bio = ci->bio;
1598
1599         if (bio_op(bio) == REQ_OP_DISCARD)
1600                 *result = __send_discard(ci, ti);
1601         else if (bio_op(bio) == REQ_OP_SECURE_ERASE)
1602                 *result = __send_secure_erase(ci, ti);
1603         else if (bio_op(bio) == REQ_OP_WRITE_SAME)
1604                 *result = __send_write_same(ci, ti);
1605         else if (bio_op(bio) == REQ_OP_WRITE_ZEROES)
1606                 *result = __send_write_zeroes(ci, ti);
1607         else
1608                 return false;
1609
1610         return true;
1611 }
1612
1613 /*
1614  * Select the correct strategy for processing a non-flush bio.
1615  */
1616 static int __split_and_process_non_flush(struct clone_info *ci)
1617 {
1618         struct dm_target *ti;
1619         unsigned len;
1620         int r;
1621
1622         ti = dm_table_find_target(ci->map, ci->sector);
1623         if (!ti)
1624                 return -EIO;
1625
1626         if (__process_abnormal_io(ci, ti, &r))
1627                 return r;
1628
1629         len = min_t(sector_t, max_io_len(ci->sector, ti), ci->sector_count);
1630
1631         r = __clone_and_map_data_bio(ci, ti, ci->sector, &len);
1632         if (r < 0)
1633                 return r;
1634
1635         ci->sector += len;
1636         ci->sector_count -= len;
1637
1638         return 0;
1639 }
1640
1641 static void init_clone_info(struct clone_info *ci, struct mapped_device *md,
1642                             struct dm_table *map, struct bio *bio)
1643 {
1644         ci->map = map;
1645         ci->io = alloc_io(md, bio);
1646         ci->sector = bio->bi_iter.bi_sector;
1647 }
1648
/*
 * Subtract @subnd from statistics counter @field of partition @part.
 * The result of part_stat_get() is used as an lvalue.  The one user in
 * this part of the file wraps the call in part_stat_lock() /
 * part_stat_unlock().
 */
#define __dm_part_stat_sub(part, field, subnd)  \
        (part_stat_get(part, field) -= (subnd))
1651
/*
 * Entry point to split a bio into clones and submit them to the targets.
 *
 * @md:  mapped device the bio was issued to
 * @map: table used to look up the targets
 * @bio: the original bio
 *
 * Three cases are handled:
 *  - REQ_PREFLUSH: an empty flush is sent to every target, using an
 *    on-stack bio as the template for the clones;
 *  - REQ_OP_ZONE_RESET: processed once with a sector count of zero;
 *  - everything else: processed piece by piece.  When running under
 *    generic_make_request() (current->bio_list set) and sectors remain
 *    after the first piece, the processed front is split off and the
 *    remainder is resubmitted.
 *
 * Returns the generic_make_request() cookie when the remainder was
 * resubmitted, BLK_QC_T_NONE otherwise.
 */
static blk_qc_t __split_and_process_bio(struct mapped_device *md,
                                        struct dm_table *map, struct bio *bio)
{
        struct clone_info ci;
        blk_qc_t ret = BLK_QC_T_NONE;
        int error = 0;

        /* allocates ci.io; the final dec_pending() below pairs with this */
        init_clone_info(&ci, md, map, bio);

        if (bio->bi_opf & REQ_PREFLUSH) {
                struct bio flush_bio;

                /*
                 * Use an on-stack bio for this, it's safe since we don't
                 * need to reference it after submit. It's just used as
                 * the basis for the clone(s).
                 */
                bio_init(&flush_bio, NULL, 0);
                flush_bio.bi_opf = REQ_OP_WRITE | REQ_PREFLUSH | REQ_SYNC;
                ci.bio = &flush_bio;
                ci.sector_count = 0;
                error = __send_empty_flush(&ci);
                bio_uninit(ci.bio);
                /* dec_pending submits any data associated with flush */
        } else if (bio_op(bio) == REQ_OP_ZONE_RESET) {
                /* processed in one go, with a sector count of zero */
                ci.bio = bio;
                ci.sector_count = 0;
                error = __split_and_process_non_flush(&ci);
        } else {
                ci.bio = bio;
                ci.sector_count = bio_sectors(bio);
                while (ci.sector_count && !error) {
                        error = __split_and_process_non_flush(&ci);
                        if (current->bio_list && ci.sector_count && !error) {
                                /*
                                 * Remainder must be passed to generic_make_request()
                                 * so that it gets handled *after* bios already submitted
                                 * have been completely processed.
                                 * We take a clone of the original to store in
                                 * ci.io->orig_bio to be used by end_io_acct() and
                                 * for dec_pending to use for completion handling.
                                 */
                                /* b covers the sectors processed so far; bio keeps the rest */
                                struct bio *b = bio_split(bio, bio_sectors(bio) - ci.sector_count,
                                                          GFP_NOIO, &md->queue->bio_split);
                                ci.io->orig_bio = b;

                                /*
                                 * Adjust IO stats for each split, otherwise upon queue
                                 * reentry there will be redundant IO accounting.
                                 * NOTE: this is a stop-gap fix, a proper fix involves
                                 * significant refactoring of DM core's bio splitting
                                 * (by eliminating DM's splitting and just using bio_split)
                                 */
                                part_stat_lock();
                                __dm_part_stat_sub(&dm_disk(md)->part0,
                                                   sectors[op_stat_group(bio_op(bio))], ci.sector_count);
                                part_stat_unlock();

                                bio_chain(b, bio);
                                trace_block_split(md->queue, b, bio->bi_iter.bi_sector);
                                ret = generic_make_request(bio);
                                break;
                        }
                }
        }

        /* drop the extra reference count */
        dec_pending(ci.io, errno_to_blk_status(error));
        return ret;
}
1725
1726 static blk_qc_t dm_process_bio(struct mapped_device *md,
1727                                struct dm_table *map, struct bio *bio)
1728 {
1729         blk_qc_t ret = BLK_QC_T_NONE;
1730         struct dm_target *ti = md->immutable_target;
1731
1732         if (unlikely(!map)) {
1733                 bio_io_error(bio);
1734                 return ret;
1735         }
1736
1737         if (!ti) {
1738                 ti = dm_table_find_target(map, bio->bi_iter.bi_sector);
1739                 if (unlikely(!ti)) {
1740                         bio_io_error(bio);
1741                         return ret;
1742                 }
1743         }
1744
1745         /*
1746          * If in ->make_request_fn we need to use blk_queue_split(), otherwise
1747          * queue_limits for abnormal requests (e.g. discard, writesame, etc)
1748          * won't be imposed.
1749          */
1750         if (current->bio_list) {
1751                 if (is_abnormal_io(bio))
1752                         blk_queue_split(md->queue, &bio);
1753                 /* regular IO is split by __split_and_process_bio */
1754         }
1755
1756         return __split_and_process_bio(md, map, bio);
1757 }
1758
1759 static blk_qc_t dm_make_request(struct request_queue *q, struct bio *bio)
1760 {
1761         struct mapped_device *md = q->queuedata;
1762         blk_qc_t ret = BLK_QC_T_NONE;
1763         int srcu_idx;
1764         struct dm_table *map;
1765
1766         map = dm_get_live_table(md, &srcu_idx);
1767
1768         /* if we're suspended, we have to queue this io for later */
1769         if (unlikely(test_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags))) {
1770                 dm_put_live_table(md, srcu_idx);
1771
1772                 if (!(bio->bi_opf & REQ_RAHEAD))
1773                         queue_io(md, bio);
1774                 else
1775                         bio_io_error(bio);
1776                 return ret;
1777         }
1778
1779         ret = dm_process_bio(md, map, bio);
1780
1781         dm_put_live_table(md, srcu_idx);
1782         return ret;
1783 }
1784
1785 static int dm_any_congested(void *congested_data, int bdi_bits)
1786 {
1787         int r = bdi_bits;
1788         struct mapped_device *md = congested_data;
1789         struct dm_table *map;
1790
1791         if (!test_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags)) {
1792                 if (dm_request_based(md)) {
1793                         /*
1794                          * With request-based DM we only need to check the
1795                          * top-level queue for congestion.
1796                          */
1797                         struct backing_dev_info *bdi = md->queue->backing_dev_info;
1798                         r = bdi->wb.congested->state & bdi_bits;
1799                 } else {
1800                         map = dm_get_live_table_fast(md);
1801                         if (map)
1802                                 r = dm_table_any_congested(map, bdi_bits);
1803                         dm_put_live_table_fast(md);
1804                 }
1805         }
1806
1807         return r;
1808 }
1809
1810 /*-----------------------------------------------------------------
1811  * An IDR is used to keep track of allocated minor numbers.
1812  *---------------------------------------------------------------*/
1813 static void free_minor(int minor)
1814 {
1815         spin_lock(&_minor_lock);
1816         idr_remove(&_minor_idr, minor);
1817         spin_unlock(&_minor_lock);
1818 }
1819
1820 /*
1821  * See if the device with a specific minor # is free.
1822  */
1823 static int specific_minor(int minor)
1824 {
1825         int r;
1826
1827         if (minor >= (1 << MINORBITS))
1828                 return -EINVAL;
1829
1830         idr_preload(GFP_KERNEL);
1831         spin_lock(&_minor_lock);
1832
1833         r = idr_alloc(&_minor_idr, MINOR_ALLOCED, minor, minor + 1, GFP_NOWAIT);
1834
1835         spin_unlock(&_minor_lock);
1836         idr_preload_end();
1837         if (r < 0)
1838                 return r == -ENOSPC ? -EBUSY : r;
1839         return 0;
1840 }
1841
1842 static int next_free_minor(int *minor)
1843 {
1844         int r;
1845
1846         idr_preload(GFP_KERNEL);
1847         spin_lock(&_minor_lock);
1848
1849         r = idr_alloc(&_minor_idr, MINOR_ALLOCED, 0, 1 << MINORBITS, GFP_NOWAIT);
1850
1851         spin_unlock(&_minor_lock);
1852         idr_preload_end();
1853         if (r < 0)
1854                 return r;
1855         *minor = r;
1856         return 0;
1857 }
1858
/*
 * Forward declarations: alloc_dev() below references these before their
 * definitions appear later in the file.
 */
static const struct block_device_operations dm_blk_dops;
static const struct dax_operations dm_dax_ops;

static void dm_wq_work(struct work_struct *work);
1863
/*
 * Release the resources attached to @md, without freeing @md itself or
 * its minor number.
 *
 * Pointer-held resources (wq, dax_dev, disk, queue, bdev) are only
 * released when the pointer is set, which lets alloc_dev() call this on
 * a partially constructed device from its error path.  free_dev() calls
 * it for a fully constructed device.
 */
static void cleanup_mapped_device(struct mapped_device *md)
{
        if (md->wq)
                destroy_workqueue(md->wq);
        bioset_exit(&md->bs);
        bioset_exit(&md->io_bs);

        if (md->dax_dev) {
                kill_dax(md->dax_dev);
                put_dax(md->dax_dev);
                md->dax_dev = NULL;
        }

        if (md->disk) {
                /* Clear the disk -> md back-pointer under _minor_lock before removing the disk. */
                spin_lock(&_minor_lock);
                md->disk->private_data = NULL;
                spin_unlock(&_minor_lock);
                del_gendisk(md->disk);
                put_disk(md->disk);
        }

        if (md->queue)
                blk_cleanup_queue(md->queue);

        cleanup_srcu_struct(&md->io_barrier);

        if (md->bdev) {
                bdput(md->bdev);
                md->bdev = NULL;
        }

        mutex_destroy(&md->suspend_lock);
        mutex_destroy(&md->type_lock);
        mutex_destroy(&md->table_devices_lock);
        mutex_destroy(&md->swap_bios_lock);

        dm_mq_cleanup_mapped_device(md);
}
1902
/*
 * Allocate and initialise a blank device with a given minor.
 *
 * @minor: the minor number to use, or DM_ANY_MINOR to take the next
 *         free one.
 *
 * Takes a reference on this module and reserves the minor in _minor_idr
 * with a MINOR_ALLOCED placeholder; the placeholder is only replaced by
 * the real pointer as the very last step, once everything else has
 * succeeded.
 *
 * Returns the new mapped_device (with md->holders set to 1), or NULL on
 * any failure.  The specific error code is not propagated to the caller.
 */
static struct mapped_device *alloc_dev(int minor)
{
        int r, numa_node_id = dm_get_numa_node();
        struct mapped_device *md;
        void *old_md;

        md = kvzalloc_node(sizeof(*md), GFP_KERNEL, numa_node_id);
        if (!md) {
                DMWARN("unable to allocate device, out of memory.");
                return NULL;
        }

        if (!try_module_get(THIS_MODULE))
                goto bad_module_get;

        /* get a minor number for the dev */
        if (minor == DM_ANY_MINOR)
                r = next_free_minor(&minor);
        else
                r = specific_minor(minor);
        if (r < 0)
                goto bad_minor;

        r = init_srcu_struct(&md->io_barrier);
        if (r < 0)
                goto bad_io_barrier;

        md->numa_node_id = numa_node_id;
        md->init_tio_pdu = false;
        md->type = DM_TYPE_NONE;
        mutex_init(&md->suspend_lock);
        mutex_init(&md->type_lock);
        mutex_init(&md->table_devices_lock);
        spin_lock_init(&md->deferred_lock);
        atomic_set(&md->holders, 1);
        atomic_set(&md->open_count, 0);
        atomic_set(&md->event_nr, 0);
        atomic_set(&md->uevent_seq, 0);
        INIT_LIST_HEAD(&md->uevent_list);
        INIT_LIST_HEAD(&md->table_devices);
        spin_lock_init(&md->uevent_lock);

        /*
         * From here on failures go to 'bad', which relies on
         * cleanup_mapped_device() skipping whatever is still NULL.
         */
        md->queue = blk_alloc_queue_node(GFP_KERNEL, numa_node_id);
        if (!md->queue)
                goto bad;
        md->queue->queuedata = md;
        /*
         * default to bio-based required ->make_request_fn until DM
         * table is loaded and md->type established. If request-based
         * table is loaded: blk-mq will override accordingly.
         */
        blk_queue_make_request(md->queue, dm_make_request);

        md->disk = alloc_disk_node(1, md->numa_node_id);
        if (!md->disk)
                goto bad;

        init_waitqueue_head(&md->wait);
        INIT_WORK(&md->work, dm_wq_work);
        init_waitqueue_head(&md->eventq);
        init_completion(&md->kobj_holder.completion);

        md->swap_bios = get_swap_bios();
        sema_init(&md->swap_bios_semaphore, md->swap_bios);
        mutex_init(&md->swap_bios_lock);

        md->disk->major = _major;
        md->disk->first_minor = minor;
        md->disk->fops = &dm_blk_dops;
        md->disk->queue = md->queue;
        md->disk->private_data = md;
        sprintf(md->disk->disk_name, "dm-%d", minor);

        if (IS_ENABLED(CONFIG_DAX_DRIVER)) {
                md->dax_dev = alloc_dax(md, md->disk->disk_name,
                                        &dm_dax_ops, 0);
                if (!md->dax_dev)
                        goto bad;
        }

        add_disk_no_queue_reg(md->disk);
        format_dev_t(md->name, MKDEV(_major, minor));

        md->wq = alloc_workqueue("kdmflush", WQ_MEM_RECLAIM, 0);
        if (!md->wq)
                goto bad;

        md->bdev = bdget_disk(md->disk, 0);
        if (!md->bdev)
                goto bad;

        r = dm_stats_init(&md->stats);
        if (r < 0)
                goto bad;

        /* Populate the mapping, nobody knows we exist yet */
        spin_lock(&_minor_lock);
        old_md = idr_replace(&_minor_idr, md, minor);
        spin_unlock(&_minor_lock);

        /* The slot must still hold the placeholder reserved above. */
        BUG_ON(old_md != MINOR_ALLOCED);

        return md;

        /* Error unwind: each label undoes the steps completed before its goto. */
bad:
        cleanup_mapped_device(md);
bad_io_barrier:
        free_minor(minor);
bad_minor:
        module_put(THIS_MODULE);
bad_module_get:
        kvfree(md);
        return NULL;
}
2020
/* Defined further down; needed by free_dev(). */
static void unlock_fs(struct mapped_device *md);

/*
 * Tear down and free a mapped_device created by alloc_dev().
 *
 * The minor is read from the disk up front because
 * cleanup_mapped_device() drops the disk.  Any filesystem freeze taken
 * through lock_fs() is thawed first.  The module reference taken in
 * alloc_dev() is dropped here.
 */
static void free_dev(struct mapped_device *md)
{
        int minor = MINOR(disk_devt(md->disk));

        unlock_fs(md);

        cleanup_mapped_device(md);

        free_table_devices(&md->table_devices);
        dm_stats_cleanup(&md->stats);
        free_minor(minor);

        module_put(THIS_MODULE);
        kvfree(md);
}
2038
2039 static int __bind_mempools(struct mapped_device *md, struct dm_table *t)
2040 {
2041         struct dm_md_mempools *p = dm_table_get_md_mempools(t);
2042         int ret = 0;
2043
2044         if (dm_table_bio_based(t)) {
2045                 /*
2046                  * The md may already have mempools that need changing.
2047                  * If so, reload bioset because front_pad may have changed
2048                  * because a different table was loaded.
2049                  */
2050                 bioset_exit(&md->bs);
2051                 bioset_exit(&md->io_bs);
2052
2053         } else if (bioset_initialized(&md->bs)) {
2054                 /*
2055                  * There's no need to reload with request-based dm
2056                  * because the size of front_pad doesn't change.
2057                  * Note for future: If you are to reload bioset,
2058                  * prep-ed requests in the queue may refer
2059                  * to bio from the old bioset, so you must walk
2060                  * through the queue to unprep.
2061                  */
2062                 goto out;
2063         }
2064
2065         BUG_ON(!p ||
2066                bioset_initialized(&md->bs) ||
2067                bioset_initialized(&md->io_bs));
2068
2069         ret = bioset_init_from_src(&md->bs, &p->bs);
2070         if (ret)
2071                 goto out;
2072         ret = bioset_init_from_src(&md->io_bs, &p->io_bs);
2073         if (ret)
2074                 bioset_exit(&md->bs);
2075 out:
2076         /* mempool bind completed, no longer need any mempools in the table */
2077         dm_table_free_md_mempools(t);
2078         return ret;
2079 }
2080
2081 /*
2082  * Bind a table to the device.
2083  */
2084 static void event_callback(void *context)
2085 {
2086         unsigned long flags;
2087         LIST_HEAD(uevents);
2088         struct mapped_device *md = (struct mapped_device *) context;
2089
2090         spin_lock_irqsave(&md->uevent_lock, flags);
2091         list_splice_init(&md->uevent_list, &uevents);
2092         spin_unlock_irqrestore(&md->uevent_lock, flags);
2093
2094         dm_send_uevents(&uevents, &disk_to_dev(md->disk)->kobj);
2095
2096         atomic_inc(&md->event_nr);
2097         wake_up(&md->eventq);
2098         dm_issue_global_event();
2099 }
2100
/*
 * Set the device size to @size sectors, updating both the gendisk
 * capacity and the byte size of the block device inode.
 *
 * Protected by md->suspend_lock obtained by dm_swap_table().
 */
static void __set_size(struct mapped_device *md, sector_t size)
{
        lockdep_assert_held(&md->suspend_lock);

        set_capacity(md->disk, size);

        /* The inode size is in bytes, so convert from sectors. */
        i_size_write(md->bdev->bd_inode, (loff_t)size << SECTOR_SHIFT);
}
2112
2113 /*
2114  * Returns old map, which caller must destroy.
2115  */
2116 static struct dm_table *__bind(struct mapped_device *md, struct dm_table *t,
2117                                struct queue_limits *limits)
2118 {
2119         struct dm_table *old_map;
2120         struct request_queue *q = md->queue;
2121         bool request_based = dm_table_request_based(t);
2122         sector_t size;
2123         int ret;
2124
2125         lockdep_assert_held(&md->suspend_lock);
2126
2127         size = dm_table_get_size(t);
2128
2129         /*
2130          * Wipe any geometry if the size of the table changed.
2131          */
2132         if (size != dm_get_size(md))
2133                 memset(&md->geometry, 0, sizeof(md->geometry));
2134
2135         __set_size(md, size);
2136
2137         dm_table_event_callback(t, event_callback, md);
2138
2139         /*
2140          * The queue hasn't been stopped yet, if the old table type wasn't
2141          * for request-based during suspension.  So stop it to prevent
2142          * I/O mapping before resume.
2143          * This must be done before setting the queue restrictions,
2144          * because request-based dm may be run just after the setting.
2145          */
2146         if (request_based)
2147                 dm_stop_queue(q);
2148
2149         if (request_based) {
2150                 /*
2151                  * Leverage the fact that request-based DM targets are
2152                  * immutable singletons - used to optimize dm_mq_queue_rq.
2153                  */
2154                 md->immutable_target = dm_table_get_immutable_target(t);
2155         }
2156
2157         ret = __bind_mempools(md, t);
2158         if (ret) {
2159                 old_map = ERR_PTR(ret);
2160                 goto out;
2161         }
2162
2163         old_map = rcu_dereference_protected(md->map, lockdep_is_held(&md->suspend_lock));
2164         rcu_assign_pointer(md->map, (void *)t);
2165         md->immutable_target_type = dm_table_get_immutable_target_type(t);
2166
2167         dm_table_set_restrictions(t, q, limits);
2168         if (old_map)
2169                 dm_sync_table(md);
2170
2171 out:
2172         return old_map;
2173 }
2174
2175 /*
2176  * Returns unbound table for the caller to free.
2177  */
2178 static struct dm_table *__unbind(struct mapped_device *md)
2179 {
2180         struct dm_table *map = rcu_dereference_protected(md->map, 1);
2181
2182         if (!map)
2183                 return NULL;
2184
2185         dm_table_event_callback(map, NULL, NULL);
2186         RCU_INIT_POINTER(md->map, NULL);
2187         dm_sync_table(md);
2188
2189         return map;
2190 }
2191
2192 /*
2193  * Constructor for a new device.
2194  */
2195 int dm_create(int minor, struct mapped_device **result)
2196 {
2197         int r;
2198         struct mapped_device *md;
2199
2200         md = alloc_dev(minor);
2201         if (!md)
2202                 return -ENXIO;
2203
2204         r = dm_sysfs_init(md);
2205         if (r) {
2206                 free_dev(md);
2207                 return r;
2208         }
2209
2210         *result = md;
2211         return 0;
2212 }
2213
2214 /*
2215  * Functions to manage md->type.
2216  * All are required to hold md->type_lock.
2217  */
/* Acquire md->type_lock; pair with dm_unlock_md_type(). */
void dm_lock_md_type(struct mapped_device *md)
{
        mutex_lock(&md->type_lock);
}
2222
/* Release md->type_lock taken by dm_lock_md_type(). */
void dm_unlock_md_type(struct mapped_device *md)
{
        mutex_unlock(&md->type_lock);
}
2227
/*
 * Set the queue mode of @md.  md->type_lock must be locked; this is
 * enforced with BUG_ON().
 */
void dm_set_md_type(struct mapped_device *md, enum dm_queue_mode type)
{
        BUG_ON(!mutex_is_locked(&md->type_lock));
        md->type = type;
}
2233
/*
 * Return the queue mode of @md.  Unlike dm_set_md_type(), this does not
 * itself check that md->type_lock is held.
 */
enum dm_queue_mode dm_get_md_type(struct mapped_device *md)
{
        return md->type;
}
2238
/*
 * Return the immutable target type recorded for @md by __bind() when a
 * table was last bound.
 */
struct target_type *dm_get_immutable_target_type(struct mapped_device *md)
{
        return md->immutable_target_type;
}
2243
/*
 * Return a pointer to the limits embedded in @md's request queue.
 *
 * The queue_limits are only valid as long as you have a reference
 * count on 'md'.  Calling this with no holders at all is a BUG.
 */
struct queue_limits *dm_get_queue_limits(struct mapped_device *md)
{
        BUG_ON(!atomic_read(&md->holders));
        return &md->queue->limits;
}
EXPORT_SYMBOL_GPL(dm_get_queue_limits);
2254
/*
 * Install dm_any_congested() as the congestion callback of @md's
 * backing_dev_info, with @md as its callback data.
 */
static void dm_init_congested_fn(struct mapped_device *md)
{
        md->queue->backing_dev_info->congested_data = md;
        md->queue->backing_dev_info->congested_fn = dm_any_congested;
}
2260
2261 /*
2262  * Setup the DM device's queue based on md's type
2263  */
2264 int dm_setup_md_queue(struct mapped_device *md, struct dm_table *t)
2265 {
2266         int r;
2267         struct queue_limits limits;
2268         enum dm_queue_mode type = dm_get_md_type(md);
2269
2270         switch (type) {
2271         case DM_TYPE_REQUEST_BASED:
2272                 r = dm_mq_init_request_queue(md, t);
2273                 if (r) {
2274                         DMERR("Cannot initialize queue for request-based dm-mq mapped device");
2275                         return r;
2276                 }
2277                 dm_init_congested_fn(md);
2278                 break;
2279         case DM_TYPE_BIO_BASED:
2280         case DM_TYPE_DAX_BIO_BASED:
2281                 dm_init_congested_fn(md);
2282                 break;
2283         case DM_TYPE_NONE:
2284                 WARN_ON_ONCE(true);
2285                 break;
2286         }
2287
2288         r = dm_calculate_queue_limits(t, &limits);
2289         if (r) {
2290                 DMERR("Cannot calculate initial queue limits");
2291                 return r;
2292         }
2293         dm_table_set_restrictions(t, md->queue, &limits);
2294         blk_register_queue(md->disk);
2295
2296         return 0;
2297 }
2298
2299 struct mapped_device *dm_get_md(dev_t dev)
2300 {
2301         struct mapped_device *md;
2302         unsigned minor = MINOR(dev);
2303
2304         if (MAJOR(dev) != _major || minor >= (1 << MINORBITS))
2305                 return NULL;
2306
2307         spin_lock(&_minor_lock);
2308
2309         md = idr_find(&_minor_idr, minor);
2310         if (!md || md == MINOR_ALLOCED || (MINOR(disk_devt(dm_disk(md))) != minor) ||
2311             test_bit(DMF_FREEING, &md->flags) || dm_deleting_md(md)) {
2312                 md = NULL;
2313                 goto out;
2314         }
2315         dm_get(md);
2316 out:
2317         spin_unlock(&_minor_lock);
2318
2319         return md;
2320 }
2321 EXPORT_SYMBOL_GPL(dm_get_md);
2322
/* Return the opaque pointer stored on @md by dm_set_mdptr(). */
void *dm_get_mdptr(struct mapped_device *md)
{
        return md->interface_ptr;
}
2327
/* Store an opaque pointer on @md; retrieved with dm_get_mdptr(). */
void dm_set_mdptr(struct mapped_device *md, void *ptr)
{
        md->interface_ptr = ptr;
}
2332
/*
 * Take a reference on @md by incrementing md->holders.
 *
 * It is a BUG to call this on a device that already has DMF_FREEING
 * set; use dm_hold() when that cannot be ruled out.
 */
void dm_get(struct mapped_device *md)
{
        atomic_inc(&md->holders);
        BUG_ON(test_bit(DMF_FREEING, &md->flags));
}
2338
2339 int dm_hold(struct mapped_device *md)
2340 {
2341         spin_lock(&_minor_lock);
2342         if (test_bit(DMF_FREEING, &md->flags)) {
2343                 spin_unlock(&_minor_lock);
2344                 return -EBUSY;
2345         }
2346         dm_get(md);
2347         spin_unlock(&_minor_lock);
2348         return 0;
2349 }
2350 EXPORT_SYMBOL_GPL(dm_hold);
2351
/*
 * Return the name stored in md->name, which alloc_dev() fills in from
 * the device's major/minor using format_dev_t().
 */
const char *dm_device_name(struct mapped_device *md)
{
        return md->name;
}
EXPORT_SYMBOL_GPL(dm_device_name);
2357
/*
 * Destroy @md.
 *
 * @wait: if true, sleep until md->holders drops to zero before freeing;
 *        if false, only warn when holders remain and free regardless.
 *
 * May sleep.  The minor's IDR slot is switched back to the MINOR_ALLOCED
 * placeholder and DMF_FREEING is set under _minor_lock, after which
 * dm_get_md() and dm_hold() refuse to hand out new references.
 */
static void __dm_destroy(struct mapped_device *md, bool wait)
{
        struct dm_table *map;
        int srcu_idx;

        might_sleep();

        spin_lock(&_minor_lock);
        idr_replace(&_minor_idr, MINOR_ALLOCED, MINOR(disk_devt(dm_disk(md))));
        set_bit(DMF_FREEING, &md->flags);
        spin_unlock(&_minor_lock);

        blk_set_queue_dying(md->queue);

        /*
         * Take suspend_lock so that presuspend and postsuspend methods
         * do not race with internal suspend.
         */
        mutex_lock(&md->suspend_lock);
        map = dm_get_live_table(md, &srcu_idx);
        if (!dm_suspended_md(md)) {
                dm_table_presuspend_targets(map);
                set_bit(DMF_SUSPENDED, &md->flags);
                set_bit(DMF_POST_SUSPENDING, &md->flags);
                dm_table_postsuspend_targets(map);
        }
        /* dm_put_live_table must be before msleep, otherwise deadlock is possible */
        dm_put_live_table(md, srcu_idx);
        mutex_unlock(&md->suspend_lock);

        /*
         * Rare, but there may be I/O requests still going to complete,
         * for example.  Wait for all references to disappear.
         * No one should increment the reference count of the mapped_device,
         * after the mapped_device state becomes DMF_FREEING.
         */
        if (wait)
                while (atomic_read(&md->holders))
                        msleep(1);
        else if (atomic_read(&md->holders))
                DMWARN("%s: Forcibly removing mapped_device still in use! (%d users)",
                       dm_device_name(md), atomic_read(&md->holders));

        dm_sysfs_exit(md);
        dm_table_destroy(__unbind(md));
        free_dev(md);
}
2405
/* Destroy @md, waiting for all holders to drop their references first. */
void dm_destroy(struct mapped_device *md)
{
        __dm_destroy(md, true);
}
2410
/*
 * Destroy @md without waiting for holders; a warning is logged if any
 * references remain.
 */
void dm_destroy_immediate(struct mapped_device *md)
{
        __dm_destroy(md, false);
}
2415
/*
 * Drop a reference on @md by decrementing md->holders.  This never
 * frees the device itself; freeing is done by __dm_destroy().
 */
void dm_put(struct mapped_device *md)
{
        atomic_dec(&md->holders);
}
EXPORT_SYMBOL_GPL(dm_put);
2421
/*
 * Sleep on md->wait until md_in_flight() reports no I/O in flight.
 *
 * @task_state: task state to sleep in, e.g. TASK_INTERRUPTIBLE or
 *              TASK_UNINTERRUPTIBLE.
 *
 * Returns 0 once nothing is in flight, or -EINTR if a signal relevant to
 * @task_state became pending while I/O was still in flight.
 */
static int dm_wait_for_completion(struct mapped_device *md, long task_state)
{
        int r = 0;
        DEFINE_WAIT(wait);

        while (1) {
                /* Queue ourselves before checking, so a wakeup in between is not lost. */
                prepare_to_wait(&md->wait, &wait, task_state);

                if (!md_in_flight(md))
                        break;

                if (signal_pending_state(task_state, current)) {
                        r = -EINTR;
                        break;
                }

                io_schedule();
        }
        finish_wait(&md->wait, &wait);

        /* NOTE(review): presumably pairs with a barrier on the I/O completion side -- confirm. */
        smp_rmb();

        return r;
}
2446
2447 /*
2448  * Process the deferred bios
2449  */
2450 static void dm_wq_work(struct work_struct *work)
2451 {
2452         struct mapped_device *md = container_of(work, struct mapped_device,
2453                                                 work);
2454         struct bio *c;
2455         int srcu_idx;
2456         struct dm_table *map;
2457
2458         map = dm_get_live_table(md, &srcu_idx);
2459
2460         while (!test_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags)) {
2461                 spin_lock_irq(&md->deferred_lock);
2462                 c = bio_list_pop(&md->deferred);
2463                 spin_unlock_irq(&md->deferred_lock);
2464
2465                 if (!c)
2466                         break;
2467
2468                 if (dm_request_based(md))
2469                         (void) generic_make_request(c);
2470                 else
2471                         (void) dm_process_bio(md, map, c);
2472         }
2473
2474         dm_put_live_table(md, srcu_idx);
2475 }
2476
/*
 * Allow deferred I/O to flow again: clear DMF_BLOCK_IO_FOR_SUSPEND and
 * queue md->work (dm_wq_work) on md->wq to drain md->deferred.
 */
static void dm_queue_flush(struct mapped_device *md)
{
        clear_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags);
        /* Order the flag clear before queueing the work that tests it. */
        smp_mb__after_atomic();
        queue_work(md->wq, &md->work);
}
2483
/*
 * Swap in a new table, returning the old one for the caller to destroy.
 *
 * The device must already be suspended; otherwise ERR_PTR(-EINVAL) is
 * returned.  An ERR_PTR() is also returned if the queue limits cannot be
 * calculated or if __bind() fails.  A NULL return means the swap
 * succeeded and there was no previous table.
 */
struct dm_table *dm_swap_table(struct mapped_device *md, struct dm_table *table)
{
        struct dm_table *live_map = NULL, *map = ERR_PTR(-EINVAL);
        struct queue_limits limits;
        int r;

        mutex_lock(&md->suspend_lock);

        /* device must be suspended */
        if (!dm_suspended_md(md))
                goto out;

        /*
         * If the new table has no data devices, retain the existing limits.
         * This helps multipath with queue_if_no_path if all paths disappear,
         * then new I/O is queued based on these limits, and then some paths
         * reappear.
         */
        if (dm_table_has_no_data_devices(table)) {
                live_map = dm_get_live_table_fast(md);
                if (live_map)
                        limits = md->queue->limits;
                dm_put_live_table_fast(md);
        }

        /* live_map is only used as a "limits already copied" marker from here on. */
        if (!live_map) {
                r = dm_calculate_queue_limits(table, &limits);
                if (r) {
                        map = ERR_PTR(r);
                        goto out;
                }
        }

        map = __bind(md, table, &limits);
        dm_issue_global_event();

out:
        mutex_unlock(&md->suspend_lock);
        return map;
}
2527
2528 /*
2529  * Functions to lock and unlock any filesystem running on the
2530  * device.
2531  */
2532 static int lock_fs(struct mapped_device *md)
2533 {
2534         int r;
2535
2536         WARN_ON(md->frozen_sb);
2537
2538         md->frozen_sb = freeze_bdev(md->bdev);
2539         if (IS_ERR(md->frozen_sb)) {
2540                 r = PTR_ERR(md->frozen_sb);
2541                 md->frozen_sb = NULL;
2542                 return r;
2543         }
2544
2545         set_bit(DMF_FROZEN, &md->flags);
2546
2547         return 0;
2548 }
2549
2550 static void unlock_fs(struct mapped_device *md)
2551 {
2552         if (!test_bit(DMF_FROZEN, &md->flags))
2553                 return;
2554
2555         thaw_bdev(md->bdev, md->frozen_sb);
2556         md->frozen_sb = NULL;
2557         clear_bit(DMF_FROZEN, &md->flags);
2558 }
2559
/*
 * Common suspend path; must be called with md->suspend_lock held.
 *
 * @map: the live table, may be NULL
 * @suspend_flags: DM_SUSPEND_LOCKFS_FLAG and/or DM_SUSPEND_NOFLUSH_FLAG
 * @task_state: e.g. TASK_INTERRUPTIBLE or TASK_UNINTERRUPTIBLE
 * @dmf_suspended_flag: DMF_SUSPENDED or DMF_SUSPENDED_INTERNALLY
 *
 * If __dm_suspend returns 0, the device is completely quiescent
 * now. There is no request-processing activity. All new requests
 * are being added to md->deferred list.
 *
 * On a non-zero return the suspend has been rolled back: deferred I/O is
 * flushed, the queue restarted (request-based), the filesystem thawed
 * and presuspend undone.
 */
static int __dm_suspend(struct mapped_device *md, struct dm_table *map,
                        unsigned suspend_flags, long task_state,
                        int dmf_suspended_flag)
{
        bool do_lockfs = suspend_flags & DM_SUSPEND_LOCKFS_FLAG;
        bool noflush = suspend_flags & DM_SUSPEND_NOFLUSH_FLAG;
        int r;

        lockdep_assert_held(&md->suspend_lock);

        /*
         * DMF_NOFLUSH_SUSPENDING must be set before presuspend.
         * This flag is cleared before dm_suspend returns.
         */
        if (noflush)
                set_bit(DMF_NOFLUSH_SUSPENDING, &md->flags);
        else
                pr_debug("%s: suspending with flush\n", dm_device_name(md));

        /*
         * This gets reverted if there's an error later and the targets
         * provide the .presuspend_undo hook.
         */
        dm_table_presuspend_targets(map);

        /*
         * Flush I/O to the device.
         * Any I/O submitted after lock_fs() may not be flushed.
         * noflush takes precedence over do_lockfs.
         * (lock_fs() flushes I/Os and waits for them to complete.)
         */
        if (!noflush && do_lockfs) {
                r = lock_fs(md);
                if (r) {
                        dm_table_presuspend_undo_targets(map);
                        return r;
                }
        }

        /*
         * Here we must make sure that no processes are submitting requests
         * to target drivers i.e. no one may be executing
         * __split_and_process_bio. This is called from dm_request and
         * dm_wq_work.
         *
         * To get all processes out of __split_and_process_bio in dm_request,
         * we take the write lock. To prevent any process from reentering
         * __split_and_process_bio from dm_request and quiesce the thread
         * (dm_wq_work), we set DMF_BLOCK_IO_FOR_SUSPEND and call
         * flush_workqueue(md->wq).
         */
        set_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags);
        if (map)
                synchronize_srcu(&md->io_barrier);

        /*
         * Stop md->queue before flushing md->wq in case request-based
         * dm defers requests to md->wq from md->queue.
         */
        if (dm_request_based(md))
                dm_stop_queue(md->queue);

        flush_workqueue(md->wq);

        /*
         * At this point no more requests are entering target request routines.
         * We call dm_wait_for_completion to wait for all existing requests
         * to finish.
         */
        r = dm_wait_for_completion(md, task_state);
        if (!r)
                set_bit(dmf_suspended_flag, &md->flags);

        if (noflush)
                clear_bit(DMF_NOFLUSH_SUSPENDING, &md->flags);
        if (map)
                synchronize_srcu(&md->io_barrier);

        /* were we interrupted ? */
        if (r < 0) {
                dm_queue_flush(md);

                if (dm_request_based(md))
                        dm_start_queue(md->queue);

                unlock_fs(md);
                dm_table_presuspend_undo_targets(map);
                /* pushback list is already flushed, so skip flush */
        }

        return r;
}
2661
/*
 * We need to be able to change a mapping table under a mounted
 * filesystem.  For example we might want to move some data in
 * the background.  Before the table can be swapped with
 * dm_swap_table, dm_suspend must be called to flush any
 * in-flight bios and ensure that any further I/O gets deferred.
 */
2669 /*
2670  * Suspend mechanism in request-based dm.
2671  *
2672  * 1. Flush all I/Os by lock_fs() if needed.
2673  * 2. Stop dispatching any I/O by stopping the request_queue.
2674  * 3. Wait for all in-flight I/Os to be completed or requeued.
2675  *
2676  * To abort suspend, start the request_queue.
2677  */
2678 int dm_suspend(struct mapped_device *md, unsigned suspend_flags)
2679 {
2680         struct dm_table *map = NULL;
2681         int r = 0;
2682
2683 retry:
2684         mutex_lock_nested(&md->suspend_lock, SINGLE_DEPTH_NESTING);
2685
2686         if (dm_suspended_md(md)) {
2687                 r = -EINVAL;
2688                 goto out_unlock;
2689         }
2690
2691         if (dm_suspended_internally_md(md)) {
2692                 /* already internally suspended, wait for internal resume */
2693                 mutex_unlock(&md->suspend_lock);
2694                 r = wait_on_bit(&md->flags, DMF_SUSPENDED_INTERNALLY, TASK_INTERRUPTIBLE);
2695                 if (r)
2696                         return r;
2697                 goto retry;
2698         }
2699
2700         map = rcu_dereference_protected(md->map, lockdep_is_held(&md->suspend_lock));
2701
2702         r = __dm_suspend(md, map, suspend_flags, TASK_INTERRUPTIBLE, DMF_SUSPENDED);
2703         if (r)
2704                 goto out_unlock;
2705
2706         set_bit(DMF_POST_SUSPENDING, &md->flags);
2707         dm_table_postsuspend_targets(map);
2708         clear_bit(DMF_POST_SUSPENDING, &md->flags);
2709
2710 out_unlock:
2711         mutex_unlock(&md->suspend_lock);
2712         return r;
2713 }
2714
2715 static int __dm_resume(struct mapped_device *md, struct dm_table *map)
2716 {
2717         if (map) {
2718                 int r = dm_table_resume_targets(map);
2719                 if (r)
2720                         return r;
2721         }
2722
2723         dm_queue_flush(md);
2724
2725         /*
2726          * Flushing deferred I/Os must be done after targets are resumed
2727          * so that mapping of targets can work correctly.
2728          * Request-based dm is queueing the deferred I/Os in its request_queue.
2729          */
2730         if (dm_request_based(md))
2731                 dm_start_queue(md->queue);
2732
2733         unlock_fs(md);
2734
2735         return 0;
2736 }
2737
2738 int dm_resume(struct mapped_device *md)
2739 {
2740         int r;
2741         struct dm_table *map = NULL;
2742
2743 retry:
2744         r = -EINVAL;
2745         mutex_lock_nested(&md->suspend_lock, SINGLE_DEPTH_NESTING);
2746
2747         if (!dm_suspended_md(md))
2748                 goto out;
2749
2750         if (dm_suspended_internally_md(md)) {
2751                 /* already internally suspended, wait for internal resume */
2752                 mutex_unlock(&md->suspend_lock);
2753                 r = wait_on_bit(&md->flags, DMF_SUSPENDED_INTERNALLY, TASK_INTERRUPTIBLE);
2754                 if (r)
2755                         return r;
2756                 goto retry;
2757         }
2758
2759         map = rcu_dereference_protected(md->map, lockdep_is_held(&md->suspend_lock));
2760         if (!map || !dm_table_get_size(map))
2761                 goto out;
2762
2763         r = __dm_resume(md, map);
2764         if (r)
2765                 goto out;
2766
2767         clear_bit(DMF_SUSPENDED, &md->flags);
2768 out:
2769         mutex_unlock(&md->suspend_lock);
2770
2771         return r;
2772 }
2773
2774 /*
2775  * Internal suspend/resume works like userspace-driven suspend. It waits
2776  * until all bios finish and prevents issuing new bios to the target drivers.
2777  * It may be used only from the kernel.
2778  */
2779
/*
 * Internally suspend @md; must be called with md->suspend_lock held.
 *
 * Calls nest: only the first call (internal_suspend_count going from 0
 * to 1) does any work.  If the device is already suspended by userspace,
 * only DMF_SUSPENDED_INTERNALLY is set on top of it.
 */
static void __dm_internal_suspend(struct mapped_device *md, unsigned suspend_flags)
{
        struct dm_table *map = NULL;

        lockdep_assert_held(&md->suspend_lock);

        if (md->internal_suspend_count++)
                return; /* nested internal suspend */

        if (dm_suspended_md(md)) {
                set_bit(DMF_SUSPENDED_INTERNALLY, &md->flags);
                return; /* nest suspend */
        }

        map = rcu_dereference_protected(md->map, lockdep_is_held(&md->suspend_lock));

        /*
         * Using TASK_UNINTERRUPTIBLE because only NOFLUSH internal suspend is
         * supported.  Properly supporting a TASK_INTERRUPTIBLE internal suspend
         * would require changing .presuspend to return an error -- avoid this
         * until there is a need for more elaborate variants of internal suspend.
         */
        (void) __dm_suspend(md, map, suspend_flags, TASK_UNINTERRUPTIBLE,
                            DMF_SUSPENDED_INTERNALLY);

        /* DMF_POST_SUSPENDING is set only for the duration of the postsuspend hooks. */
        set_bit(DMF_POST_SUSPENDING, &md->flags);
        dm_table_postsuspend_targets(map);
        clear_bit(DMF_POST_SUSPENDING, &md->flags);
}
2809
2810 static void __dm_internal_resume(struct mapped_device *md)
2811 {
2812         BUG_ON(!md->internal_suspend_count);
2813
2814         if (--md->internal_suspend_count)
2815                 return; /* resume from nested internal suspend */
2816
2817         if (dm_suspended_md(md))
2818                 goto done; /* resume from nested suspend */
2819
2820         /*
2821          * NOTE: existing callers don't need to call dm_table_resume_targets
2822          * (which may fail -- so best to avoid it for now by passing NULL map)
2823          */
2824         (void) __dm_resume(md, NULL);
2825
2826 done:
2827         clear_bit(DMF_SUSPENDED_INTERNALLY, &md->flags);
2828         smp_mb__after_atomic();
2829         wake_up_bit(&md->flags, DMF_SUSPENDED_INTERNALLY);
2830 }
2831
2832 void dm_internal_suspend_noflush(struct mapped_device *md)
2833 {
2834         mutex_lock(&md->suspend_lock);
2835         __dm_internal_suspend(md, DM_SUSPEND_NOFLUSH_FLAG);
2836         mutex_unlock(&md->suspend_lock);
2837 }
2838 EXPORT_SYMBOL_GPL(dm_internal_suspend_noflush);
2839
2840 void dm_internal_resume(struct mapped_device *md)
2841 {
2842         mutex_lock(&md->suspend_lock);
2843         __dm_internal_resume(md);
2844         mutex_unlock(&md->suspend_lock);
2845 }
2846 EXPORT_SYMBOL_GPL(dm_internal_resume);
2847
2848 /*
2849  * Fast variants of internal suspend/resume hold md->suspend_lock,
2850  * which prevents interaction with userspace-driven suspend.
2851  */
2852
2853 void dm_internal_suspend_fast(struct mapped_device *md)
2854 {
2855         mutex_lock(&md->suspend_lock);
2856         if (dm_suspended_md(md) || dm_suspended_internally_md(md))
2857                 return;
2858
2859         set_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags);
2860         synchronize_srcu(&md->io_barrier);
2861         flush_workqueue(md->wq);
2862         dm_wait_for_completion(md, TASK_UNINTERRUPTIBLE);
2863 }
2864 EXPORT_SYMBOL_GPL(dm_internal_suspend_fast);
2865
2866 void dm_internal_resume_fast(struct mapped_device *md)
2867 {
2868         if (dm_suspended_md(md) || dm_suspended_internally_md(md))
2869                 goto done;
2870
2871         dm_queue_flush(md);
2872
2873 done:
2874         mutex_unlock(&md->suspend_lock);
2875 }
2876 EXPORT_SYMBOL_GPL(dm_internal_resume_fast);
2877
2878 /*-----------------------------------------------------------------
2879  * Event notification.
2880  *---------------------------------------------------------------*/
2881 int dm_kobject_uevent(struct mapped_device *md, enum kobject_action action,
2882                        unsigned cookie)
2883 {
2884         int r;
2885         unsigned noio_flag;
2886         char udev_cookie[DM_COOKIE_LENGTH];
2887         char *envp[] = { udev_cookie, NULL };
2888
2889         noio_flag = memalloc_noio_save();
2890
2891         if (!cookie)
2892                 r = kobject_uevent(&disk_to_dev(md->disk)->kobj, action);
2893         else {
2894                 snprintf(udev_cookie, DM_COOKIE_LENGTH, "%s=%u",
2895                          DM_COOKIE_ENV_VAR_NAME, cookie);
2896                 r = kobject_uevent_env(&disk_to_dev(md->disk)->kobj,
2897                                        action, envp);
2898         }
2899
2900         memalloc_noio_restore(noio_flag);
2901
2902         return r;
2903 }
2904
2905 uint32_t dm_next_uevent_seq(struct mapped_device *md)
2906 {
2907         return atomic_add_return(1, &md->uevent_seq);
2908 }
2909
2910 uint32_t dm_get_event_nr(struct mapped_device *md)
2911 {
2912         return atomic_read(&md->event_nr);
2913 }
2914
2915 int dm_wait_event(struct mapped_device *md, int event_nr)
2916 {
2917         return wait_event_interruptible(md->eventq,
2918                         (event_nr != atomic_read(&md->event_nr)));
2919 }
2920
2921 void dm_uevent_add(struct mapped_device *md, struct list_head *elist)
2922 {
2923         unsigned long flags;
2924
2925         spin_lock_irqsave(&md->uevent_lock, flags);
2926         list_add(elist, &md->uevent_list);
2927         spin_unlock_irqrestore(&md->uevent_lock, flags);
2928 }
2929
2930 /*
2931  * The gendisk is only valid as long as you have a reference
2932  * count on 'md'.
2933  */
2934 struct gendisk *dm_disk(struct mapped_device *md)
2935 {
2936         return md->disk;
2937 }
2938 EXPORT_SYMBOL_GPL(dm_disk);
2939
2940 struct kobject *dm_kobject(struct mapped_device *md)
2941 {
2942         return &md->kobj_holder.kobj;
2943 }
2944
2945 struct mapped_device *dm_get_from_kobject(struct kobject *kobj)
2946 {
2947         struct mapped_device *md;
2948
2949         md = container_of(kobj, struct mapped_device, kobj_holder.kobj);
2950
2951         spin_lock(&_minor_lock);
2952         if (test_bit(DMF_FREEING, &md->flags) || dm_deleting_md(md)) {
2953                 md = NULL;
2954                 goto out;
2955         }
2956         dm_get(md);
2957 out:
2958         spin_unlock(&_minor_lock);
2959
2960         return md;
2961 }
2962
2963 int dm_suspended_md(struct mapped_device *md)
2964 {
2965         return test_bit(DMF_SUSPENDED, &md->flags);
2966 }
2967
2968 static int dm_post_suspending_md(struct mapped_device *md)
2969 {
2970         return test_bit(DMF_POST_SUSPENDING, &md->flags);
2971 }
2972
2973 int dm_suspended_internally_md(struct mapped_device *md)
2974 {
2975         return test_bit(DMF_SUSPENDED_INTERNALLY, &md->flags);
2976 }
2977
2978 int dm_test_deferred_remove_flag(struct mapped_device *md)
2979 {
2980         return test_bit(DMF_DEFERRED_REMOVE, &md->flags);
2981 }
2982
2983 int dm_suspended(struct dm_target *ti)
2984 {
2985         return dm_suspended_md(dm_table_get_md(ti->table));
2986 }
2987 EXPORT_SYMBOL_GPL(dm_suspended);
2988
2989 int dm_post_suspending(struct dm_target *ti)
2990 {
2991         return dm_post_suspending_md(dm_table_get_md(ti->table));
2992 }
2993 EXPORT_SYMBOL_GPL(dm_post_suspending);
2994
2995 int dm_noflush_suspending(struct dm_target *ti)
2996 {
2997         return __noflush_suspending(dm_table_get_md(ti->table));
2998 }
2999 EXPORT_SYMBOL_GPL(dm_noflush_suspending);
3000
3001 struct dm_md_mempools *dm_alloc_md_mempools(struct mapped_device *md, enum dm_queue_mode type,
3002                                             unsigned integrity, unsigned per_io_data_size,
3003                                             unsigned min_pool_size)
3004 {
3005         struct dm_md_mempools *pools = kzalloc_node(sizeof(*pools), GFP_KERNEL, md->numa_node_id);
3006         unsigned int pool_size = 0;
3007         unsigned int front_pad, io_front_pad;
3008         int ret;
3009
3010         if (!pools)
3011                 return NULL;
3012
3013         switch (type) {
3014         case DM_TYPE_BIO_BASED:
3015         case DM_TYPE_DAX_BIO_BASED:
3016                 pool_size = max(dm_get_reserved_bio_based_ios(), min_pool_size);
3017                 front_pad = roundup(per_io_data_size, __alignof__(struct dm_target_io)) + offsetof(struct dm_target_io, clone);
3018                 io_front_pad = roundup(front_pad,  __alignof__(struct dm_io)) + offsetof(struct dm_io, tio);
3019                 ret = bioset_init(&pools->io_bs, pool_size, io_front_pad, 0);
3020                 if (ret)
3021                         goto out;
3022                 if (integrity && bioset_integrity_create(&pools->io_bs, pool_size))
3023                         goto out;
3024                 break;
3025         case DM_TYPE_REQUEST_BASED:
3026                 pool_size = max(dm_get_reserved_rq_based_ios(), min_pool_size);
3027                 front_pad = offsetof(struct dm_rq_clone_bio_info, clone);
3028                 /* per_io_data_size is used for blk-mq pdu at queue allocation */
3029                 break;
3030         default:
3031                 BUG();
3032         }
3033
3034         ret = bioset_init(&pools->bs, pool_size, front_pad, 0);
3035         if (ret)
3036                 goto out;
3037
3038         if (integrity && bioset_integrity_create(&pools->bs, pool_size))
3039                 goto out;
3040
3041         return pools;
3042
3043 out:
3044         dm_free_md_mempools(pools);
3045
3046         return NULL;
3047 }
3048
3049 void dm_free_md_mempools(struct dm_md_mempools *pools)
3050 {
3051         if (!pools)
3052                 return;
3053
3054         bioset_exit(&pools->bs);
3055         bioset_exit(&pools->io_bs);
3056
3057         kfree(pools);
3058 }
3059
/*
 * Arguments of a persistent-reservation register request.  dm_pr_register()
 * fills one in and hands it to __dm_pr_register() through the void *data
 * argument of the iterate_devices callout.
 */
struct dm_pr {
        u64     old_key;        /* forwarded as old_key to pr_ops->pr_register */
        u64     new_key;        /* forwarded as new_key; 0 on the unregister pass */
        u32     flags;          /* forwarded as flags to pr_ops->pr_register */
        bool    fail_early;     /* set by dm_pr_register(); not read in the code visible here */
};
3066
3067 static int dm_call_pr(struct block_device *bdev, iterate_devices_callout_fn fn,
3068                       void *data)
3069 {
3070         struct mapped_device *md = bdev->bd_disk->private_data;
3071         struct dm_table *table;
3072         struct dm_target *ti;
3073         int ret = -ENOTTY, srcu_idx;
3074
3075         table = dm_get_live_table(md, &srcu_idx);
3076         if (!table || !dm_table_get_size(table))
3077                 goto out;
3078
3079         /* We only support devices that have a single target */
3080         if (dm_table_get_num_targets(table) != 1)
3081                 goto out;
3082         ti = dm_table_get_target(table, 0);
3083
3084         if (dm_suspended_md(md)) {
3085                 ret = -EAGAIN;
3086                 goto out;
3087         }
3088
3089         ret = -EINVAL;
3090         if (!ti->type->iterate_devices)
3091                 goto out;
3092
3093         ret = ti->type->iterate_devices(ti, fn, data);
3094 out:
3095         dm_put_live_table(md, srcu_idx);
3096         return ret;
3097 }
3098
3099 /*
3100  * For register / unregister we need to manually call out to every path.
3101  */
3102 static int __dm_pr_register(struct dm_target *ti, struct dm_dev *dev,
3103                             sector_t start, sector_t len, void *data)
3104 {
3105         struct dm_pr *pr = data;
3106         const struct pr_ops *ops = dev->bdev->bd_disk->fops->pr_ops;
3107
3108         if (!ops || !ops->pr_register)
3109                 return -EOPNOTSUPP;
3110         return ops->pr_register(dev->bdev, pr->old_key, pr->new_key, pr->flags);
3111 }
3112
3113 static int dm_pr_register(struct block_device *bdev, u64 old_key, u64 new_key,
3114                           u32 flags)
3115 {
3116         struct dm_pr pr = {
3117                 .old_key        = old_key,
3118                 .new_key        = new_key,
3119                 .flags          = flags,
3120                 .fail_early     = true,
3121         };
3122         int ret;
3123
3124         ret = dm_call_pr(bdev, __dm_pr_register, &pr);
3125         if (ret && new_key) {
3126                 /* unregister all paths if we failed to register any path */
3127                 pr.old_key = new_key;
3128                 pr.new_key = 0;
3129                 pr.flags = 0;
3130                 pr.fail_early = false;
3131                 dm_call_pr(bdev, __dm_pr_register, &pr);
3132         }
3133
3134         return ret;
3135 }
3136
3137 static int dm_pr_reserve(struct block_device *bdev, u64 key, enum pr_type type,
3138                          u32 flags)
3139 {
3140         struct mapped_device *md = bdev->bd_disk->private_data;
3141         const struct pr_ops *ops;
3142         int r, srcu_idx;
3143
3144         r = dm_prepare_ioctl(md, &srcu_idx, &bdev);
3145         if (r < 0)
3146                 goto out;
3147
3148         ops = bdev->bd_disk->fops->pr_ops;
3149         if (ops && ops->pr_reserve)
3150                 r = ops->pr_reserve(bdev, key, type, flags);
3151         else
3152                 r = -EOPNOTSUPP;
3153 out:
3154         dm_unprepare_ioctl(md, srcu_idx);
3155         return r;
3156 }
3157
3158 static int dm_pr_release(struct block_device *bdev, u64 key, enum pr_type type)
3159 {
3160         struct mapped_device *md = bdev->bd_disk->private_data;
3161         const struct pr_ops *ops;
3162         int r, srcu_idx;
3163
3164         r = dm_prepare_ioctl(md, &srcu_idx, &bdev);
3165         if (r < 0)
3166                 goto out;
3167
3168         ops = bdev->bd_disk->fops->pr_ops;
3169         if (ops && ops->pr_release)
3170                 r = ops->pr_release(bdev, key, type);
3171         else
3172                 r = -EOPNOTSUPP;
3173 out:
3174         dm_unprepare_ioctl(md, srcu_idx);
3175         return r;
3176 }
3177
3178 static int dm_pr_preempt(struct block_device *bdev, u64 old_key, u64 new_key,
3179                          enum pr_type type, bool abort)
3180 {
3181         struct mapped_device *md = bdev->bd_disk->private_data;
3182         const struct pr_ops *ops;
3183         int r, srcu_idx;
3184
3185         r = dm_prepare_ioctl(md, &srcu_idx, &bdev);
3186         if (r < 0)
3187                 goto out;
3188
3189         ops = bdev->bd_disk->fops->pr_ops;
3190         if (ops && ops->pr_preempt)
3191                 r = ops->pr_preempt(bdev, old_key, new_key, type, abort);
3192         else
3193                 r = -EOPNOTSUPP;
3194 out:
3195         dm_unprepare_ioctl(md, srcu_idx);
3196         return r;
3197 }
3198
3199 static int dm_pr_clear(struct block_device *bdev, u64 key)
3200 {
3201         struct mapped_device *md = bdev->bd_disk->private_data;
3202         const struct pr_ops *ops;
3203         int r, srcu_idx;
3204
3205         r = dm_prepare_ioctl(md, &srcu_idx, &bdev);
3206         if (r < 0)
3207                 goto out;
3208
3209         ops = bdev->bd_disk->fops->pr_ops;
3210         if (ops && ops->pr_clear)
3211                 r = ops->pr_clear(bdev, key);
3212         else
3213                 r = -EOPNOTSUPP;
3214 out:
3215         dm_unprepare_ioctl(md, srcu_idx);
3216         return r;
3217 }
3218
/*
 * Persistent-reservation operations implemented by device-mapper.
 * Referenced from dm_blk_dops.pr_ops.
 */
static const struct pr_ops dm_pr_ops = {
        .pr_register    = dm_pr_register,
        .pr_reserve     = dm_pr_reserve,
        .pr_release     = dm_pr_release,
        .pr_preempt     = dm_pr_preempt,
        .pr_clear       = dm_pr_clear,
};
3226
/*
 * Block-device operations implemented by device-mapper.  The persistent
 * reservation entry points are provided through dm_pr_ops.
 */
static const struct block_device_operations dm_blk_dops = {
        .open = dm_blk_open,
        .release = dm_blk_close,
        .ioctl = dm_blk_ioctl,
        .getgeo = dm_blk_getgeo,
        .report_zones = dm_blk_report_zones,
        .pr_ops = &dm_pr_ops,
        .owner = THIS_MODULE
};
3236
/* DAX operations implemented by device-mapper. */
static const struct dax_operations dm_dax_ops = {
        .direct_access = dm_dax_direct_access,
        .dax_supported = dm_dax_supported,
        .copy_from_iter = dm_dax_copy_from_iter,
        .copy_to_iter = dm_dax_copy_to_iter,
};
3243
/*
 * module hooks
 *
 * Registers dm_init()/dm_exit() as the module entry and exit points and
 * declares the module parameters and metadata.
 */
module_init(dm_init);
module_exit(dm_exit);

/* Permission 0: the parameter is not exposed through sysfs. */
module_param(major, uint, 0);
MODULE_PARM_DESC(major, "The major number of the device mapper");

/* The remaining parameters are world-readable and owner-writable. */
module_param(reserved_bio_based_ios, uint, S_IRUGO | S_IWUSR);
MODULE_PARM_DESC(reserved_bio_based_ios, "Reserved IOs in bio-based mempools");

module_param(dm_numa_node, int, S_IRUGO | S_IWUSR);
MODULE_PARM_DESC(dm_numa_node, "NUMA node for DM device memory allocations");

module_param(swap_bios, int, S_IRUGO | S_IWUSR);
MODULE_PARM_DESC(swap_bios, "Maximum allowed inflight swap IOs");

MODULE_DESCRIPTION(DM_NAME " driver");
MODULE_AUTHOR("Joe Thornber <dm-devel@redhat.com>");
MODULE_LICENSE("GPL");