drivers/md/dm.c

   1 /*
   2  * Copyright (C) 2001, 2002 Sistina Software (UK) Limited.
   3  * Copyright (C) 2004-2008 Red Hat, Inc. All rights reserved.
   4  *
   5  * This file is released under the GPL.
   6  */
   7
   8 #include "dm-core.h"
   9 #include "dm-rq.h"
  10 #include "dm-uevent.h"
  11
  12 #include <linux/init.h>
  13 #include <linux/module.h>
  14 #include <linux/mutex.h>
  15 #include <linux/sched/mm.h>
  16 #include <linux/sched/signal.h>
  17 #include <linux/blkpg.h>
  18 #include <linux/bio.h>
  19 #include <linux/mempool.h>
  20 #include <linux/dax.h>
  21 #include <linux/slab.h>
  22 #include <linux/idr.h>
  23 #include <linux/uio.h>
  24 #include <linux/hdreg.h>
  25 #include <linux/delay.h>
  26 #include <linux/wait.h>
  27 #include <linux/pr.h>
  28 #include <linux/refcount.h>
  29
/*
 * Message prefix for this file.
 * NOTE(review): presumably picked up by the DMINFO/DMWARN/DMERR logging
 * macros; their definitions are not visible here - confirm.
 */
#define DM_MSG_PREFIX "core"

/*
 * Cookies are numeric values sent with CHANGE and REMOVE
 * uevents while resuming, removing or renaming the device.
 *
 * DM_COOKIE_ENV_VAR_NAME is the environment variable name and
 * DM_COOKIE_LENGTH a length limit associated with it.
 * NOTE(review): the users of both macros are outside this part of the
 * file - confirm exactly what the length bounds.
 */
#define DM_COOKIE_ENV_VAR_NAME "DM_COOKIE"
#define DM_COOKIE_LENGTH 24
  38
/* Name handed to register_blkdev()/unregister_blkdev(). */
static const char *_name = DM_NAME;

/*
 * 'major' is the requested block major; local_init() copies it into
 * '_major'.  When the request was 0, '_major' is replaced by the value
 * register_blkdev() returned.  local_exit() resets '_major' to 0.
 */
static unsigned int major = 0;
static unsigned int _major = 0;

/* Minor number allocator; torn down with idr_destroy() in dm_exit(). */
static DEFINE_IDR(_minor_idr);

/*
 * Taken around the open/close paths and the DMF_DELETING /
 * DMF_DEFERRED_REMOVE flag updates in this file.
 */
static DEFINE_SPINLOCK(_minor_lock);

static void do_deferred_remove(struct work_struct *w);

/*
 * Work item queued by dm_blk_close() when the last opener goes away on a
 * device flagged DMF_DEFERRED_REMOVE.
 */
static DECLARE_WORK(deferred_remove_work, do_deferred_remove);

/* Allocated in local_init(), destroyed in local_exit(). */
static struct workqueue_struct *deferred_remove_workqueue;

/* Global event counter and its wait queue; see dm_issue_global_event(). */
atomic_t dm_global_event_nr = ATOMIC_INIT(0);
DECLARE_WAIT_QUEUE_HEAD(dm_global_eventq);
  56
  57 void dm_issue_global_event(void)
  58 {
  59         atomic_inc(&dm_global_event_nr);
  60         wake_up(&dm_global_eventq);
  61 }
  62
/*
 * One of these is allocated (on-stack) per original bio.
 *
 * NOTE(review): most fields are consumed by the bio splitting code, which
 * is outside this part of the file; the per-field notes marked
 * "presumably" are inferred from the names and should be confirmed.
 */
struct clone_info {
	struct dm_table *map;		/* presumably the live table being mapped through */
	struct bio *bio;		/* presumably the original bio being split */
	struct dm_io *io;		/* dm_io for the original bio; read by alloc_tio() */
	sector_t sector;		/* presumably the current start sector */
	unsigned sector_count;		/* presumably the sectors still to map */
};
  73
/*
 * One of these is allocated per clone bio.
 *
 * The structure is recovered from a clone with
 * container_of(bio, struct dm_target_io, clone), and the per-bio data
 * helpers do pointer arithmetic based on the offset of 'clone', so the
 * member layout must not be changed casually.
 */
#define DM_TIO_MAGIC 7282014
struct dm_target_io {
	unsigned magic;			/* DM_TIO_MAGIC, set by alloc_tio() */
	struct dm_io *io;		/* owning dm_io; NULL while the embedded tio is unused */
	struct dm_target *ti;		/* target this clone is sent to */
	unsigned target_bio_nr;		/* returned by dm_bio_get_target_bio_nr() */
	unsigned *len_ptr;		/* NOTE(review): users are not visible in this part of the file */
	bool inside_dm_io;		/* true when this tio is the one embedded in a struct dm_io */
	struct bio clone;		/* the clone bio itself; last member (see struct dm_io) */
};
  87
/*
 * One of these is allocated per original bio.
 * It contains the first clone used for that original.
 *
 * alloc_io() obtains it by allocating a bio from md->io_bs and walking
 * back from the embedded clone with container_of().
 */
#define DM_IO_MAGIC 5191977
struct dm_io {
	unsigned magic;			/* DM_IO_MAGIC, set by alloc_io() */
	struct mapped_device *md;	/* device the original bio was submitted to */
	blk_status_t status;		/* status recorded by dec_pending() */
	atomic_t io_count;		/* starts at 1 in alloc_io(); dropped in dec_pending() */
	struct bio *orig_bio;		/* the original bio */
	unsigned long start_time;	/* jiffies sampled in start_io_acct() */
	spinlock_t endio_lock;		/* held while dec_pending() updates 'status' */
	struct dm_stats_aux stats_aux;	/* passed to dm_stats_account_io() */
	/* last member of dm_target_io is 'struct bio' */
	struct dm_target_io tio;
};
 105
 106 void *dm_per_bio_data(struct bio *bio, size_t data_size)
 107 {
 108         struct dm_target_io *tio = container_of(bio, struct dm_target_io, clone);
 109         if (!tio->inside_dm_io)
 110                 return (char *)bio - offsetof(struct dm_target_io, clone) - data_size;
 111         return (char *)bio - offsetof(struct dm_target_io, clone) - offsetof(struct dm_io, tio) - data_size;
 112 }
 113 EXPORT_SYMBOL_GPL(dm_per_bio_data);
 114
 115 struct bio *dm_bio_from_per_bio_data(void *data, size_t data_size)
 116 {
 117         struct dm_io *io = (struct dm_io *)((char *)data + data_size);
 118         if (io->magic == DM_IO_MAGIC)
 119                 return (struct bio *)((char *)io + offsetof(struct dm_io, tio) + offsetof(struct dm_target_io, clone));
 120         BUG_ON(io->magic != DM_TIO_MAGIC);
 121         return (struct bio *)((char *)io + offsetof(struct dm_target_io, clone));
 122 }
 123 EXPORT_SYMBOL_GPL(dm_bio_from_per_bio_data);
 124
 125 unsigned dm_bio_get_target_bio_nr(const struct bio *bio)
 126 {
 127         return container_of(bio, struct dm_target_io, clone)->target_bio_nr;
 128 }
 129 EXPORT_SYMBOL_GPL(dm_bio_get_target_bio_nr);
 130
/*
 * Sentinel pointer value.
 * NOTE(review): its users are outside this part of the file; presumably
 * stored in _minor_idr for a minor that is reserved but not yet bound to
 * a device - confirm.
 */
#define MINOR_ALLOCED ((void *)-1)

/*
 * Bits for the md->flags field.
 *
 * Within this part of the file: DMF_FREEING and DMF_DELETING make
 * dm_blk_open() fail, DMF_DEFERRED_REMOVE triggers the deferred removal
 * work on last close, and DMF_NOFLUSH_SUSPENDING is what
 * __noflush_suspending() tests.
 */
#define DMF_BLOCK_IO_FOR_SUSPEND 0
#define DMF_SUSPENDED 1
#define DMF_FROZEN 2
#define DMF_FREEING 3
#define DMF_DELETING 4
#define DMF_NOFLUSH_SUSPENDING 5
#define DMF_DEFERRED_REMOVE 6
#define DMF_SUSPENDED_INTERNALLY 7
#define DMF_POST_SUSPENDING 8

/* Default NUMA node setting; also the lower clamp in dm_get_numa_node(). */
#define DM_NUMA_NODE NUMA_NO_NODE
static int dm_numa_node = DM_NUMA_NODE;

/*
 * Number of pages in 8 MiB.  get_swap_bios() falls back to this value
 * whenever swap_bios is zero or negative.
 */
#define DEFAULT_SWAP_BIOS       (8 * 1048576 / PAGE_SIZE)
static int swap_bios = DEFAULT_SWAP_BIOS;
 151 static int get_swap_bios(void)
 152 {
 153         int latch = READ_ONCE(swap_bios);
 154         if (unlikely(latch <= 0))
 155                 latch = DEFAULT_SWAP_BIOS;
 156         return latch;
 157 }
 158
/*
 * For mempools pre-allocation at the table loading time.
 */
struct dm_md_mempools {
	struct bio_set bs;	/* NOTE(review): presumably the source for md->bs (clone bios) - confirm */
	struct bio_set io_bs;	/* NOTE(review): presumably the source for md->io_bs (dm_io) - confirm */
};

/*
 * An underlying block device opened on behalf of a mapped device.
 * Entries live on md->table_devices and are shared per (dev, mode) pair;
 * see dm_get_table_device() / dm_put_table_device().
 */
struct table_device {
	struct list_head list;	/* link in md->table_devices */
	refcount_t count;	/* users of this entry */
	struct dm_dev dm_dev;	/* what callers of dm_get_table_device() receive */
};

/* Slab caches created in local_init() and destroyed in local_exit(). */
static struct kmem_cache *_rq_tio_cache;
static struct kmem_cache *_rq_cache;

/*
 * Bio-based DM's mempools' reserved IOs set by the user.
 * Read through dm_get_reserved_bio_based_ios(), which substitutes the
 * default for 0 and caps the value at DM_RESERVED_MAX_IOS.
 */
#define RESERVED_BIO_BASED_IOS          16
static unsigned reserved_bio_based_ios = RESERVED_BIO_BASED_IOS;
 181
 182 static int __dm_get_module_param_int(int *module_param, int min, int max)
 183 {
 184         int param = READ_ONCE(*module_param);
 185         int modified_param = 0;
 186         bool modified = true;
 187
 188         if (param < min)
 189                 modified_param = min;
 190         else if (param > max)
 191                 modified_param = max;
 192         else
 193                 modified = false;
 194
 195         if (modified) {
 196                 (void)cmpxchg(module_param, param, modified_param);
 197                 param = modified_param;
 198         }
 199
 200         return param;
 201 }
 202
 203 unsigned __dm_get_module_param(unsigned *module_param,
 204                                unsigned def, unsigned max)
 205 {
 206         unsigned param = READ_ONCE(*module_param);
 207         unsigned modified_param = 0;
 208
 209         if (!param)
 210                 modified_param = def;
 211         else if (param > max)
 212                 modified_param = max;
 213
 214         if (modified_param) {
 215                 (void)cmpxchg(module_param, param, modified_param);
 216                 param = modified_param;
 217         }
 218
 219         return param;
 220 }
 221
 222 unsigned dm_get_reserved_bio_based_ios(void)
 223 {
 224         return __dm_get_module_param(&reserved_bio_based_ios,
 225                                      RESERVED_BIO_BASED_IOS, DM_RESERVED_MAX_IOS);
 226 }
 227 EXPORT_SYMBOL_GPL(dm_get_reserved_bio_based_ios);
 228
 229 static unsigned dm_get_numa_node(void)
 230 {
 231         return __dm_get_module_param_int(&dm_numa_node,
 232                                          DM_NUMA_NODE, num_online_nodes() - 1);
 233 }
 234
 235 static int __init local_init(void)
 236 {
 237         int r = -ENOMEM;
 238
 239         _rq_tio_cache = KMEM_CACHE(dm_rq_target_io, 0);
 240         if (!_rq_tio_cache)
 241                 return r;
 242
 243         _rq_cache = kmem_cache_create("dm_old_clone_request", sizeof(struct request),
 244                                       __alignof__(struct request), 0, NULL);
 245         if (!_rq_cache)
 246                 goto out_free_rq_tio_cache;
 247
 248         r = dm_uevent_init();
 249         if (r)
 250                 goto out_free_rq_cache;
 251
 252         deferred_remove_workqueue = alloc_workqueue("kdmremove", WQ_UNBOUND, 1);
 253         if (!deferred_remove_workqueue) {
 254                 r = -ENOMEM;
 255                 goto out_uevent_exit;
 256         }
 257
 258         _major = major;
 259         r = register_blkdev(_major, _name);
 260         if (r < 0)
 261                 goto out_free_workqueue;
 262
 263         if (!_major)
 264                 _major = r;
 265
 266         return 0;
 267
 268 out_free_workqueue:
 269         destroy_workqueue(deferred_remove_workqueue);
 270 out_uevent_exit:
 271         dm_uevent_exit();
 272 out_free_rq_cache:
 273         kmem_cache_destroy(_rq_cache);
 274 out_free_rq_tio_cache:
 275         kmem_cache_destroy(_rq_tio_cache);
 276
 277         return r;
 278 }
 279
 280 static void local_exit(void)
 281 {
 282         flush_scheduled_work();
 283         destroy_workqueue(deferred_remove_workqueue);
 284
 285         kmem_cache_destroy(_rq_cache);
 286         kmem_cache_destroy(_rq_tio_cache);
 287         unregister_blkdev(_major, _name);
 288         dm_uevent_exit();
 289
 290         _major = 0;
 291
 292         DMINFO("cleaned up");
 293 }
 294
/*
 * Initialisation routines, run in order by dm_init().
 *
 * _inits[] and _exits[] must stay index-aligned: when _inits[i] fails,
 * dm_init() calls _exits[i-1] ... _exits[0] to unwind.
 */
static int (*_inits[])(void) __initdata = {
	local_init,
	dm_target_init,
	dm_linear_init,
	dm_stripe_init,
	dm_io_init,
	dm_kcopyd_init,
	dm_interface_init,
	dm_statistics_init,
};

/* Matching teardown routines; dm_exit() runs them last to first. */
static void (*_exits[])(void) = {
	local_exit,
	dm_target_exit,
	dm_linear_exit,
	dm_stripe_exit,
	dm_io_exit,
	dm_kcopyd_exit,
	dm_interface_exit,
	dm_statistics_exit,
};
 316
 317 static int __init dm_init(void)
 318 {
 319         const int count = ARRAY_SIZE(_inits);
 320
 321         int r, i;
 322
 323         for (i = 0; i < count; i++) {
 324                 r = _inits[i]();
 325                 if (r)
 326                         goto bad;
 327         }
 328
 329         return 0;
 330
 331       bad:
 332         while (i--)
 333                 _exits[i]();
 334
 335         return r;
 336 }
 337
 338 static void __exit dm_exit(void)
 339 {
 340         int i = ARRAY_SIZE(_exits);
 341
 342         while (i--)
 343                 _exits[i]();
 344
 345         /*
 346          * Should be empty by this point.
 347          */
 348         idr_destroy(&_minor_idr);
 349 }
 350
 351 /*
 352  * Block device functions
 353  */
 354 int dm_deleting_md(struct mapped_device *md)
 355 {
 356         return test_bit(DMF_DELETING, &md->flags);
 357 }
 358
 359 static int dm_blk_open(struct block_device *bdev, fmode_t mode)
 360 {
 361         struct mapped_device *md;
 362
 363         spin_lock(&_minor_lock);
 364
 365         md = bdev->bd_disk->private_data;
 366         if (!md)
 367                 goto out;
 368
 369         if (test_bit(DMF_FREEING, &md->flags) ||
 370             dm_deleting_md(md)) {
 371                 md = NULL;
 372                 goto out;
 373         }
 374
 375         dm_get(md);
 376         atomic_inc(&md->open_count);
 377 out:
 378         spin_unlock(&_minor_lock);
 379
 380         return md ? 0 : -ENXIO;
 381 }
 382
 383 static void dm_blk_close(struct gendisk *disk, fmode_t mode)
 384 {
 385         struct mapped_device *md;
 386
 387         spin_lock(&_minor_lock);
 388
 389         md = disk->private_data;
 390         if (WARN_ON(!md))
 391                 goto out;
 392
 393         if (atomic_dec_and_test(&md->open_count) &&
 394             (test_bit(DMF_DEFERRED_REMOVE, &md->flags)))
 395                 queue_work(deferred_remove_workqueue, &deferred_remove_work);
 396
 397         dm_put(md);
 398 out:
 399         spin_unlock(&_minor_lock);
 400 }
 401
 402 int dm_open_count(struct mapped_device *md)
 403 {
 404         return atomic_read(&md->open_count);
 405 }
 406
 407 /*
 408  * Guarantees nothing is using the device before it's deleted.
 409  */
 410 int dm_lock_for_deletion(struct mapped_device *md, bool mark_deferred, bool only_deferred)
 411 {
 412         int r = 0;
 413
 414         spin_lock(&_minor_lock);
 415
 416         if (dm_open_count(md)) {
 417                 r = -EBUSY;
 418                 if (mark_deferred)
 419                         set_bit(DMF_DEFERRED_REMOVE, &md->flags);
 420         } else if (only_deferred && !test_bit(DMF_DEFERRED_REMOVE, &md->flags))
 421                 r = -EEXIST;
 422         else
 423                 set_bit(DMF_DELETING, &md->flags);
 424
 425         spin_unlock(&_minor_lock);
 426
 427         return r;
 428 }
 429
 430 int dm_cancel_deferred_remove(struct mapped_device *md)
 431 {
 432         int r = 0;
 433
 434         spin_lock(&_minor_lock);
 435
 436         if (test_bit(DMF_DELETING, &md->flags))
 437                 r = -EBUSY;
 438         else
 439                 clear_bit(DMF_DEFERRED_REMOVE, &md->flags);
 440
 441         spin_unlock(&_minor_lock);
 442
 443         return r;
 444 }
 445
 446 static void do_deferred_remove(struct work_struct *w)
 447 {
 448         dm_deferred_remove();
 449 }
 450
 451 sector_t dm_get_size(struct mapped_device *md)
 452 {
 453         return get_capacity(md->disk);
 454 }
 455
 456 struct request_queue *dm_get_md_queue(struct mapped_device *md)
 457 {
 458         return md->queue;
 459 }
 460
 461 struct dm_stats *dm_get_stats(struct mapped_device *md)
 462 {
 463         return &md->stats;
 464 }
 465
 466 static int dm_blk_getgeo(struct block_device *bdev, struct hd_geometry *geo)
 467 {
 468         struct mapped_device *md = bdev->bd_disk->private_data;
 469
 470         return dm_get_geometry(md, geo);
 471 }
 472
 473 static int dm_prepare_ioctl(struct mapped_device *md, int *srcu_idx,
 474                             struct block_device **bdev)
 475 {
 476         struct dm_target *tgt;
 477         struct dm_table *map;
 478         int r;
 479
 480 retry:
 481         r = -ENOTTY;
 482         map = dm_get_live_table(md, srcu_idx);
 483         if (!map || !dm_table_get_size(map))
 484                 return r;
 485
 486         /* We only support devices that have a single target */
 487         if (dm_table_get_num_targets(map) != 1)
 488                 return r;
 489
 490         tgt = dm_table_get_target(map, 0);
 491         if (!tgt->type->prepare_ioctl)
 492                 return r;
 493
 494         if (dm_suspended_md(md))
 495                 return -EAGAIN;
 496
 497         r = tgt->type->prepare_ioctl(tgt, bdev);
 498         if (r == -ENOTCONN && !fatal_signal_pending(current)) {
 499                 dm_put_live_table(md, *srcu_idx);
 500                 msleep(10);
 501                 goto retry;
 502         }
 503
 504         return r;
 505 }
 506
 507 static void dm_unprepare_ioctl(struct mapped_device *md, int srcu_idx)
 508 {
 509         dm_put_live_table(md, srcu_idx);
 510 }
 511
 512 static int dm_blk_ioctl(struct block_device *bdev, fmode_t mode,
 513                         unsigned int cmd, unsigned long arg)
 514 {
 515         struct mapped_device *md = bdev->bd_disk->private_data;
 516         int r, srcu_idx;
 517
 518         r = dm_prepare_ioctl(md, &srcu_idx, &bdev);
 519         if (r < 0)
 520                 goto out;
 521
 522         if (r > 0) {
 523                 /*
 524                  * Target determined this ioctl is being issued against a
 525                  * subset of the parent bdev; require extra privileges.
 526                  */
 527                 if (!capable(CAP_SYS_RAWIO)) {
 528                         DMDEBUG_LIMIT(
 529         "%s: sending ioctl %x to DM device without required privilege.",
 530                                 current->comm, cmd);
 531                         r = -ENOIOCTLCMD;
 532                         goto out;
 533                 }
 534         }
 535
 536         r =  __blkdev_driver_ioctl(bdev, mode, cmd, arg);
 537 out:
 538         dm_unprepare_ioctl(md, srcu_idx);
 539         return r;
 540 }
 541
 542 static void start_io_acct(struct dm_io *io);
 543
 544 static struct dm_io *alloc_io(struct mapped_device *md, struct bio *bio)
 545 {
 546         struct dm_io *io;
 547         struct dm_target_io *tio;
 548         struct bio *clone;
 549
 550         clone = bio_alloc_bioset(GFP_NOIO, 0, &md->io_bs);
 551         if (!clone)
 552                 return NULL;
 553
 554         tio = container_of(clone, struct dm_target_io, clone);
 555         tio->inside_dm_io = true;
 556         tio->io = NULL;
 557
 558         io = container_of(tio, struct dm_io, tio);
 559         io->magic = DM_IO_MAGIC;
 560         io->status = 0;
 561         atomic_set(&io->io_count, 1);
 562         io->orig_bio = bio;
 563         io->md = md;
 564         spin_lock_init(&io->endio_lock);
 565
 566         start_io_acct(io);
 567
 568         return io;
 569 }
 570
 571 static void free_io(struct mapped_device *md, struct dm_io *io)
 572 {
 573         bio_put(&io->tio.clone);
 574 }
 575
 576 static struct dm_target_io *alloc_tio(struct clone_info *ci, struct dm_target *ti,
 577                                       unsigned target_bio_nr, gfp_t gfp_mask)
 578 {
 579         struct dm_target_io *tio;
 580
 581         if (!ci->io->tio.io) {
 582                 /* the dm_target_io embedded in ci->io is available */
 583                 tio = &ci->io->tio;
 584         } else {
 585                 struct bio *clone = bio_alloc_bioset(gfp_mask, 0, &ci->io->md->bs);
 586                 if (!clone)
 587                         return NULL;
 588
 589                 tio = container_of(clone, struct dm_target_io, clone);
 590                 tio->inside_dm_io = false;
 591         }
 592
 593         tio->magic = DM_TIO_MAGIC;
 594         tio->io = ci->io;
 595         tio->ti = ti;
 596         tio->target_bio_nr = target_bio_nr;
 597
 598         return tio;
 599 }
 600
 601 static void free_tio(struct dm_target_io *tio)
 602 {
 603         if (tio->inside_dm_io)
 604                 return;
 605         bio_put(&tio->clone);
 606 }
 607
 608 int md_in_flight(struct mapped_device *md)
 609 {
 610         return atomic_read(&md->pending[READ]) +
 611                atomic_read(&md->pending[WRITE]);
 612 }
 613
 614 static void start_io_acct(struct dm_io *io)
 615 {
 616         struct mapped_device *md = io->md;
 617         struct bio *bio = io->orig_bio;
 618         int rw = bio_data_dir(bio);
 619
 620         io->start_time = jiffies;
 621
 622         generic_start_io_acct(md->queue, bio_op(bio), bio_sectors(bio),
 623                               &dm_disk(md)->part0);
 624
 625         atomic_set(&dm_disk(md)->part0.in_flight[rw],
 626                    atomic_inc_return(&md->pending[rw]));
 627
 628         if (unlikely(dm_stats_used(&md->stats)))
 629                 dm_stats_account_io(&md->stats, bio_data_dir(bio),
 630                                     bio->bi_iter.bi_sector, bio_sectors(bio),
 631                                     false, 0, &io->stats_aux);
 632 }
 633
 634 static void end_io_acct(struct mapped_device *md, struct bio *bio,
 635                         unsigned long start_time, struct dm_stats_aux *stats_aux)
 636 {
 637         unsigned long duration = jiffies - start_time;
 638         int pending;
 639         int rw = bio_data_dir(bio);
 640
 641         generic_end_io_acct(md->queue, bio_op(bio), &dm_disk(md)->part0,
 642                             start_time);
 643
 644         if (unlikely(dm_stats_used(&md->stats)))
 645                 dm_stats_account_io(&md->stats, bio_data_dir(bio),
 646                                     bio->bi_iter.bi_sector, bio_sectors(bio),
 647                                     true, duration, stats_aux);
 648
 649         /*
 650          * After this is decremented the bio must not be touched if it is
 651          * a flush.
 652          */
 653         pending = atomic_dec_return(&md->pending[rw]);
 654         atomic_set(&dm_disk(md)->part0.in_flight[rw], pending);
 655         pending += atomic_read(&md->pending[rw^0x1]);
 656
 657         /* nudge anyone waiting on suspend queue */
 658         if (!pending)
 659                 wake_up(&md->wait);
 660 }
 661
 662 /*
 663  * Add the bio to the list of deferred io.
 664  */
 665 static void queue_io(struct mapped_device *md, struct bio *bio)
 666 {
 667         unsigned long flags;
 668
 669         spin_lock_irqsave(&md->deferred_lock, flags);
 670         bio_list_add(&md->deferred, bio);
 671         spin_unlock_irqrestore(&md->deferred_lock, flags);
 672         queue_work(md->wq, &md->work);
 673 }
 674
 675 /*
 676  * Everyone (including functions in this file), should use this
 677  * function to access the md->map field, and make sure they call
 678  * dm_put_live_table() when finished.
 679  */
 680 struct dm_table *dm_get_live_table(struct mapped_device *md, int *srcu_idx) __acquires(md->io_barrier)
 681 {
 682         *srcu_idx = srcu_read_lock(&md->io_barrier);
 683
 684         return srcu_dereference(md->map, &md->io_barrier);
 685 }
 686
 687 void dm_put_live_table(struct mapped_device *md, int srcu_idx) __releases(md->io_barrier)
 688 {
 689         srcu_read_unlock(&md->io_barrier, srcu_idx);
 690 }
 691
 692 void dm_sync_table(struct mapped_device *md)
 693 {
 694         synchronize_srcu(&md->io_barrier);
 695         synchronize_rcu_expedited();
 696 }
 697
 698 /*
 699  * A fast alternative to dm_get_live_table/dm_put_live_table.
 700  * The caller must not block between these two functions.
 701  */
 702 static struct dm_table *dm_get_live_table_fast(struct mapped_device *md) __acquires(RCU)
 703 {
 704         rcu_read_lock();
 705         return rcu_dereference(md->map);
 706 }
 707
 708 static void dm_put_live_table_fast(struct mapped_device *md) __releases(RCU)
 709 {
 710         rcu_read_unlock();
 711 }
 712
 713 static char *_dm_claim_ptr = "I belong to device-mapper";
 714
 715 /*
 716  * Open a table device so we can use it as a map destination.
 717  */
 718 static int open_table_device(struct table_device *td, dev_t dev,
 719                              struct mapped_device *md)
 720 {
 721         struct block_device *bdev;
 722
 723         int r;
 724
 725         BUG_ON(td->dm_dev.bdev);
 726
 727         bdev = blkdev_get_by_dev(dev, td->dm_dev.mode | FMODE_EXCL, _dm_claim_ptr);
 728         if (IS_ERR(bdev))
 729                 return PTR_ERR(bdev);
 730
 731         r = bd_link_disk_holder(bdev, dm_disk(md));
 732         if (r) {
 733                 blkdev_put(bdev, td->dm_dev.mode | FMODE_EXCL);
 734                 return r;
 735         }
 736
 737         td->dm_dev.bdev = bdev;
 738         td->dm_dev.dax_dev = dax_get_by_host(bdev->bd_disk->disk_name);
 739         return 0;
 740 }
 741
 742 /*
 743  * Close a table device that we've been using.
 744  */
 745 static void close_table_device(struct table_device *td, struct mapped_device *md)
 746 {
 747         if (!td->dm_dev.bdev)
 748                 return;
 749
 750         bd_unlink_disk_holder(td->dm_dev.bdev, dm_disk(md));
 751         blkdev_put(td->dm_dev.bdev, td->dm_dev.mode | FMODE_EXCL);
 752         put_dax(td->dm_dev.dax_dev);
 753         td->dm_dev.bdev = NULL;
 754         td->dm_dev.dax_dev = NULL;
 755 }
 756
 757 static struct table_device *find_table_device(struct list_head *l, dev_t dev,
 758                                               fmode_t mode) {
 759         struct table_device *td;
 760
 761         list_for_each_entry(td, l, list)
 762                 if (td->dm_dev.bdev->bd_dev == dev && td->dm_dev.mode == mode)
 763                         return td;
 764
 765         return NULL;
 766 }
 767
 768 int dm_get_table_device(struct mapped_device *md, dev_t dev, fmode_t mode,
 769                         struct dm_dev **result) {
 770         int r;
 771         struct table_device *td;
 772
 773         mutex_lock(&md->table_devices_lock);
 774         td = find_table_device(&md->table_devices, dev, mode);
 775         if (!td) {
 776                 td = kmalloc_node(sizeof(*td), GFP_KERNEL, md->numa_node_id);
 777                 if (!td) {
 778                         mutex_unlock(&md->table_devices_lock);
 779                         return -ENOMEM;
 780                 }
 781
 782                 td->dm_dev.mode = mode;
 783                 td->dm_dev.bdev = NULL;
 784
 785                 if ((r = open_table_device(td, dev, md))) {
 786                         mutex_unlock(&md->table_devices_lock);
 787                         kfree(td);
 788                         return r;
 789                 }
 790
 791                 format_dev_t(td->dm_dev.name, dev);
 792
 793                 refcount_set(&td->count, 1);
 794                 list_add(&td->list, &md->table_devices);
 795         } else {
 796                 refcount_inc(&td->count);
 797         }
 798         mutex_unlock(&md->table_devices_lock);
 799
 800         *result = &td->dm_dev;
 801         return 0;
 802 }
 803 EXPORT_SYMBOL_GPL(dm_get_table_device);
 804
 805 void dm_put_table_device(struct mapped_device *md, struct dm_dev *d)
 806 {
 807         struct table_device *td = container_of(d, struct table_device, dm_dev);
 808
 809         mutex_lock(&md->table_devices_lock);
 810         if (refcount_dec_and_test(&td->count)) {
 811                 close_table_device(td, md);
 812                 list_del(&td->list);
 813                 kfree(td);
 814         }
 815         mutex_unlock(&md->table_devices_lock);
 816 }
 817 EXPORT_SYMBOL(dm_put_table_device);
 818
 819 static void free_table_devices(struct list_head *devices)
 820 {
 821         struct list_head *tmp, *next;
 822
 823         list_for_each_safe(tmp, next, devices) {
 824                 struct table_device *td = list_entry(tmp, struct table_device, list);
 825
 826                 DMWARN("dm_destroy: %s still exists with %d references",
 827                        td->dm_dev.name, refcount_read(&td->count));
 828                 kfree(td);
 829         }
 830 }
 831
 832 /*
 833  * Get the geometry associated with a dm device
 834  */
 835 int dm_get_geometry(struct mapped_device *md, struct hd_geometry *geo)
 836 {
 837         *geo = md->geometry;
 838
 839         return 0;
 840 }
 841
 842 /*
 843  * Set the geometry of a device.
 844  */
 845 int dm_set_geometry(struct mapped_device *md, struct hd_geometry *geo)
 846 {
 847         sector_t sz = (sector_t)geo->cylinders * geo->heads * geo->sectors;
 848
 849         if (geo->start > sz) {
 850                 DMWARN("Start sector is beyond the geometry limits.");
 851                 return -EINVAL;
 852         }
 853
 854         md->geometry = *geo;
 855
 856         return 0;
 857 }
 858
 859 static int __noflush_suspending(struct mapped_device *md)
 860 {
 861         return test_bit(DMF_NOFLUSH_SUSPENDING, &md->flags);
 862 }
 863
 864 /*
 865  * Decrements the number of outstanding ios that a bio has been
 866  * cloned into, completing the original io if necc.
 867  */
 868 static void dec_pending(struct dm_io *io, blk_status_t error)
 869 {
 870         unsigned long flags;
 871         blk_status_t io_error;
 872         struct bio *bio;
 873         struct mapped_device *md = io->md;
 874         unsigned long start_time = 0;
 875         struct dm_stats_aux stats_aux;
 876
 877         /* Push-back supersedes any I/O errors */
 878         if (unlikely(error)) {
 879                 spin_lock_irqsave(&io->endio_lock, flags);
 880                 if (!(io->status == BLK_STS_DM_REQUEUE && __noflush_suspending(md)))
 881                         io->status = error;
 882                 spin_unlock_irqrestore(&io->endio_lock, flags);
 883         }
 884
 885         if (atomic_dec_and_test(&io->io_count)) {
 886                 if (io->status == BLK_STS_DM_REQUEUE) {
 887                         /*
 888                          * Target requested pushing back the I/O.
 889                          */
 890                         spin_lock_irqsave(&md->deferred_lock, flags);
 891                         if (__noflush_suspending(md))
 892                                 /* NOTE early return due to BLK_STS_DM_REQUEUE below */
 893                                 bio_list_add_head(&md->deferred, io->orig_bio);
 894                         else
 895                                 /* noflush suspend was interrupted. */
 896                                 io->status = BLK_STS_IOERR;
 897                         spin_unlock_irqrestore(&md->deferred_lock, flags);
 898                 }
 899
 900                 io_error = io->status;
 901                 bio = io->orig_bio;
 902                 start_time = io->start_time;
 903                 stats_aux = io->stats_aux;
 904                 free_io(md, io);
 905                 end_io_acct(md, bio, start_time, &stats_aux);
 906
 907                 if (io_error == BLK_STS_DM_REQUEUE)
 908                         return;
 909
 910                 if ((bio->bi_opf & REQ_PREFLUSH) && bio->bi_iter.bi_size) {
 911                         /*
 912                          * Preflush done for flush with data, reissue
 913                          * without REQ_PREFLUSH.
 914                          */
 915                         bio->bi_opf &= ~REQ_PREFLUSH;
 916                         queue_io(md, bio);
 917                 } else {
 918                         /* done with normal IO or empty flush */
 919                         if (io_error)
 920                                 bio->bi_status = io_error;
 921                         bio_endio(bio);
 922                 }
 923         }
 924 }
 925
 926 void disable_discard(struct mapped_device *md)
 927 {
 928         struct queue_limits *limits = dm_get_queue_limits(md);
 929
 930         /* device doesn't really support DISCARD, disable it */
 931         limits->max_discard_sectors = 0;
 932         blk_queue_flag_clear(QUEUE_FLAG_DISCARD, md->queue);
 933 }
 934
 935 void disable_write_same(struct mapped_device *md)
 936 {
 937         struct queue_limits *limits = dm_get_queue_limits(md);
 938
 939         /* device doesn't really support WRITE SAME, disable it */
 940         limits->max_write_same_sectors = 0;
 941 }
 942
 943 void disable_write_zeroes(struct mapped_device *md)
 944 {
 945         struct queue_limits *limits = dm_get_queue_limits(md);
 946
 947         /* device doesn't really support WRITE ZEROES, disable it */
 948         limits->max_write_zeroes_sectors = 0;
 949 }
 950
 951 static bool swap_bios_limit(struct dm_target *ti, struct bio *bio)
 952 {
 953         return unlikely((bio->bi_opf & REQ_SWAP) != 0) && unlikely(ti->limit_swap_bios);
 954 }
 955
 956 static void clone_endio(struct bio *bio)
 957 {
 958         blk_status_t error = bio->bi_status;
 959         struct dm_target_io *tio = container_of(bio, struct dm_target_io, clone);
 960         struct dm_io *io = tio->io;
 961         struct mapped_device *md = tio->io->md;
 962         dm_endio_fn endio = tio->ti->type->end_io;
 963
 964         if (unlikely(error == BLK_STS_TARGET) && md->type != DM_TYPE_NVME_BIO_BASED) {
 965                 if (bio_op(bio) == REQ_OP_DISCARD &&
 966                     !bio->bi_disk->queue->limits.max_discard_sectors)
 967                         disable_discard(md);
 968                 else if (bio_op(bio) == REQ_OP_WRITE_SAME &&
 969                          !bio->bi_disk->queue->limits.max_write_same_sectors)
 970                         disable_write_same(md);
 971                 else if (bio_op(bio) == REQ_OP_WRITE_ZEROES &&
 972                          !bio->bi_disk->queue->limits.max_write_zeroes_sectors)
 973                         disable_write_zeroes(md);
 974         }
 975
 976         if (endio) {
 977                 int r = endio(tio->ti, bio, &error);
 978                 switch (r) {
 979                 case DM_ENDIO_REQUEUE:
 980                         error = BLK_STS_DM_REQUEUE;
 981                         /*FALLTHRU*/
 982                 case DM_ENDIO_DONE:
 983                         break;
 984                 case DM_ENDIO_INCOMPLETE:
 985                         /* The target will handle the io */
 986                         return;
 987                 default:
 988                         DMWARN("unimplemented target endio return value: %d", r);
 989                         BUG();
 990                 }
 991         }
 992
 993         if (unlikely(swap_bios_limit(tio->ti, bio))) {
 994                 struct mapped_device *md = io->md;
 995                 up(&md->swap_bios_semaphore);
 996         }
 997
 998         free_tio(tio);
 999         dec_pending(io, error);
1000 }
1001
1002 /*
1003  * Return maximum size of I/O possible at the supplied sector up to the current
1004  * target boundary.
1005  */
1006 static sector_t max_io_len_target_boundary(sector_t sector, struct dm_target *ti)
1007 {
1008         sector_t target_offset = dm_target_offset(ti, sector);
1009
1010         return ti->len - target_offset;
1011 }
1012
1013 static sector_t max_io_len(sector_t sector, struct dm_target *ti)
1014 {
1015         sector_t len = max_io_len_target_boundary(sector, ti);
1016         sector_t offset, max_len;
1017
1018         /*
1019          * Does the target need to split even further?
1020          */
1021         if (ti->max_io_len) {
1022                 offset = dm_target_offset(ti, sector);
1023                 if (unlikely(ti->max_io_len & (ti->max_io_len - 1)))
1024                         max_len = sector_div(offset, ti->max_io_len);
1025                 else
1026                         max_len = offset & (ti->max_io_len - 1);
1027                 max_len = ti->max_io_len - max_len;
1028
1029                 if (len > max_len)
1030                         len = max_len;
1031         }
1032
1033         return len;
1034 }
1035
1036 int dm_set_target_max_io_len(struct dm_target *ti, sector_t len)
1037 {
1038         if (len > UINT_MAX) {
1039                 DMERR("Specified maximum size of target IO (%llu) exceeds limit (%u)",
1040                       (unsigned long long)len, UINT_MAX);
1041                 ti->error = "Maximum size of target IO is too large";
1042                 return -EINVAL;
1043         }
1044
1045         ti->max_io_len = (uint32_t) len;
1046
1047         return 0;
1048 }
1049 EXPORT_SYMBOL_GPL(dm_set_target_max_io_len);
1050
1051 static struct dm_target *dm_dax_get_live_target(struct mapped_device *md,
1052                                                 sector_t sector, int *srcu_idx)
1053         __acquires(md->io_barrier)
1054 {
1055         struct dm_table *map;
1056         struct dm_target *ti;
1057
1058         map = dm_get_live_table(md, srcu_idx);
1059         if (!map)
1060                 return NULL;
1061
1062         ti = dm_table_find_target(map, sector);
1063         if (!dm_target_is_valid(ti))
1064                 return NULL;
1065
1066         return ti;
1067 }
1068
1069 static long dm_dax_direct_access(struct dax_device *dax_dev, pgoff_t pgoff,
1070                                  long nr_pages, void **kaddr, pfn_t *pfn)
1071 {
1072         struct mapped_device *md = dax_get_private(dax_dev);
1073         sector_t sector = pgoff * PAGE_SECTORS;
1074         struct dm_target *ti;
1075         long len, ret = -EIO;
1076         int srcu_idx;
1077
1078         ti = dm_dax_get_live_target(md, sector, &srcu_idx);
1079
1080         if (!ti)
1081                 goto out;
1082         if (!ti->type->direct_access)
1083                 goto out;
1084         len = max_io_len(sector, ti) / PAGE_SECTORS;
1085         if (len < 1)
1086                 goto out;
1087         nr_pages = min(len, nr_pages);
1088         ret = ti->type->direct_access(ti, pgoff, nr_pages, kaddr, pfn);
1089
1090  out:
1091         dm_put_live_table(md, srcu_idx);
1092
1093         return ret;
1094 }
1095
1096 static size_t dm_dax_copy_from_iter(struct dax_device *dax_dev, pgoff_t pgoff,
1097                                     void *addr, size_t bytes, struct iov_iter *i)
1098 {
1099         struct mapped_device *md = dax_get_private(dax_dev);
1100         sector_t sector = pgoff * PAGE_SECTORS;
1101         struct dm_target *ti;
1102         long ret = 0;
1103         int srcu_idx;
1104
1105         ti = dm_dax_get_live_target(md, sector, &srcu_idx);
1106
1107         if (!ti)
1108                 goto out;
1109         if (!ti->type->dax_copy_from_iter) {
1110                 ret = copy_from_iter(addr, bytes, i);
1111                 goto out;
1112         }
1113         ret = ti->type->dax_copy_from_iter(ti, pgoff, addr, bytes, i);
1114  out:
1115         dm_put_live_table(md, srcu_idx);
1116
1117         return ret;
1118 }
1119
1120 static size_t dm_dax_copy_to_iter(struct dax_device *dax_dev, pgoff_t pgoff,
1121                 void *addr, size_t bytes, struct iov_iter *i)
1122 {
1123         struct mapped_device *md = dax_get_private(dax_dev);
1124         sector_t sector = pgoff * PAGE_SECTORS;
1125         struct dm_target *ti;
1126         long ret = 0;
1127         int srcu_idx;
1128
1129         ti = dm_dax_get_live_target(md, sector, &srcu_idx);
1130
1131         if (!ti)
1132                 goto out;
1133         if (!ti->type->dax_copy_to_iter) {
1134                 ret = copy_to_iter(addr, bytes, i);
1135                 goto out;
1136         }
1137         ret = ti->type->dax_copy_to_iter(ti, pgoff, addr, bytes, i);
1138  out:
1139         dm_put_live_table(md, srcu_idx);
1140
1141         return ret;
1142 }
1143
1144 /*
1145  * A target may call dm_accept_partial_bio only from the map routine.  It is
1146  * allowed for all bio types except REQ_PREFLUSH and REQ_OP_ZONE_RESET.
1147  *
1148  * dm_accept_partial_bio informs the dm that the target only wants to process
1149  * additional n_sectors sectors of the bio and the rest of the data should be
1150  * sent in a next bio.
1151  *
1152  * A diagram that explains the arithmetics:
1153  * +--------------------+---------------+-------+
1154  * |         1          |       2       |   3   |
1155  * +--------------------+---------------+-------+
1156  *
1157  * <-------------- *tio->len_ptr --------------->
1158  *                      <------- bi_size ------->
1159  *                      <-- n_sectors -->
1160  *
1161  * Region 1 was already iterated over with bio_advance or similar function.
1162  *      (it may be empty if the target doesn't use bio_advance)
1163  * Region 2 is the remaining bio size that the target wants to process.
1164  *      (it may be empty if region 1 is non-empty, although there is no reason
1165  *       to make it empty)
1166  * The target requires that region 3 is to be sent in the next bio.
1167  *
1168  * If the target wants to receive multiple copies of the bio (via num_*bios, etc),
1169  * the partially processed part (the sum of regions 1+2) must be the same for all
1170  * copies of the bio.
1171  */
1172 void dm_accept_partial_bio(struct bio *bio, unsigned n_sectors)
1173 {
1174         struct dm_target_io *tio = container_of(bio, struct dm_target_io, clone);
1175         unsigned bi_size = bio->bi_iter.bi_size >> SECTOR_SHIFT;
1176         BUG_ON(bio->bi_opf & REQ_PREFLUSH);
1177         BUG_ON(bi_size > *tio->len_ptr);
1178         BUG_ON(n_sectors > bi_size);
1179         *tio->len_ptr -= bi_size - n_sectors;
1180         bio->bi_iter.bi_size = n_sectors << SECTOR_SHIFT;
1181 }
1182 EXPORT_SYMBOL_GPL(dm_accept_partial_bio);
1183
1184 /*
1185  * The zone descriptors obtained with a zone report indicate zone positions
1186  * within the target backing device, regardless of that device is a partition
1187  * and regardless of the target mapping start sector on the device or partition.
1188  * The zone descriptors start sector and write pointer position must be adjusted
1189  * to match their relative position within the dm device.
1190  * A target may call dm_remap_zone_report() after completion of a
1191  * REQ_OP_ZONE_REPORT bio to remap the zone descriptors obtained from the
1192  * backing device.
1193  */
1194 void dm_remap_zone_report(struct dm_target *ti, struct bio *bio, sector_t start)
1195 {
1196 #ifdef CONFIG_BLK_DEV_ZONED
1197         struct dm_target_io *tio = container_of(bio, struct dm_target_io, clone);
1198         struct bio *report_bio = tio->io->orig_bio;
1199         struct blk_zone_report_hdr *hdr = NULL;
1200         struct blk_zone *zone;
1201         unsigned int nr_rep = 0;
1202         unsigned int ofst;
1203         sector_t part_offset;
1204         struct bio_vec bvec;
1205         struct bvec_iter iter;
1206         void *addr;
1207
1208         if (bio->bi_status)
1209                 return;
1210
1211         /*
1212          * bio sector was incremented by the request size on completion. Taking
1213          * into account the original request sector, the target start offset on
1214          * the backing device and the target mapping offset (ti->begin), the
1215          * start sector of the backing device. The partition offset is always 0
1216          * if the target uses a whole device.
1217          */
1218         part_offset = bio->bi_iter.bi_sector + ti->begin - (start + bio_end_sector(report_bio));
1219
1220         /*
1221          * Remap the start sector of the reported zones. For sequential zones,
1222          * also remap the write pointer position.
1223          */
1224         bio_for_each_segment(bvec, report_bio, iter) {
1225                 addr = kmap_atomic(bvec.bv_page);
1226
1227                 /* Remember the report header in the first page */
1228                 if (!hdr) {
1229                         hdr = addr;
1230                         ofst = sizeof(struct blk_zone_report_hdr);
1231                 } else
1232                         ofst = 0;
1233
1234                 /* Set zones start sector */
1235                 while (hdr->nr_zones && ofst < bvec.bv_len) {
1236                         zone = addr + ofst;
1237                         zone->start -= part_offset;
1238                         if (zone->start >= start + ti->len) {
1239                                 hdr->nr_zones = 0;
1240                                 break;
1241                         }
1242                         zone->start = zone->start + ti->begin - start;
1243                         if (zone->type != BLK_ZONE_TYPE_CONVENTIONAL) {
1244                                 if (zone->cond == BLK_ZONE_COND_FULL)
1245                                         zone->wp = zone->start + zone->len;
1246                                 else if (zone->cond == BLK_ZONE_COND_EMPTY)
1247                                         zone->wp = zone->start;
1248                                 else
1249                                         zone->wp = zone->wp + ti->begin - start - part_offset;
1250                         }
1251                         ofst += sizeof(struct blk_zone);
1252                         hdr->nr_zones--;
1253                         nr_rep++;
1254                 }
1255
1256                 if (addr != hdr)
1257                         kunmap_atomic(addr);
1258
1259                 if (!hdr->nr_zones)
1260                         break;
1261         }
1262
1263         if (hdr) {
1264                 hdr->nr_zones = nr_rep;
1265                 kunmap_atomic(hdr);
1266         }
1267
1268         bio_advance(report_bio, report_bio->bi_iter.bi_size);
1269
1270 #else /* !CONFIG_BLK_DEV_ZONED */
1271         bio->bi_status = BLK_STS_NOTSUPP;
1272 #endif
1273 }
1274 EXPORT_SYMBOL_GPL(dm_remap_zone_report);
1275
1276 static noinline void __set_swap_bios_limit(struct mapped_device *md, int latch)
1277 {
1278         mutex_lock(&md->swap_bios_lock);
1279         while (latch < md->swap_bios) {
1280                 cond_resched();
1281                 down(&md->swap_bios_semaphore);
1282                 md->swap_bios--;
1283         }
1284         while (latch > md->swap_bios) {
1285                 cond_resched();
1286                 up(&md->swap_bios_semaphore);
1287                 md->swap_bios++;
1288         }
1289         mutex_unlock(&md->swap_bios_lock);
1290 }
1291
1292 static blk_qc_t __map_bio(struct dm_target_io *tio)
1293 {
1294         int r;
1295         sector_t sector;
1296         struct bio *clone = &tio->clone;
1297         struct dm_io *io = tio->io;
1298         struct mapped_device *md = io->md;
1299         struct dm_target *ti = tio->ti;
1300         blk_qc_t ret = BLK_QC_T_NONE;
1301
1302         clone->bi_end_io = clone_endio;
1303
1304         /*
1305          * Map the clone.  If r == 0 we don't need to do
1306          * anything, the target has assumed ownership of
1307          * this io.
1308          */
1309         atomic_inc(&io->io_count);
1310         sector = clone->bi_iter.bi_sector;
1311
1312         if (unlikely(swap_bios_limit(ti, clone))) {
1313                 struct mapped_device *md = io->md;
1314                 int latch = get_swap_bios();
1315                 if (unlikely(latch != md->swap_bios))
1316                         __set_swap_bios_limit(md, latch);
1317                 down(&md->swap_bios_semaphore);
1318         }
1319
1320         r = ti->type->map(ti, clone);
1321         switch (r) {
1322         case DM_MAPIO_SUBMITTED:
1323                 break;
1324         case DM_MAPIO_REMAPPED:
1325                 /* the bio has been remapped so dispatch it */
1326                 trace_block_bio_remap(clone->bi_disk->queue, clone,
1327                                       bio_dev(io->orig_bio), sector);
1328                 if (md->type == DM_TYPE_NVME_BIO_BASED)
1329                         ret = direct_make_request(clone);
1330                 else
1331                         ret = generic_make_request(clone);
1332                 break;
1333         case DM_MAPIO_KILL:
1334                 if (unlikely(swap_bios_limit(ti, clone))) {
1335                         struct mapped_device *md = io->md;
1336                         up(&md->swap_bios_semaphore);
1337                 }
1338                 free_tio(tio);
1339                 dec_pending(io, BLK_STS_IOERR);
1340                 break;
1341         case DM_MAPIO_REQUEUE:
1342                 if (unlikely(swap_bios_limit(ti, clone))) {
1343                         struct mapped_device *md = io->md;
1344                         up(&md->swap_bios_semaphore);
1345                 }
1346                 free_tio(tio);
1347                 dec_pending(io, BLK_STS_DM_REQUEUE);
1348                 break;
1349         default:
1350                 DMWARN("unimplemented target map return value: %d", r);
1351                 BUG();
1352         }
1353
1354         return ret;
1355 }
1356
1357 static void bio_setup_sector(struct bio *bio, sector_t sector, unsigned len)
1358 {
1359         bio->bi_iter.bi_sector = sector;
1360         bio->bi_iter.bi_size = to_bytes(len);
1361 }
1362
1363 /*
1364  * Creates a bio that consists of range of complete bvecs.
1365  */
1366 static int clone_bio(struct dm_target_io *tio, struct bio *bio,
1367                      sector_t sector, unsigned len)
1368 {
1369         struct bio *clone = &tio->clone;
1370
1371         __bio_clone_fast(clone, bio);
1372
1373         if (unlikely(bio_integrity(bio) != NULL)) {
1374                 int r;
1375
1376                 if (unlikely(!dm_target_has_integrity(tio->ti->type) &&
1377                              !dm_target_passes_integrity(tio->ti->type))) {
1378                         DMWARN("%s: the target %s doesn't support integrity data.",
1379                                 dm_device_name(tio->io->md),
1380                                 tio->ti->type->name);
1381                         return -EIO;
1382                 }
1383
1384                 r = bio_integrity_clone(clone, bio, GFP_NOIO);
1385                 if (r < 0)
1386                         return r;
1387         }
1388
1389         if (bio_op(bio) != REQ_OP_ZONE_REPORT)
1390                 bio_advance(clone, to_bytes(sector - clone->bi_iter.bi_sector));
1391         clone->bi_iter.bi_size = to_bytes(len);
1392
1393         if (unlikely(bio_integrity(bio) != NULL))
1394                 bio_integrity_trim(clone);
1395
1396         return 0;
1397 }
1398
1399 static void alloc_multiple_bios(struct bio_list *blist, struct clone_info *ci,
1400                                 struct dm_target *ti, unsigned num_bios)
1401 {
1402         struct dm_target_io *tio;
1403         int try;
1404
1405         if (!num_bios)
1406                 return;
1407
1408         if (num_bios == 1) {
1409                 tio = alloc_tio(ci, ti, 0, GFP_NOIO);
1410                 bio_list_add(blist, &tio->clone);
1411                 return;
1412         }
1413
1414         for (try = 0; try < 2; try++) {
1415                 int bio_nr;
1416                 struct bio *bio;
1417
1418                 if (try)
1419                         mutex_lock(&ci->io->md->table_devices_lock);
1420                 for (bio_nr = 0; bio_nr < num_bios; bio_nr++) {
1421                         tio = alloc_tio(ci, ti, bio_nr, try ? GFP_NOIO : GFP_NOWAIT);
1422                         if (!tio)
1423                                 break;
1424
1425                         bio_list_add(blist, &tio->clone);
1426                 }
1427                 if (try)
1428                         mutex_unlock(&ci->io->md->table_devices_lock);
1429                 if (bio_nr == num_bios)
1430                         return;
1431
1432                 while ((bio = bio_list_pop(blist))) {
1433                         tio = container_of(bio, struct dm_target_io, clone);
1434                         free_tio(tio);
1435                 }
1436         }
1437 }
1438
1439 static blk_qc_t __clone_and_map_simple_bio(struct clone_info *ci,
1440                                            struct dm_target_io *tio, unsigned *len)
1441 {
1442         struct bio *clone = &tio->clone;
1443
1444         tio->len_ptr = len;
1445
1446         __bio_clone_fast(clone, ci->bio);
1447         if (len)
1448                 bio_setup_sector(clone, ci->sector, *len);
1449
1450         return __map_bio(tio);
1451 }
1452
1453 static void __send_duplicate_bios(struct clone_info *ci, struct dm_target *ti,
1454                                   unsigned num_bios, unsigned *len)
1455 {
1456         struct bio_list blist = BIO_EMPTY_LIST;
1457         struct bio *bio;
1458         struct dm_target_io *tio;
1459
1460         alloc_multiple_bios(&blist, ci, ti, num_bios);
1461
1462         while ((bio = bio_list_pop(&blist))) {
1463                 tio = container_of(bio, struct dm_target_io, clone);
1464                 (void) __clone_and_map_simple_bio(ci, tio, len);
1465         }
1466 }
1467
1468 static int __send_empty_flush(struct clone_info *ci)
1469 {
1470         unsigned target_nr = 0;
1471         struct dm_target *ti;
1472
1473         BUG_ON(bio_has_data(ci->bio));
1474         while ((ti = dm_table_get_target(ci->map, target_nr++)))
1475                 __send_duplicate_bios(ci, ti, ti->num_flush_bios, NULL);
1476
1477         return 0;
1478 }
1479
1480 static int __clone_and_map_data_bio(struct clone_info *ci, struct dm_target *ti,
1481                                     sector_t sector, unsigned *len)
1482 {
1483         struct bio *bio = ci->bio;
1484         struct dm_target_io *tio;
1485         int r;
1486
1487         tio = alloc_tio(ci, ti, 0, GFP_NOIO);
1488         tio->len_ptr = len;
1489         r = clone_bio(tio, bio, sector, *len);
1490         if (r < 0) {
1491                 free_tio(tio);
1492                 return r;
1493         }
1494         (void) __map_bio(tio);
1495
1496         return 0;
1497 }
1498
1499 typedef unsigned (*get_num_bios_fn)(struct dm_target *ti);
1500
1501 static unsigned get_num_discard_bios(struct dm_target *ti)
1502 {
1503         return ti->num_discard_bios;
1504 }
1505
1506 static unsigned get_num_secure_erase_bios(struct dm_target *ti)
1507 {
1508         return ti->num_secure_erase_bios;
1509 }
1510
1511 static unsigned get_num_write_same_bios(struct dm_target *ti)
1512 {
1513         return ti->num_write_same_bios;
1514 }
1515
1516 static unsigned get_num_write_zeroes_bios(struct dm_target *ti)
1517 {
1518         return ti->num_write_zeroes_bios;
1519 }
1520
1521 typedef bool (*is_split_required_fn)(struct dm_target *ti);
1522
1523 static bool is_split_required_for_discard(struct dm_target *ti)
1524 {
1525         return ti->split_discard_bios;
1526 }
1527
1528 static int __send_changing_extent_only(struct clone_info *ci, struct dm_target *ti,
1529                                        get_num_bios_fn get_num_bios,
1530                                        is_split_required_fn is_split_required)
1531 {
1532         unsigned len;
1533         unsigned num_bios;
1534
1535         /*
1536          * Even though the device advertised support for this type of
1537          * request, that does not mean every target supports it, and
1538          * reconfiguration might also have changed that since the
1539          * check was performed.
1540          */
1541         num_bios = get_num_bios ? get_num_bios(ti) : 0;
1542         if (!num_bios)
1543                 return -EOPNOTSUPP;
1544
1545         if (is_split_required && !is_split_required(ti))
1546                 len = min((sector_t)ci->sector_count, max_io_len_target_boundary(ci->sector, ti));
1547         else
1548                 len = min((sector_t)ci->sector_count, max_io_len(ci->sector, ti));
1549
1550         __send_duplicate_bios(ci, ti, num_bios, &len);
1551
1552         ci->sector += len;
1553         ci->sector_count -= len;
1554
1555         return 0;
1556 }
1557
1558 static int __send_discard(struct clone_info *ci, struct dm_target *ti)
1559 {
1560         return __send_changing_extent_only(ci, ti, get_num_discard_bios,
1561                                            is_split_required_for_discard);
1562 }
1563
1564 static int __send_secure_erase(struct clone_info *ci, struct dm_target *ti)
1565 {
1566         return __send_changing_extent_only(ci, ti, get_num_secure_erase_bios, NULL);
1567 }
1568
1569 static int __send_write_same(struct clone_info *ci, struct dm_target *ti)
1570 {
1571         return __send_changing_extent_only(ci, ti, get_num_write_same_bios, NULL);
1572 }
1573
1574 static int __send_write_zeroes(struct clone_info *ci, struct dm_target *ti)
1575 {
1576         return __send_changing_extent_only(ci, ti, get_num_write_zeroes_bios, NULL);
1577 }
1578
1579 static bool __process_abnormal_io(struct clone_info *ci, struct dm_target *ti,
1580                                   int *result)
1581 {
1582         struct bio *bio = ci->bio;
1583
1584         if (bio_op(bio) == REQ_OP_DISCARD)
1585                 *result = __send_discard(ci, ti);
1586         else if (bio_op(bio) == REQ_OP_SECURE_ERASE)
1587                 *result = __send_secure_erase(ci, ti);
1588         else if (bio_op(bio) == REQ_OP_WRITE_SAME)
1589                 *result = __send_write_same(ci, ti);
1590         else if (bio_op(bio) == REQ_OP_WRITE_ZEROES)
1591                 *result = __send_write_zeroes(ci, ti);
1592         else
1593                 return false;
1594
1595         return true;
1596 }
1597
1598 /*
1599  * Select the correct strategy for processing a non-flush bio.
1600  */
1601 static int __split_and_process_non_flush(struct clone_info *ci)
1602 {
1603         struct bio *bio = ci->bio;
1604         struct dm_target *ti;
1605         unsigned len;
1606         int r;
1607
1608         ti = dm_table_find_target(ci->map, ci->sector);
1609         if (!dm_target_is_valid(ti))
1610                 return -EIO;
1611
1612         if (unlikely(__process_abnormal_io(ci, ti, &r)))
1613                 return r;
1614
1615         if (bio_op(bio) == REQ_OP_ZONE_REPORT)
1616                 len = ci->sector_count;
1617         else
1618                 len = min_t(sector_t, max_io_len(ci->sector, ti),
1619                             ci->sector_count);
1620
1621         r = __clone_and_map_data_bio(ci, ti, ci->sector, &len);
1622         if (r < 0)
1623                 return r;
1624
1625         ci->sector += len;
1626         ci->sector_count -= len;
1627
1628         return 0;
1629 }
1630
1631 static void init_clone_info(struct clone_info *ci, struct mapped_device *md,
1632                             struct dm_table *map, struct bio *bio)
1633 {
1634         ci->map = map;
1635         ci->io = alloc_io(md, bio);
1636         ci->sector = bio->bi_iter.bi_sector;
1637 }
1638
1639 /*
1640  * Entry point to split a bio into clones and submit them to the targets.
1641  */
1642 static blk_qc_t __split_and_process_bio(struct mapped_device *md,
1643                                         struct dm_table *map, struct bio *bio)
1644 {
1645         struct clone_info ci;
1646         blk_qc_t ret = BLK_QC_T_NONE;
1647         int error = 0;
1648
1649         if (unlikely(!map)) {
1650                 bio_io_error(bio);
1651                 return ret;
1652         }
1653
1654         blk_queue_split(md->queue, &bio);
1655
1656         init_clone_info(&ci, md, map, bio);
1657
1658         if (bio->bi_opf & REQ_PREFLUSH) {
1659                 ci.bio = &ci.io->md->flush_bio;
1660                 ci.sector_count = 0;
1661                 error = __send_empty_flush(&ci);
1662                 /* dec_pending submits any data associated with flush */
1663         } else if (bio_op(bio) == REQ_OP_ZONE_RESET) {
1664                 ci.bio = bio;
1665                 ci.sector_count = 0;
1666                 error = __split_and_process_non_flush(&ci);
1667         } else {
1668                 ci.bio = bio;
1669                 ci.sector_count = bio_sectors(bio);
1670                 while (ci.sector_count && !error) {
1671                         error = __split_and_process_non_flush(&ci);
1672                         if (current->bio_list && ci.sector_count && !error) {
1673                                 /*
1674                                  * Remainder must be passed to generic_make_request()
1675                                  * so that it gets handled *after* bios already submitted
1676                                  * have been completely processed.
1677                                  * We take a clone of the original to store in
1678                                  * ci.io->orig_bio to be used by end_io_acct() and
1679                                  * for dec_pending to use for completion handling.
1680                                  * As this path is not used for REQ_OP_ZONE_REPORT,
1681                                  * the usage of io->orig_bio in dm_remap_zone_report()
1682                                  * won't be affected by this reassignment.
1683                                  */
1684                                 struct bio *b = bio_split(bio, bio_sectors(bio) - ci.sector_count,
1685                                                           GFP_NOIO, &md->queue->bio_split);
1686                                 ci.io->orig_bio = b;
1687                                 bio_chain(b, bio);
1688                                 ret = generic_make_request(bio);
1689                                 break;
1690                         }
1691                 }
1692         }
1693
1694         /* drop the extra reference count */
1695         dec_pending(ci.io, errno_to_blk_status(error));
1696         return ret;
1697 }
1698
1699 /*
1700  * Optimized variant of __split_and_process_bio that leverages the
1701  * fact that targets that use it do _not_ have a need to split bios.
1702  */
1703 static blk_qc_t __process_bio(struct mapped_device *md,
1704                               struct dm_table *map, struct bio *bio)
1705 {
1706         struct clone_info ci;
1707         blk_qc_t ret = BLK_QC_T_NONE;
1708         int error = 0;
1709
1710         if (unlikely(!map)) {
1711                 bio_io_error(bio);
1712                 return ret;
1713         }
1714
1715         init_clone_info(&ci, md, map, bio);
1716
1717         if (bio->bi_opf & REQ_PREFLUSH) {
1718                 ci.bio = &ci.io->md->flush_bio;
1719                 ci.sector_count = 0;
1720                 error = __send_empty_flush(&ci);
1721                 /* dec_pending submits any data associated with flush */
1722         } else {
1723                 struct dm_target *ti = md->immutable_target;
1724                 struct dm_target_io *tio;
1725
1726                 /*
1727                  * Defend against IO still getting in during teardown
1728                  * - as was seen for a time with nvme-fcloop
1729                  */
1730                 if (unlikely(WARN_ON_ONCE(!ti || !dm_target_is_valid(ti)))) {
1731                         error = -EIO;
1732                         goto out;
1733                 }
1734
1735                 ci.bio = bio;
1736                 ci.sector_count = bio_sectors(bio);
1737                 if (unlikely(__process_abnormal_io(&ci, ti, &error)))
1738                         goto out;
1739
1740                 tio = alloc_tio(&ci, ti, 0, GFP_NOIO);
1741                 ret = __clone_and_map_simple_bio(&ci, tio, NULL);
1742         }
1743 out:
1744         /* drop the extra reference count */
1745         dec_pending(ci.io, errno_to_blk_status(error));
1746         return ret;
1747 }
1748
1749 typedef blk_qc_t (process_bio_fn)(struct mapped_device *, struct dm_table *, struct bio *);
1750
1751 static blk_qc_t __dm_make_request(struct request_queue *q, struct bio *bio,
1752                                   process_bio_fn process_bio)
1753 {
1754         struct mapped_device *md = q->queuedata;
1755         blk_qc_t ret = BLK_QC_T_NONE;
1756         int srcu_idx;
1757         struct dm_table *map;
1758
1759         map = dm_get_live_table(md, &srcu_idx);
1760
1761         /* if we're suspended, we have to queue this io for later */
1762         if (unlikely(test_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags))) {
1763                 dm_put_live_table(md, srcu_idx);
1764
1765                 if (!(bio->bi_opf & REQ_RAHEAD))
1766                         queue_io(md, bio);
1767                 else
1768                         bio_io_error(bio);
1769                 return ret;
1770         }
1771
1772         ret = process_bio(md, map, bio);
1773
1774         dm_put_live_table(md, srcu_idx);
1775         return ret;
1776 }
1777
1778 /*
1779  * The request function that remaps the bio to one target and
1780  * splits off any remainder.
1781  */
1782 static blk_qc_t dm_make_request(struct request_queue *q, struct bio *bio)
1783 {
1784         return __dm_make_request(q, bio, __split_and_process_bio);
1785 }
1786
1787 static blk_qc_t dm_make_request_nvme(struct request_queue *q, struct bio *bio)
1788 {
1789         return __dm_make_request(q, bio, __process_bio);
1790 }
1791
1792 static int dm_any_congested(void *congested_data, int bdi_bits)
1793 {
1794         int r = bdi_bits;
1795         struct mapped_device *md = congested_data;
1796         struct dm_table *map;
1797
1798         if (!test_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags)) {
1799                 if (dm_request_based(md)) {
1800                         /*
1801                          * With request-based DM we only need to check the
1802                          * top-level queue for congestion.
1803                          */
1804                         r = md->queue->backing_dev_info->wb.state & bdi_bits;
1805                 } else {
1806                         map = dm_get_live_table_fast(md);
1807                         if (map)
1808                                 r = dm_table_any_congested(map, bdi_bits);
1809                         dm_put_live_table_fast(md);
1810                 }
1811         }
1812
1813         return r;
1814 }
1815
1816 /*-----------------------------------------------------------------
1817  * An IDR is used to keep track of allocated minor numbers.
1818  *---------------------------------------------------------------*/
/*
 * Drop @minor from _minor_idr so it can be handed out again.
 * The removal is done under _minor_lock.
 */
static void free_minor(int minor)
{
        spin_lock(&_minor_lock);
        idr_remove(&_minor_idr, minor);
        spin_unlock(&_minor_lock);
}
1825
1826 /*
1827  * See if the device with a specific minor # is free.
1828  */
1829 static int specific_minor(int minor)
1830 {
1831         int r;
1832
1833         if (minor >= (1 << MINORBITS))
1834                 return -EINVAL;
1835
1836         idr_preload(GFP_KERNEL);
1837         spin_lock(&_minor_lock);
1838
1839         r = idr_alloc(&_minor_idr, MINOR_ALLOCED, minor, minor + 1, GFP_NOWAIT);
1840
1841         spin_unlock(&_minor_lock);
1842         idr_preload_end();
1843         if (r < 0)
1844                 return r == -ENOSPC ? -EBUSY : r;
1845         return 0;
1846 }
1847
1848 static int next_free_minor(int *minor)
1849 {
1850         int r;
1851
1852         idr_preload(GFP_KERNEL);
1853         spin_lock(&_minor_lock);
1854
1855         r = idr_alloc(&_minor_idr, MINOR_ALLOCED, 0, 1 << MINORBITS, GFP_NOWAIT);
1856
1857         spin_unlock(&_minor_lock);
1858         idr_preload_end();
1859         if (r < 0)
1860                 return r;
1861         *minor = r;
1862         return 0;
1863 }
1864
1865 static const struct block_device_operations dm_blk_dops;
1866 static const struct dax_operations dm_dax_ops;
1867
1868 static void dm_wq_work(struct work_struct *work);
1869
1870 static void dm_init_normal_md_queue(struct mapped_device *md)
1871 {
1872         md->use_blk_mq = false;
1873
1874         /*
1875          * Initialize aspects of queue that aren't relevant for blk-mq
1876          */
1877         md->queue->backing_dev_info->congested_data = md;
1878         md->queue->backing_dev_info->congested_fn = dm_any_congested;
1879 }
1880
/*
 * Release the resources owned by a mapped_device, leaving the structure
 * itself for the caller to free.
 *
 * Called from the alloc_dev() error path as well as from free_dev(), so
 * most members are tested before they are released.
 */
static void cleanup_mapped_device(struct mapped_device *md)
{
        if (md->wq)
                destroy_workqueue(md->wq);
        if (md->kworker_task)
                kthread_stop(md->kworker_task);
        bioset_exit(&md->bs);
        bioset_exit(&md->io_bs);

        if (md->dax_dev) {
                kill_dax(md->dax_dev);
                put_dax(md->dax_dev);
                md->dax_dev = NULL;
        }

        if (md->disk) {
                /*
                 * NOTE(review): private_data is cleared under _minor_lock,
                 * presumably so that readers of disk->private_data holding
                 * the same lock observe NULL -- confirm against the block
                 * device open path.
                 */
                spin_lock(&_minor_lock);
                md->disk->private_data = NULL;
                spin_unlock(&_minor_lock);
                del_gendisk(md->disk);
                put_disk(md->disk);
        }

        if (md->queue)
                blk_cleanup_queue(md->queue);

        cleanup_srcu_struct(&md->io_barrier);

        if (md->bdev) {
                bdput(md->bdev);
                md->bdev = NULL;
        }

        mutex_destroy(&md->suspend_lock);
        mutex_destroy(&md->type_lock);
        mutex_destroy(&md->table_devices_lock);
        mutex_destroy(&md->swap_bios_lock);

        dm_mq_cleanup_mapped_device(md);
}
1921
1922 /*
1923  * Allocate and initialise a blank device with a given minor.
1924  */
/*
 * @minor: the minor number to claim, or DM_ANY_MINOR to take the next
 *         free one.
 *
 * Returns the new mapped_device, or NULL on any failure (all partially
 * acquired resources are released before returning).
 */
static struct mapped_device *alloc_dev(int minor)
{
        int r, numa_node_id = dm_get_numa_node();
        struct dax_device *dax_dev = NULL;
        struct mapped_device *md;
        void *old_md;

        /*
         * Zeroed allocation: members not set below start out as 0/NULL,
         * which the NULL tests in cleanup_mapped_device() depend on when
         * it is reached through the 'bad' label.
         */
        md = kvzalloc_node(sizeof(*md), GFP_KERNEL, numa_node_id);
        if (!md) {
                DMWARN("unable to allocate device, out of memory.");
                return NULL;
        }

        if (!try_module_get(THIS_MODULE))
                goto bad_module_get;

        /* get a minor number for the dev */
        if (minor == DM_ANY_MINOR)
                r = next_free_minor(&minor);
        else
                r = specific_minor(minor);
        if (r < 0)
                goto bad_minor;

        r = init_srcu_struct(&md->io_barrier);
        if (r < 0)
                goto bad_io_barrier;

        md->numa_node_id = numa_node_id;
        md->use_blk_mq = dm_use_blk_mq_default();
        md->init_tio_pdu = false;
        md->type = DM_TYPE_NONE;
        mutex_init(&md->suspend_lock);
        mutex_init(&md->type_lock);
        mutex_init(&md->table_devices_lock);
        spin_lock_init(&md->deferred_lock);
        /* The device starts life with a single holder reference. */
        atomic_set(&md->holders, 1);
        atomic_set(&md->open_count, 0);
        atomic_set(&md->event_nr, 0);
        atomic_set(&md->uevent_seq, 0);
        INIT_LIST_HEAD(&md->uevent_list);
        INIT_LIST_HEAD(&md->table_devices);
        spin_lock_init(&md->uevent_lock);

        md->queue = blk_alloc_queue_node(GFP_KERNEL, numa_node_id, NULL);
        if (!md->queue)
                goto bad;
        md->queue->queuedata = md;
        /*
         * default to bio-based required ->make_request_fn until DM
         * table is loaded and md->type established. If request-based
         * table is loaded: blk-mq will override accordingly.
         */
        blk_queue_make_request(md->queue, dm_make_request);

        md->disk = alloc_disk_node(1, md->numa_node_id);
        if (!md->disk)
                goto bad;

        atomic_set(&md->pending[0], 0);
        atomic_set(&md->pending[1], 0);
        init_waitqueue_head(&md->wait);
        INIT_WORK(&md->work, dm_wq_work);
        init_waitqueue_head(&md->eventq);
        init_completion(&md->kobj_holder.completion);
        md->kworker_task = NULL;

        md->swap_bios = get_swap_bios();
        sema_init(&md->swap_bios_semaphore, md->swap_bios);
        mutex_init(&md->swap_bios_lock);

        md->disk->major = _major;
        md->disk->first_minor = minor;
        md->disk->fops = &dm_blk_dops;
        md->disk->queue = md->queue;
        md->disk->private_data = md;
        sprintf(md->disk->disk_name, "dm-%d", minor);

        /* dax_dev stays NULL when CONFIG_DAX_DRIVER is not enabled. */
        if (IS_ENABLED(CONFIG_DAX_DRIVER)) {
                dax_dev = alloc_dax(md, md->disk->disk_name, &dm_dax_ops);
                if (!dax_dev)
                        goto bad;
        }
        md->dax_dev = dax_dev;

        add_disk_no_queue_reg(md->disk);
        format_dev_t(md->name, MKDEV(_major, minor));

        md->wq = alloc_workqueue("kdmflush", WQ_MEM_RECLAIM, 0);
        if (!md->wq)
                goto bad;

        md->bdev = bdget_disk(md->disk, 0);
        if (!md->bdev)
                goto bad;

        bio_init(&md->flush_bio, NULL, 0);
        bio_set_dev(&md->flush_bio, md->bdev);
        md->flush_bio.bi_opf = REQ_OP_WRITE | REQ_PREFLUSH | REQ_SYNC;

        dm_stats_init(&md->stats);

        /* Populate the mapping, nobody knows we exist yet */
        spin_lock(&_minor_lock);
        old_md = idr_replace(&_minor_idr, md, minor);
        spin_unlock(&_minor_lock);

        /* The slot must still hold the placeholder reserved above. */
        BUG_ON(old_md != MINOR_ALLOCED);

        return md;

        /*
         * Error unwinding: each label undoes the steps that had succeeded
         * before the failing one and falls through to the labels below it.
         */
bad:
        cleanup_mapped_device(md);
bad_io_barrier:
        free_minor(minor);
bad_minor:
        module_put(THIS_MODULE);
bad_module_get:
        kvfree(md);
        return NULL;
}
2046
2047 static void unlock_fs(struct mapped_device *md);
2048
2049 static void free_dev(struct mapped_device *md)
2050 {
2051         int minor = MINOR(disk_devt(md->disk));
2052
2053         unlock_fs(md);
2054
2055         cleanup_mapped_device(md);
2056
2057         free_table_devices(&md->table_devices);
2058         dm_stats_cleanup(&md->stats);
2059         free_minor(minor);
2060
2061         module_put(THIS_MODULE);
2062         kvfree(md);
2063 }
2064
2065 static int __bind_mempools(struct mapped_device *md, struct dm_table *t)
2066 {
2067         struct dm_md_mempools *p = dm_table_get_md_mempools(t);
2068         int ret = 0;
2069
2070         if (dm_table_bio_based(t)) {
2071                 /*
2072                  * The md may already have mempools that need changing.
2073                  * If so, reload bioset because front_pad may have changed
2074                  * because a different table was loaded.
2075                  */
2076                 bioset_exit(&md->bs);
2077                 bioset_exit(&md->io_bs);
2078
2079         } else if (bioset_initialized(&md->bs)) {
2080                 /*
2081                  * There's no need to reload with request-based dm
2082                  * because the size of front_pad doesn't change.
2083                  * Note for future: If you are to reload bioset,
2084                  * prep-ed requests in the queue may refer
2085                  * to bio from the old bioset, so you must walk
2086                  * through the queue to unprep.
2087                  */
2088                 goto out;
2089         }
2090
2091         BUG_ON(!p ||
2092                bioset_initialized(&md->bs) ||
2093                bioset_initialized(&md->io_bs));
2094
2095         ret = bioset_init_from_src(&md->bs, &p->bs);
2096         if (ret)
2097                 goto out;
2098         ret = bioset_init_from_src(&md->io_bs, &p->io_bs);
2099         if (ret)
2100                 bioset_exit(&md->bs);
2101 out:
2102         /* mempool bind completed, no longer need any mempools in the table */
2103         dm_table_free_md_mempools(t);
2104         return ret;
2105 }
2106
/*
 * Table event callback: send the uevents queued on the device and
 * notify anyone waiting for device or global events.
 */
2110 static void event_callback(void *context)
2111 {
2112         unsigned long flags;
2113         LIST_HEAD(uevents);
2114         struct mapped_device *md = (struct mapped_device *) context;
2115
2116         spin_lock_irqsave(&md->uevent_lock, flags);
2117         list_splice_init(&md->uevent_list, &uevents);
2118         spin_unlock_irqrestore(&md->uevent_lock, flags);
2119
2120         dm_send_uevents(&uevents, &disk_to_dev(md->disk)->kobj);
2121
2122         atomic_inc(&md->event_nr);
2123         wake_up(&md->eventq);
2124         dm_issue_global_event();
2125 }
2126
2127 /*
2128  * Protected by md->suspend_lock obtained by dm_swap_table().
2129  */
2130 static void __set_size(struct mapped_device *md, sector_t size)
2131 {
2132         lockdep_assert_held(&md->suspend_lock);
2133
2134         set_capacity(md->disk, size);
2135
2136         i_size_write(md->bdev->bd_inode, (loff_t)size << SECTOR_SHIFT);
2137 }
2138
2139 /*
2140  * Returns old map, which caller must destroy.
2141  */
/*
 * Make @t the device's table and apply @limits to its queue.
 *
 * Must be called with md->suspend_lock held.  Returns the previous table
 * (NULL if there was none), or an ERR_PTR if binding the mempools failed.
 */
static struct dm_table *__bind(struct mapped_device *md, struct dm_table *t,
                               struct queue_limits *limits)
{
        struct dm_table *old_map;
        struct request_queue *q = md->queue;
        bool request_based = dm_table_request_based(t);
        sector_t size;
        int ret;

        lockdep_assert_held(&md->suspend_lock);

        size = dm_table_get_size(t);

        /*
         * Wipe any geometry if the size of the table changed.
         */
        if (size != dm_get_size(md))
                memset(&md->geometry, 0, sizeof(md->geometry));

        __set_size(md, size);

        dm_table_event_callback(t, event_callback, md);

        /*
         * The queue hasn't been stopped yet, if the old table type wasn't
         * for request-based during suspension.  So stop it to prevent
         * I/O mapping before resume.
         * This must be done before setting the queue restrictions,
         * because request-based dm may be run just after the setting.
         */
        if (request_based)
                dm_stop_queue(q);

        if (request_based || md->type == DM_TYPE_NVME_BIO_BASED) {
                /*
                 * Leverage the fact that request-based DM targets and
                 * NVMe bio based targets are immutable singletons
                 * - used to optimize both dm_request_fn and dm_mq_queue_rq;
                 *   and __process_bio.
                 */
                md->immutable_target = dm_table_get_immutable_target(t);
        }

        /*
         * On failure md->map is left untouched, but the size, event
         * callback and queue stop performed above are not reverted.
         */
        ret = __bind_mempools(md, t);
        if (ret) {
                old_map = ERR_PTR(ret);
                goto out;
        }

        /* Publish the new table; the old one is handed back to the caller. */
        old_map = rcu_dereference_protected(md->map, lockdep_is_held(&md->suspend_lock));
        rcu_assign_pointer(md->map, (void *)t);
        md->immutable_target_type = dm_table_get_immutable_target_type(t);

        dm_table_set_restrictions(t, q, limits);
        /* NOTE(review): presumably waits out readers of the old map -- confirm in dm_sync_table(). */
        if (old_map)
                dm_sync_table(md);

out:
        return old_map;
}
2202
2203 /*
2204  * Returns unbound table for the caller to free.
2205  */
2206 static struct dm_table *__unbind(struct mapped_device *md)
2207 {
2208         struct dm_table *map = rcu_dereference_protected(md->map, 1);
2209
2210         if (!map)
2211                 return NULL;
2212
2213         dm_table_event_callback(map, NULL, NULL);
2214         RCU_INIT_POINTER(md->map, NULL);
2215         dm_sync_table(md);
2216
2217         return map;
2218 }
2219
2220 /*
2221  * Constructor for a new device.
2222  */
2223 int dm_create(int minor, struct mapped_device **result)
2224 {
2225         int r;
2226         struct mapped_device *md;
2227
2228         md = alloc_dev(minor);
2229         if (!md)
2230                 return -ENXIO;
2231
2232         r = dm_sysfs_init(md);
2233         if (r) {
2234                 free_dev(md);
2235                 return r;
2236         }
2237
2238         *result = md;
2239         return 0;
2240 }
2241
2242 /*
2243  * Functions to manage md->type.
2244  * All are required to hold md->type_lock.
2245  */
/* Take md->type_lock; pair with dm_unlock_md_type(). */
void dm_lock_md_type(struct mapped_device *md)
{
        mutex_lock(&md->type_lock);
}
2250
/* Release md->type_lock taken by dm_lock_md_type(). */
void dm_unlock_md_type(struct mapped_device *md)
{
        mutex_unlock(&md->type_lock);
}
2255
/*
 * Record @type as the device's queue mode.  md->type_lock must be locked
 * when this is called; that is enforced with BUG_ON.
 */
void dm_set_md_type(struct mapped_device *md, enum dm_queue_mode type)
{
        BUG_ON(!mutex_is_locked(&md->type_lock));
        md->type = type;
}
2261
/* Return the device's current queue mode.  No lock is taken or checked here. */
enum dm_queue_mode dm_get_md_type(struct mapped_device *md)
{
        return md->type;
}
2266
/* Return the immutable target type recorded for the device by __bind(). */
struct target_type *dm_get_immutable_target_type(struct mapped_device *md)
{
        return md->immutable_target_type;
}
2271
2272 /*
2273  * The queue_limits are only valid as long as you have a reference
2274  * count on 'md'.
2275  */
struct queue_limits *dm_get_queue_limits(struct mapped_device *md)
{
        /* Trips when the holder count is zero, i.e. nobody holds a reference. */
        BUG_ON(!atomic_read(&md->holders));
        /* Points into md->queue; nothing is copied. */
        return &md->queue->limits;
}
EXPORT_SYMBOL_GPL(dm_get_queue_limits);
2282
2283 /*
2284  * Setup the DM device's queue based on md's type
2285  */
2286 int dm_setup_md_queue(struct mapped_device *md, struct dm_table *t)
2287 {
2288         int r;
2289         struct queue_limits limits;
2290         enum dm_queue_mode type = dm_get_md_type(md);
2291
2292         switch (type) {
2293         case DM_TYPE_REQUEST_BASED:
2294                 dm_init_normal_md_queue(md);
2295                 r = dm_old_init_request_queue(md, t);
2296                 if (r) {
2297                         DMERR("Cannot initialize queue for request-based mapped device");
2298                         return r;
2299                 }
2300                 break;
2301         case DM_TYPE_MQ_REQUEST_BASED:
2302                 r = dm_mq_init_request_queue(md, t);
2303                 if (r) {
2304                         DMERR("Cannot initialize queue for request-based dm-mq mapped device");
2305                         return r;
2306                 }
2307                 break;
2308         case DM_TYPE_BIO_BASED:
2309         case DM_TYPE_DAX_BIO_BASED:
2310                 dm_init_normal_md_queue(md);
2311                 break;
2312         case DM_TYPE_NVME_BIO_BASED:
2313                 dm_init_normal_md_queue(md);
2314                 blk_queue_make_request(md->queue, dm_make_request_nvme);
2315                 break;
2316         case DM_TYPE_NONE:
2317                 WARN_ON_ONCE(true);
2318                 break;
2319         }
2320
2321         r = dm_calculate_queue_limits(t, &limits);
2322         if (r) {
2323                 DMERR("Cannot calculate initial queue limits");
2324                 return r;
2325         }
2326         dm_table_set_restrictions(t, md->queue, &limits);
2327         blk_register_queue(md->disk);
2328
2329         return 0;
2330 }
2331
2332 struct mapped_device *dm_get_md(dev_t dev)
2333 {
2334         struct mapped_device *md;
2335         unsigned minor = MINOR(dev);
2336
2337         if (MAJOR(dev) != _major || minor >= (1 << MINORBITS))
2338                 return NULL;
2339
2340         spin_lock(&_minor_lock);
2341
2342         md = idr_find(&_minor_idr, minor);
2343         if (!md || md == MINOR_ALLOCED || (MINOR(disk_devt(dm_disk(md))) != minor) ||
2344             test_bit(DMF_FREEING, &md->flags) || dm_deleting_md(md)) {
2345                 md = NULL;
2346                 goto out;
2347         }
2348         dm_get(md);
2349 out:
2350         spin_unlock(&_minor_lock);
2351
2352         return md;
2353 }
2354 EXPORT_SYMBOL_GPL(dm_get_md);
2355
/* Return the opaque pointer stored with dm_set_mdptr(). */
void *dm_get_mdptr(struct mapped_device *md)
{
        return md->interface_ptr;
}
2360
/* Store an opaque pointer on the device; read back with dm_get_mdptr(). */
void dm_set_mdptr(struct mapped_device *md, void *ptr)
{
        md->interface_ptr = ptr;
}
2365
/*
 * Take a holder reference on @md.
 *
 * It is a bug to call this once the device is marked DMF_FREEING; callers
 * that may race with destruction should use dm_hold(), which tests the
 * flag under _minor_lock first.
 */
void dm_get(struct mapped_device *md)
{
        atomic_inc(&md->holders);
        BUG_ON(test_bit(DMF_FREEING, &md->flags));
}
2371
2372 int dm_hold(struct mapped_device *md)
2373 {
2374         spin_lock(&_minor_lock);
2375         if (test_bit(DMF_FREEING, &md->flags)) {
2376                 spin_unlock(&_minor_lock);
2377                 return -EBUSY;
2378         }
2379         dm_get(md);
2380         spin_unlock(&_minor_lock);
2381         return 0;
2382 }
2383 EXPORT_SYMBOL_GPL(dm_hold);
2384
/*
 * Return the device's name string (md->name), which alloc_dev() fills in
 * from the major/minor pair with format_dev_t().  The string belongs to
 * @md; the caller must not free it.
 */
const char *dm_device_name(struct mapped_device *md)
{
        return md->name;
}
EXPORT_SYMBOL_GPL(dm_device_name);
2390
/*
 * Tear down and free a mapped_device.
 *
 * @wait: if true, sleep until the holder count drops to zero before
 *        freeing; if false, only warn when holders remain and free anyway.
 *
 * May sleep.  @md must not be used after this returns.
 */
static void __dm_destroy(struct mapped_device *md, bool wait)
{
        struct dm_table *map;
        int srcu_idx;

        might_sleep();

        /*
         * Put the placeholder back in the minor's slot and mark the device
         * as freeing: from here on dm_get_md() and dm_hold() refuse to
         * hand out new references.
         */
        spin_lock(&_minor_lock);
        idr_replace(&_minor_idr, MINOR_ALLOCED, MINOR(disk_devt(dm_disk(md))));
        set_bit(DMF_FREEING, &md->flags);
        spin_unlock(&_minor_lock);

        blk_set_queue_dying(md->queue);

        if (dm_request_based(md) && md->kworker_task)
                kthread_flush_worker(&md->kworker);

        /*
         * Take suspend_lock so that presuspend and postsuspend methods
         * do not race with internal suspend.
         */
        mutex_lock(&md->suspend_lock);
        map = dm_get_live_table(md, &srcu_idx);
        if (!dm_suspended_md(md)) {
                dm_table_presuspend_targets(map);
                /* DMF_POST_SUSPENDING is left set; the device is about to be freed. */
                set_bit(DMF_SUSPENDED, &md->flags);
                set_bit(DMF_POST_SUSPENDING, &md->flags);
                dm_table_postsuspend_targets(map);
        }
        /* dm_put_live_table must be before msleep, otherwise deadlock is possible */
        dm_put_live_table(md, srcu_idx);
        mutex_unlock(&md->suspend_lock);

        /*
         * Rare, but there may be I/O requests still going to complete,
         * for example.  Wait for all references to disappear.
         * No one should increment the reference count of the mapped_device,
         * after the mapped_device state becomes DMF_FREEING.
         */
        if (wait)
                while (atomic_read(&md->holders))
                        msleep(1);
        else if (atomic_read(&md->holders))
                DMWARN("%s: Forcibly removing mapped_device still in use! (%d users)",
                       dm_device_name(md), atomic_read(&md->holders));

        dm_sysfs_exit(md);
        /* __unbind() returns NULL when no table is bound. */
        dm_table_destroy(__unbind(md));
        free_dev(md);
}
2441
/* Destroy @md, sleeping until every holder reference has been dropped. */
void dm_destroy(struct mapped_device *md)
{
        __dm_destroy(md, true);
}
2446
/*
 * Destroy @md without waiting for holders; a warning is logged if any
 * references are still outstanding.
 */
void dm_destroy_immediate(struct mapped_device *md)
{
        __dm_destroy(md, false);
}
2451
/*
 * Drop a holder reference.  Nothing is freed here, even when the count
 * reaches zero; __dm_destroy() polls the count instead.
 */
void dm_put(struct mapped_device *md)
{
        atomic_dec(&md->holders);
}
EXPORT_SYMBOL_GPL(dm_put);
2457
2458 static int dm_wait_for_completion(struct mapped_device *md, long task_state)
2459 {
2460         int r = 0;
2461         DEFINE_WAIT(wait);
2462
2463         while (1) {
2464                 prepare_to_wait(&md->wait, &wait, task_state);
2465
2466                 if (!md_in_flight(md))
2467                         break;
2468
2469                 if (signal_pending_state(task_state, current)) {
2470                         r = -EINTR;
2471                         break;
2472                 }
2473
2474                 io_schedule();
2475         }
2476         finish_wait(&md->wait, &wait);
2477
2478         smp_rmb(); /* paired with atomic_dec_return in end_io_acct */
2479
2480         return r;
2481 }
2482
2483 /*
2484  * Process the deferred bios
2485  */
2486 static void dm_wq_work(struct work_struct *work)
2487 {
2488         struct mapped_device *md = container_of(work, struct mapped_device,
2489                                                 work);
2490         struct bio *c;
2491         int srcu_idx;
2492         struct dm_table *map;
2493
2494         map = dm_get_live_table(md, &srcu_idx);
2495
2496         while (!test_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags)) {
2497                 spin_lock_irq(&md->deferred_lock);
2498                 c = bio_list_pop(&md->deferred);
2499                 spin_unlock_irq(&md->deferred_lock);
2500
2501                 if (!c)
2502                         break;
2503
2504                 if (dm_request_based(md))
2505                         generic_make_request(c);
2506                 else
2507                         __split_and_process_bio(md, map, c);
2508         }
2509
2510         dm_put_live_table(md, srcu_idx);
2511 }
2512
/*
 * Unblock I/O and schedule dm_wq_work() on md->wq to resubmit the bios
 * that were deferred while DMF_BLOCK_IO_FOR_SUSPEND was set.
 */
static void dm_queue_flush(struct mapped_device *md)
{
        clear_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags);
        /* Order the flag clear before the work is queued; the worker tests the flag. */
        smp_mb__after_atomic();
        queue_work(md->wq, &md->work);
}
2519
2520 /*
2521  * Swap in a new table, returning the old one for the caller to destroy.
2522  */
2523 struct dm_table *dm_swap_table(struct mapped_device *md, struct dm_table *table)
2524 {
2525         struct dm_table *live_map = NULL, *map = ERR_PTR(-EINVAL);
2526         struct queue_limits limits;
2527         int r;
2528
2529         mutex_lock(&md->suspend_lock);
2530
2531         /* device must be suspended */
2532         if (!dm_suspended_md(md))
2533                 goto out;
2534
2535         /*
2536          * If the new table has no data devices, retain the existing limits.
2537          * This helps multipath with queue_if_no_path if all paths disappear,
2538          * then new I/O is queued based on these limits, and then some paths
2539          * reappear.
2540          */
2541         if (dm_table_has_no_data_devices(table)) {
2542                 live_map = dm_get_live_table_fast(md);
2543                 if (live_map)
2544                         limits = md->queue->limits;
2545                 dm_put_live_table_fast(md);
2546         }
2547
2548         if (!live_map) {
2549                 r = dm_calculate_queue_limits(table, &limits);
2550                 if (r) {
2551                         map = ERR_PTR(r);
2552                         goto out;
2553                 }
2554         }
2555
2556         map = __bind(md, table, &limits);
2557         dm_issue_global_event();
2558
2559 out:
2560         mutex_unlock(&md->suspend_lock);
2561         return map;
2562 }
2563
2564 /*
2565  * Functions to lock and unlock any filesystem running on the
2566  * device.
2567  */
2568 static int lock_fs(struct mapped_device *md)
2569 {
2570         int r;
2571
2572         WARN_ON(md->frozen_sb);
2573
2574         md->frozen_sb = freeze_bdev(md->bdev);
2575         if (IS_ERR(md->frozen_sb)) {
2576                 r = PTR_ERR(md->frozen_sb);
2577                 md->frozen_sb = NULL;
2578                 return r;
2579         }
2580
2581         set_bit(DMF_FROZEN, &md->flags);
2582
2583         return 0;
2584 }
2585
2586 static void unlock_fs(struct mapped_device *md)
2587 {
2588         if (!test_bit(DMF_FROZEN, &md->flags))
2589                 return;
2590
2591         thaw_bdev(md->bdev, md->frozen_sb);
2592         md->frozen_sb = NULL;
2593         clear_bit(DMF_FROZEN, &md->flags);
2594 }
2595
2596 /*
2597  * @suspend_flags: DM_SUSPEND_LOCKFS_FLAG and/or DM_SUSPEND_NOFLUSH_FLAG
2598  * @task_state: e.g. TASK_INTERRUPTIBLE or TASK_UNINTERRUPTIBLE
2599  * @dmf_suspended_flag: DMF_SUSPENDED or DMF_SUSPENDED_INTERNALLY
2600  *
2601  * If __dm_suspend returns 0, the device is completely quiescent
2602  * now. There is no request-processing activity. All new requests
2603  * are being added to md->deferred list.
2604  */
static int __dm_suspend(struct mapped_device *md, struct dm_table *map,
                        unsigned suspend_flags, long task_state,
                        int dmf_suspended_flag)
{
        bool do_lockfs = suspend_flags & DM_SUSPEND_LOCKFS_FLAG;
        bool noflush = suspend_flags & DM_SUSPEND_NOFLUSH_FLAG;
        int r;

        lockdep_assert_held(&md->suspend_lock);

        /*
         * DMF_NOFLUSH_SUSPENDING must be set before presuspend.
         * This flag is cleared before dm_suspend returns.
         */
        if (noflush)
                set_bit(DMF_NOFLUSH_SUSPENDING, &md->flags);
        else
                pr_debug("%s: suspending with flush\n", dm_device_name(md));

        /*
         * This gets reverted if there's an error later and the targets
         * provide the .presuspend_undo hook.
         */
        dm_table_presuspend_targets(map);

        /*
         * Flush I/O to the device.
         * Any I/O submitted after lock_fs() may not be flushed.
         * noflush takes precedence over do_lockfs.
         * (lock_fs() flushes I/Os and waits for them to complete.)
         */
        if (!noflush && do_lockfs) {
                r = lock_fs(md);
                if (r) {
                        /* Nothing else has been changed yet; only presuspend needs undoing. */
                        dm_table_presuspend_undo_targets(map);
                        return r;
                }
        }

        /*
         * Here we must make sure that no processes are submitting requests
         * to target drivers i.e. no one may be executing
         * __split_and_process_bio. This is called from dm_request and
         * dm_wq_work.
         *
         * To get all processes out of __split_and_process_bio in dm_request,
         * we take the write lock. To prevent any process from reentering
         * __split_and_process_bio from dm_request and quiesce the thread
         * (dm_wq_work), we set DMF_BLOCK_IO_FOR_SUSPEND and call
         * flush_workqueue(md->wq).
         */
        set_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags);
        if (map)
                synchronize_srcu(&md->io_barrier);

        /*
         * Stop md->queue before flushing md->wq in case request-based
         * dm defers requests to md->wq from md->queue.
         */
        if (dm_request_based(md)) {
                dm_stop_queue(md->queue);
                if (md->kworker_task)
                        kthread_flush_worker(&md->kworker);
        }

        flush_workqueue(md->wq);

        /*
         * At this point no more requests are entering target request routines.
         * We call dm_wait_for_completion to wait for all existing requests
         * to finish.
         */
        r = dm_wait_for_completion(md, task_state);
        /* Mark the device suspended only if the wait ran to completion. */
        if (!r)
                set_bit(dmf_suspended_flag, &md->flags);

        if (noflush)
                clear_bit(DMF_NOFLUSH_SUSPENDING, &md->flags);
        if (map)
                synchronize_srcu(&md->io_barrier);

        /* were we interrupted ? */
        if (r < 0) {
                /* Back out: unblock I/O, restart the queue, thaw, undo presuspend. */
                dm_queue_flush(md);

                if (dm_request_based(md))
                        dm_start_queue(md->queue);

                unlock_fs(md);
                dm_table_presuspend_undo_targets(map);
                /* pushback list is already flushed, so skip flush */
        }

        return r;
}
2700
2701 /*
2702  * We need to be able to change a mapping table under a mounted
2703  * filesystem.  For example we might want to move some data in
2704  * the background.  Before the table can be swapped with
2705  * dm_bind_table, dm_suspend must be called to flush any in
2706  * flight bios and ensure that any further io gets deferred.
2707  */
2708 /*
2709  * Suspend mechanism in request-based dm.
2710  *
2711  * 1. Flush all I/Os by lock_fs() if needed.
2712  * 2. Stop dispatching any I/O by stopping the request_queue.
2713  * 3. Wait for all in-flight I/Os to be completed or requeued.
2714  *
2715  * To abort suspend, start the request_queue.
2716  */
2717 int dm_suspend(struct mapped_device *md, unsigned suspend_flags)
2718 {
2719         struct dm_table *map = NULL;
2720         int r = 0;
2721
2722 retry:
2723         mutex_lock_nested(&md->suspend_lock, SINGLE_DEPTH_NESTING);
2724
2725         if (dm_suspended_md(md)) {
2726                 r = -EINVAL;
2727                 goto out_unlock;
2728         }
2729
2730         if (dm_suspended_internally_md(md)) {
2731                 /* already internally suspended, wait for internal resume */
2732                 mutex_unlock(&md->suspend_lock);
2733                 r = wait_on_bit(&md->flags, DMF_SUSPENDED_INTERNALLY, TASK_INTERRUPTIBLE);
2734                 if (r)
2735                         return r;
2736                 goto retry;
2737         }
2738
2739         map = rcu_dereference_protected(md->map, lockdep_is_held(&md->suspend_lock));
2740
2741         r = __dm_suspend(md, map, suspend_flags, TASK_INTERRUPTIBLE, DMF_SUSPENDED);
2742         if (r)
2743                 goto out_unlock;
2744
2745         set_bit(DMF_POST_SUSPENDING, &md->flags);
2746         dm_table_postsuspend_targets(map);
2747         clear_bit(DMF_POST_SUSPENDING, &md->flags);
2748
2749 out_unlock:
2750         mutex_unlock(&md->suspend_lock);
2751         return r;
2752 }
2753
2754 static int __dm_resume(struct mapped_device *md, struct dm_table *map)
2755 {
2756         if (map) {
2757                 int r = dm_table_resume_targets(map);
2758                 if (r)
2759                         return r;
2760         }
2761
2762         dm_queue_flush(md);
2763
2764         /*
2765          * Flushing deferred I/Os must be done after targets are resumed
2766          * so that mapping of targets can work correctly.
2767          * Request-based dm is queueing the deferred I/Os in its request_queue.
2768          */
2769         if (dm_request_based(md))
2770                 dm_start_queue(md->queue);
2771
2772         unlock_fs(md);
2773
2774         return 0;
2775 }
2776
2777 int dm_resume(struct mapped_device *md)
2778 {
2779         int r;
2780         struct dm_table *map = NULL;
2781
2782 retry:
2783         r = -EINVAL;
2784         mutex_lock_nested(&md->suspend_lock, SINGLE_DEPTH_NESTING);
2785
2786         if (!dm_suspended_md(md))
2787                 goto out;
2788
2789         if (dm_suspended_internally_md(md)) {
2790                 /* already internally suspended, wait for internal resume */
2791                 mutex_unlock(&md->suspend_lock);
2792                 r = wait_on_bit(&md->flags, DMF_SUSPENDED_INTERNALLY, TASK_INTERRUPTIBLE);
2793                 if (r)
2794                         return r;
2795                 goto retry;
2796         }
2797
2798         map = rcu_dereference_protected(md->map, lockdep_is_held(&md->suspend_lock));
2799         if (!map || !dm_table_get_size(map))
2800                 goto out;
2801
2802         r = __dm_resume(md, map);
2803         if (r)
2804                 goto out;
2805
2806         clear_bit(DMF_SUSPENDED, &md->flags);
2807 out:
2808         mutex_unlock(&md->suspend_lock);
2809
2810         return r;
2811 }
2812
2813 /*
2814  * Internal suspend/resume works like userspace-driven suspend. It waits
2815  * until all bios finish and prevents issuing new bios to the target drivers.
2816  * It may be used only from the kernel.
2817  */
2818
2819 static void __dm_internal_suspend(struct mapped_device *md, unsigned suspend_flags)
2820 {
2821         struct dm_table *map = NULL;
2822
2823         lockdep_assert_held(&md->suspend_lock);
2824
2825         if (md->internal_suspend_count++)
2826                 return; /* nested internal suspend */
2827
2828         if (dm_suspended_md(md)) {
2829                 set_bit(DMF_SUSPENDED_INTERNALLY, &md->flags);
2830                 return; /* nest suspend */
2831         }
2832
2833         map = rcu_dereference_protected(md->map, lockdep_is_held(&md->suspend_lock));
2834
2835         /*
2836          * Using TASK_UNINTERRUPTIBLE because only NOFLUSH internal suspend is
2837          * supported.  Properly supporting a TASK_INTERRUPTIBLE internal suspend
2838          * would require changing .presuspend to return an error -- avoid this
2839          * until there is a need for more elaborate variants of internal suspend.
2840          */
2841         (void) __dm_suspend(md, map, suspend_flags, TASK_UNINTERRUPTIBLE,
2842                             DMF_SUSPENDED_INTERNALLY);
2843
2844         set_bit(DMF_POST_SUSPENDING, &md->flags);
2845         dm_table_postsuspend_targets(map);
2846         clear_bit(DMF_POST_SUSPENDING, &md->flags);
2847 }
2848
2849 static void __dm_internal_resume(struct mapped_device *md)
2850 {
2851         BUG_ON(!md->internal_suspend_count);
2852
2853         if (--md->internal_suspend_count)
2854                 return; /* resume from nested internal suspend */
2855
2856         if (dm_suspended_md(md))
2857                 goto done; /* resume from nested suspend */
2858
2859         /*
2860          * NOTE: existing callers don't need to call dm_table_resume_targets
2861          * (which may fail -- so best to avoid it for now by passing NULL map)
2862          */
2863         (void) __dm_resume(md, NULL);
2864
2865 done:
2866         clear_bit(DMF_SUSPENDED_INTERNALLY, &md->flags);
2867         smp_mb__after_atomic();
2868         wake_up_bit(&md->flags, DMF_SUSPENDED_INTERNALLY);
2869 }
2870
2871 void dm_internal_suspend_noflush(struct mapped_device *md)
2872 {
2873         mutex_lock(&md->suspend_lock);
2874         __dm_internal_suspend(md, DM_SUSPEND_NOFLUSH_FLAG);
2875         mutex_unlock(&md->suspend_lock);
2876 }
2877 EXPORT_SYMBOL_GPL(dm_internal_suspend_noflush);
2878
2879 void dm_internal_resume(struct mapped_device *md)
2880 {
2881         mutex_lock(&md->suspend_lock);
2882         __dm_internal_resume(md);
2883         mutex_unlock(&md->suspend_lock);
2884 }
2885 EXPORT_SYMBOL_GPL(dm_internal_resume);
2886
2887 /*
2888  * Fast variants of internal suspend/resume hold md->suspend_lock,
2889  * which prevents interaction with userspace-driven suspend.
2890  */
2891
2892 void dm_internal_suspend_fast(struct mapped_device *md)
2893 {
2894         mutex_lock(&md->suspend_lock);
2895         if (dm_suspended_md(md) || dm_suspended_internally_md(md))
2896                 return;
2897
2898         set_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags);
2899         synchronize_srcu(&md->io_barrier);
2900         flush_workqueue(md->wq);
2901         dm_wait_for_completion(md, TASK_UNINTERRUPTIBLE);
2902 }
2903 EXPORT_SYMBOL_GPL(dm_internal_suspend_fast);
2904
2905 void dm_internal_resume_fast(struct mapped_device *md)
2906 {
2907         if (dm_suspended_md(md) || dm_suspended_internally_md(md))
2908                 goto done;
2909
2910         dm_queue_flush(md);
2911
2912 done:
2913         mutex_unlock(&md->suspend_lock);
2914 }
2915 EXPORT_SYMBOL_GPL(dm_internal_resume_fast);
2916
2917 /*-----------------------------------------------------------------
2918  * Event notification.
2919  *---------------------------------------------------------------*/
2920 int dm_kobject_uevent(struct mapped_device *md, enum kobject_action action,
2921                        unsigned cookie)
2922 {
2923         int r;
2924         unsigned noio_flag;
2925         char udev_cookie[DM_COOKIE_LENGTH];
2926         char *envp[] = { udev_cookie, NULL };
2927
2928         noio_flag = memalloc_noio_save();
2929
2930         if (!cookie)
2931                 r = kobject_uevent(&disk_to_dev(md->disk)->kobj, action);
2932         else {
2933                 snprintf(udev_cookie, DM_COOKIE_LENGTH, "%s=%u",
2934                          DM_COOKIE_ENV_VAR_NAME, cookie);
2935                 r = kobject_uevent_env(&disk_to_dev(md->disk)->kobj,
2936                                        action, envp);
2937         }
2938
2939         memalloc_noio_restore(noio_flag);
2940
2941         return r;
2942 }
2943
2944 uint32_t dm_next_uevent_seq(struct mapped_device *md)
2945 {
2946         return atomic_add_return(1, &md->uevent_seq);
2947 }
2948
2949 uint32_t dm_get_event_nr(struct mapped_device *md)
2950 {
2951         return atomic_read(&md->event_nr);
2952 }
2953
2954 int dm_wait_event(struct mapped_device *md, int event_nr)
2955 {
2956         return wait_event_interruptible(md->eventq,
2957                         (event_nr != atomic_read(&md->event_nr)));
2958 }
2959
2960 void dm_uevent_add(struct mapped_device *md, struct list_head *elist)
2961 {
2962         unsigned long flags;
2963
2964         spin_lock_irqsave(&md->uevent_lock, flags);
2965         list_add(elist, &md->uevent_list);
2966         spin_unlock_irqrestore(&md->uevent_lock, flags);
2967 }
2968
2969 /*
2970  * The gendisk is only valid as long as you have a reference
2971  * count on 'md'.
2972  */
2973 struct gendisk *dm_disk(struct mapped_device *md)
2974 {
2975         return md->disk;
2976 }
2977 EXPORT_SYMBOL_GPL(dm_disk);
2978
2979 struct kobject *dm_kobject(struct mapped_device *md)
2980 {
2981         return &md->kobj_holder.kobj;
2982 }
2983
2984 struct mapped_device *dm_get_from_kobject(struct kobject *kobj)
2985 {
2986         struct mapped_device *md;
2987
2988         md = container_of(kobj, struct mapped_device, kobj_holder.kobj);
2989
2990         spin_lock(&_minor_lock);
2991         if (test_bit(DMF_FREEING, &md->flags) || dm_deleting_md(md)) {
2992                 md = NULL;
2993                 goto out;
2994         }
2995         dm_get(md);
2996 out:
2997         spin_unlock(&_minor_lock);
2998
2999         return md;
3000 }
3001
3002 int dm_suspended_md(struct mapped_device *md)
3003 {
3004         return test_bit(DMF_SUSPENDED, &md->flags);
3005 }
3006
3007 static int dm_post_suspending_md(struct mapped_device *md)
3008 {
3009         return test_bit(DMF_POST_SUSPENDING, &md->flags);
3010 }
3011
3012 int dm_suspended_internally_md(struct mapped_device *md)
3013 {
3014         return test_bit(DMF_SUSPENDED_INTERNALLY, &md->flags);
3015 }
3016
3017 int dm_test_deferred_remove_flag(struct mapped_device *md)
3018 {
3019         return test_bit(DMF_DEFERRED_REMOVE, &md->flags);
3020 }
3021
3022 int dm_suspended(struct dm_target *ti)
3023 {
3024         return dm_suspended_md(dm_table_get_md(ti->table));
3025 }
3026 EXPORT_SYMBOL_GPL(dm_suspended);
3027
3028 int dm_post_suspending(struct dm_target *ti)
3029 {
3030         return dm_post_suspending_md(dm_table_get_md(ti->table));
3031 }
3032 EXPORT_SYMBOL_GPL(dm_post_suspending);
3033
3034 int dm_noflush_suspending(struct dm_target *ti)
3035 {
3036         return __noflush_suspending(dm_table_get_md(ti->table));
3037 }
3038 EXPORT_SYMBOL_GPL(dm_noflush_suspending);
3039
3040 struct dm_md_mempools *dm_alloc_md_mempools(struct mapped_device *md, enum dm_queue_mode type,
3041                                             unsigned integrity, unsigned per_io_data_size,
3042                                             unsigned min_pool_size)
3043 {
3044         struct dm_md_mempools *pools = kzalloc_node(sizeof(*pools), GFP_KERNEL, md->numa_node_id);
3045         unsigned int pool_size = 0;
3046         unsigned int front_pad, io_front_pad;
3047         int ret;
3048
3049         if (!pools)
3050                 return NULL;
3051
3052         switch (type) {
3053         case DM_TYPE_BIO_BASED:
3054         case DM_TYPE_DAX_BIO_BASED:
3055         case DM_TYPE_NVME_BIO_BASED:
3056                 pool_size = max(dm_get_reserved_bio_based_ios(), min_pool_size);
3057                 front_pad = roundup(per_io_data_size, __alignof__(struct dm_target_io)) + offsetof(struct dm_target_io, clone);
3058                 io_front_pad = roundup(front_pad,  __alignof__(struct dm_io)) + offsetof(struct dm_io, tio);
3059                 ret = bioset_init(&pools->io_bs, pool_size, io_front_pad, 0);
3060                 if (ret)
3061                         goto out;
3062                 if (integrity && bioset_integrity_create(&pools->io_bs, pool_size))
3063                         goto out;
3064                 break;
3065         case DM_TYPE_REQUEST_BASED:
3066         case DM_TYPE_MQ_REQUEST_BASED:
3067                 pool_size = max(dm_get_reserved_rq_based_ios(), min_pool_size);
3068                 front_pad = offsetof(struct dm_rq_clone_bio_info, clone);
3069                 /* per_io_data_size is used for blk-mq pdu at queue allocation */
3070                 break;
3071         default:
3072                 BUG();
3073         }
3074
3075         ret = bioset_init(&pools->bs, pool_size, front_pad, 0);
3076         if (ret)
3077                 goto out;
3078
3079         if (integrity && bioset_integrity_create(&pools->bs, pool_size))
3080                 goto out;
3081
3082         return pools;
3083
3084 out:
3085         dm_free_md_mempools(pools);
3086
3087         return NULL;
3088 }
3089
3090 void dm_free_md_mempools(struct dm_md_mempools *pools)
3091 {
3092         if (!pools)
3093                 return;
3094
3095         bioset_exit(&pools->bs);
3096         bioset_exit(&pools->io_bs);
3097
3098         kfree(pools);
3099 }
3100
3101 struct dm_pr {
3102         u64     old_key;
3103         u64     new_key;
3104         u32     flags;
3105         bool    fail_early;
3106 };
3107
3108 static int dm_call_pr(struct block_device *bdev, iterate_devices_callout_fn fn,
3109                       void *data)
3110 {
3111         struct mapped_device *md = bdev->bd_disk->private_data;
3112         struct dm_table *table;
3113         struct dm_target *ti;
3114         int ret = -ENOTTY, srcu_idx;
3115
3116         table = dm_get_live_table(md, &srcu_idx);
3117         if (!table || !dm_table_get_size(table))
3118                 goto out;
3119
3120         /* We only support devices that have a single target */
3121         if (dm_table_get_num_targets(table) != 1)
3122                 goto out;
3123         ti = dm_table_get_target(table, 0);
3124
3125         if (dm_suspended_md(md)) {
3126                 ret = -EAGAIN;
3127                 goto out;
3128         }
3129
3130         ret = -EINVAL;
3131         if (!ti->type->iterate_devices)
3132                 goto out;
3133
3134         ret = ti->type->iterate_devices(ti, fn, data);
3135 out:
3136         dm_put_live_table(md, srcu_idx);
3137         return ret;
3138 }
3139
3140 /*
3141  * For register / unregister we need to manually call out to every path.
3142  */
3143 static int __dm_pr_register(struct dm_target *ti, struct dm_dev *dev,
3144                             sector_t start, sector_t len, void *data)
3145 {
3146         struct dm_pr *pr = data;
3147         const struct pr_ops *ops = dev->bdev->bd_disk->fops->pr_ops;
3148
3149         if (!ops || !ops->pr_register)
3150                 return -EOPNOTSUPP;
3151         return ops->pr_register(dev->bdev, pr->old_key, pr->new_key, pr->flags);
3152 }
3153
3154 static int dm_pr_register(struct block_device *bdev, u64 old_key, u64 new_key,
3155                           u32 flags)
3156 {
3157         struct dm_pr pr = {
3158                 .old_key        = old_key,
3159                 .new_key        = new_key,
3160                 .flags          = flags,
3161                 .fail_early     = true,
3162         };
3163         int ret;
3164
3165         ret = dm_call_pr(bdev, __dm_pr_register, &pr);
3166         if (ret && new_key) {
3167                 /* unregister all paths if we failed to register any path */
3168                 pr.old_key = new_key;
3169                 pr.new_key = 0;
3170                 pr.flags = 0;
3171                 pr.fail_early = false;
3172                 dm_call_pr(bdev, __dm_pr_register, &pr);
3173         }
3174
3175         return ret;
3176 }
3177
3178 static int dm_pr_reserve(struct block_device *bdev, u64 key, enum pr_type type,
3179                          u32 flags)
3180 {
3181         struct mapped_device *md = bdev->bd_disk->private_data;
3182         const struct pr_ops *ops;
3183         int r, srcu_idx;
3184
3185         r = dm_prepare_ioctl(md, &srcu_idx, &bdev);
3186         if (r < 0)
3187                 goto out;
3188
3189         ops = bdev->bd_disk->fops->pr_ops;
3190         if (ops && ops->pr_reserve)
3191                 r = ops->pr_reserve(bdev, key, type, flags);
3192         else
3193                 r = -EOPNOTSUPP;
3194 out:
3195         dm_unprepare_ioctl(md, srcu_idx);
3196         return r;
3197 }
3198
3199 static int dm_pr_release(struct block_device *bdev, u64 key, enum pr_type type)
3200 {
3201         struct mapped_device *md = bdev->bd_disk->private_data;
3202         const struct pr_ops *ops;
3203         int r, srcu_idx;
3204
3205         r = dm_prepare_ioctl(md, &srcu_idx, &bdev);
3206         if (r < 0)
3207                 goto out;
3208
3209         ops = bdev->bd_disk->fops->pr_ops;
3210         if (ops && ops->pr_release)
3211                 r = ops->pr_release(bdev, key, type);
3212         else
3213                 r = -EOPNOTSUPP;
3214 out:
3215         dm_unprepare_ioctl(md, srcu_idx);
3216         return r;
3217 }
3218
3219 static int dm_pr_preempt(struct block_device *bdev, u64 old_key, u64 new_key,
3220                          enum pr_type type, bool abort)
3221 {
3222         struct mapped_device *md = bdev->bd_disk->private_data;
3223         const struct pr_ops *ops;
3224         int r, srcu_idx;
3225
3226         r = dm_prepare_ioctl(md, &srcu_idx, &bdev);
3227         if (r < 0)
3228                 goto out;
3229
3230         ops = bdev->bd_disk->fops->pr_ops;
3231         if (ops && ops->pr_preempt)
3232                 r = ops->pr_preempt(bdev, old_key, new_key, type, abort);
3233         else
3234                 r = -EOPNOTSUPP;
3235 out:
3236         dm_unprepare_ioctl(md, srcu_idx);
3237         return r;
3238 }
3239
3240 static int dm_pr_clear(struct block_device *bdev, u64 key)
3241 {
3242         struct mapped_device *md = bdev->bd_disk->private_data;
3243         const struct pr_ops *ops;
3244         int r, srcu_idx;
3245
3246         r = dm_prepare_ioctl(md, &srcu_idx, &bdev);
3247         if (r < 0)
3248                 goto out;
3249
3250         ops = bdev->bd_disk->fops->pr_ops;
3251         if (ops && ops->pr_clear)
3252                 r = ops->pr_clear(bdev, key);
3253         else
3254                 r = -EOPNOTSUPP;
3255 out:
3256         dm_unprepare_ioctl(md, srcu_idx);
3257         return r;
3258 }
3259
3260 static const struct pr_ops dm_pr_ops = {
3261         .pr_register    = dm_pr_register,
3262         .pr_reserve     = dm_pr_reserve,
3263         .pr_release     = dm_pr_release,
3264         .pr_preempt     = dm_pr_preempt,
3265         .pr_clear       = dm_pr_clear,
3266 };
3267
3268 static const struct block_device_operations dm_blk_dops = {
3269         .open = dm_blk_open,
3270         .release = dm_blk_close,
3271         .ioctl = dm_blk_ioctl,
3272         .getgeo = dm_blk_getgeo,
3273         .pr_ops = &dm_pr_ops,
3274         .owner = THIS_MODULE
3275 };
3276
3277 static const struct dax_operations dm_dax_ops = {
3278         .direct_access = dm_dax_direct_access,
3279         .copy_from_iter = dm_dax_copy_from_iter,
3280         .copy_to_iter = dm_dax_copy_to_iter,
3281 };
3282
3283 /*
3284  * module hooks
3285  */
3286 module_init(dm_init);
3287 module_exit(dm_exit);
3288
3289 module_param(major, uint, 0);
3290 MODULE_PARM_DESC(major, "The major number of the device mapper");
3291
3292 module_param(reserved_bio_based_ios, uint, S_IRUGO | S_IWUSR);
3293 MODULE_PARM_DESC(reserved_bio_based_ios, "Reserved IOs in bio-based mempools");
3294
3295 module_param(dm_numa_node, int, S_IRUGO | S_IWUSR);
3296 MODULE_PARM_DESC(dm_numa_node, "NUMA node for DM device memory allocations");
3297
3298 module_param(swap_bios, int, S_IRUGO | S_IWUSR);
3299 MODULE_PARM_DESC(swap_bios, "Maximum allowed inflight swap IOs");
3300
3301 MODULE_DESCRIPTION(DM_NAME " driver");
3302 MODULE_AUTHOR("Joe Thornber <dm-devel@redhat.com>");
3303 MODULE_LICENSE("GPL");