1 // SPDX-License-Identifier: GPL-2.0-or-later
2 /*
3 * raid5.c : Multiple Devices driver for Linux
4 * Copyright (C) 1996, 1997 Ingo Molnar, Miguel de Icaza, Gadi Oxman
5 * Copyright (C) 1999, 2000 Ingo Molnar
6 * Copyright (C) 2002, 2003 H. Peter Anvin
8 * RAID-4/5/6 management functions.
9 * Thanks to Penguin Computing for making the RAID-6 development possible
10 * by donating a test server!
16 * The sequencing for updating the bitmap reliably is a little
17 * subtle (and I got it wrong the first time) so it deserves some explanation.
20 * We group bitmap updates into batches. Each batch has a number.
21 * We may write out several batches at once, but that isn't very important.
22 * conf->seq_write is the number of the last batch successfully written.
23 * conf->seq_flush is the number of the last batch that was closed to new additions.
25 * When we discover that we will need to write to any block in a stripe
26 * (in add_stripe_bio) we update the in-memory bitmap and record in sh->bm_seq
27 * the number of the batch it will be in. This is seq_flush+1.
28 * When we are ready to do a write, if that batch hasn't been written yet,
29 * we plug the array and queue the stripe for later.
30 * When an unplug happens, we increment seq_flush, thus closing the current batch.
32 * When we notice that seq_flush > seq_write, we write out all pending updates
33 * to the bitmap, and advance seq_write to where seq_flush was.
34 * This may occasionally write a bit out twice, but is sure never to miss any bits.
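 *
 * As a rough illustration, the batching rules above reduce to the following
 * ordering, written as pseudo-code against the real seq_flush/seq_write
 * counters and sh->bm_seq:
 *
 *   add_stripe_bio():   sh->bm_seq = conf->seq_flush + 1;   join the open batch
 *   unplug:             conf->seq_flush++;                   close the batch
 *   md thread:          if (conf->seq_flush > conf->seq_write) {
 *                               write out the pending bitmap updates;
 *                               conf->seq_write = conf->seq_flush;
 *                       }
 *   stripe write:       may proceed once sh->bm_seq - conf->seq_write <= 0.
 */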
38 #include <linux/blkdev.h>
39 #include <linux/kthread.h>
40 #include <linux/raid/pq.h>
41 #include <linux/async_tx.h>
42 #include <linux/module.h>
43 #include <linux/async.h>
44 #include <linux/seq_file.h>
45 #include <linux/cpu.h>
46 #include <linux/slab.h>
47 #include <linux/ratelimit.h>
48 #include <linux/nodemask.h>
50 #include <trace/events/block.h>
51 #include <linux/list_sort.h>
56 #include "md-bitmap.h"
57 #include "raid5-log.h"
59 #define UNSUPPORTED_MDDEV_FLAGS (1L << MD_FAILFAST_SUPPORTED)
61 #define cpu_to_group(cpu) cpu_to_node(cpu)
62 #define ANY_GROUP NUMA_NO_NODE
64 static bool devices_handle_discard_safely = false;
65 module_param(devices_handle_discard_safely, bool, 0644);
66 MODULE_PARM_DESC(devices_handle_discard_safely,
67 "Set to Y if all devices in each array reliably return zeroes on reads from discarded regions");
68 static struct workqueue_struct *raid5_wq;
70 static inline struct hlist_head *stripe_hash(struct r5conf *conf, sector_t sect)
72 int hash = (sect >> RAID5_STRIPE_SHIFT(conf)) & HASH_MASK;
73 return &conf->stripe_hashtbl[hash];
76 static inline int stripe_hash_locks_hash(struct r5conf *conf, sector_t sect)
78 return (sect >> RAID5_STRIPE_SHIFT(conf)) & STRIPE_HASH_LOCKS_MASK;
81 static inline void lock_device_hash_lock(struct r5conf *conf, int hash)
82 __acquires(&conf->device_lock)
84 spin_lock_irq(conf->hash_locks + hash);
85 spin_lock(&conf->device_lock);
88 static inline void unlock_device_hash_lock(struct r5conf *conf, int hash)
89 __releases(&conf->device_lock)
91 spin_unlock(&conf->device_lock);
92 spin_unlock_irq(conf->hash_locks + hash);
95 static inline void lock_all_device_hash_locks_irq(struct r5conf *conf)
96 __acquires(&conf->device_lock)
99 spin_lock_irq(conf->hash_locks);
100 for (i = 1; i < NR_STRIPE_HASH_LOCKS; i++)
101 spin_lock_nest_lock(conf->hash_locks + i, conf->hash_locks);
102 spin_lock(&conf->device_lock);
105 static inline void unlock_all_device_hash_locks_irq(struct r5conf *conf)
106 __releases(&conf->device_lock)
109 spin_unlock(&conf->device_lock);
110 for (i = NR_STRIPE_HASH_LOCKS - 1; i; i--)
111 spin_unlock(conf->hash_locks + i);
112 spin_unlock_irq(conf->hash_locks);
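/*
 * Note: the helpers above encode the lock ordering used throughout this file.
 * A stripe hash lock (conf->hash_locks[hash]) is always taken before
 * conf->device_lock, and lock_all_device_hash_locks_irq() takes hash lock 0
 * with interrupts disabled, nests the remaining hash locks, and only then
 * takes device_lock; the unlock helpers release in the reverse order.
 */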
115 /* Find first data disk in a raid6 stripe */
116 static inline int raid6_d0(struct stripe_head *sh)
119 /* ddf always starts from first device */
121 /* md starts just after Q block */
122 if (sh->qd_idx == sh->disks - 1)
125 return sh->qd_idx + 1;
127 static inline int raid6_next_disk(int disk, int raid_disks)
130 return (disk < raid_disks) ? disk : 0;
133 /* When walking through the disks in a raid5, starting at raid6_d0,
134 * we need to map each disk to a 'slot', where the data disks are slots
135 * 0 .. raid_disks-3, the parity disk is raid_disks-2 and the Q disk
136 * is raid_disks-1. This helper does that mapping.
138 static int raid6_idx_to_slot(int idx, struct stripe_head *sh,
139 int *count, int syndrome_disks)
145 if (idx == sh->pd_idx)
146 return syndrome_disks;
147 if (idx == sh->qd_idx)
148 return syndrome_disks + 1;
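/*
 * For illustration, assuming a 6-device md-layout RAID-6 stripe with
 * pd_idx == 4 and qd_idx == 5 (so syndrome_disks == 4), walking the devices
 * from raid6_d0() gives the mapping
 *
 *   idx:  0  1  2  3  4 (P)  5 (Q)
 *   slot: 0  1  2  3  4      5
 *
 * i.e. the data devices take slots 0..syndrome_disks-1 in walk order, P takes
 * slot syndrome_disks and Q takes slot syndrome_disks+1, as described above.
 */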
154 static void print_raid5_conf (struct r5conf *conf);
156 static int stripe_operations_active(struct stripe_head *sh)
158 return sh->check_state || sh->reconstruct_state ||
159 test_bit(STRIPE_BIOFILL_RUN, &sh->state) ||
160 test_bit(STRIPE_COMPUTE_RUN, &sh->state);
163 static bool stripe_is_lowprio(struct stripe_head *sh)
165 return (test_bit(STRIPE_R5C_FULL_STRIPE, &sh->state) ||
166 test_bit(STRIPE_R5C_PARTIAL_STRIPE, &sh->state)) &&
167 !test_bit(STRIPE_R5C_CACHING, &sh->state);
170 static void raid5_wakeup_stripe_thread(struct stripe_head *sh)
171 __must_hold(&sh->raid_conf->device_lock)
173 struct r5conf *conf = sh->raid_conf;
174 struct r5worker_group *group;
176 int i, cpu = sh->cpu;
178 if (!cpu_online(cpu)) {
179 cpu = cpumask_any(cpu_online_mask);
183 if (list_empty(&sh->lru)) {
184 struct r5worker_group *group;
185 group = conf->worker_groups + cpu_to_group(cpu);
186 if (stripe_is_lowprio(sh))
187 list_add_tail(&sh->lru, &group->loprio_list);
189 list_add_tail(&sh->lru, &group->handle_list);
190 group->stripes_cnt++;
194 if (conf->worker_cnt_per_group == 0) {
195 md_wakeup_thread(conf->mddev->thread);
199 group = conf->worker_groups + cpu_to_group(sh->cpu);
201 group->workers[0].working = true;
202 /* at least one worker should run to avoid race */
203 queue_work_on(sh->cpu, raid5_wq, &group->workers[0].work);
205 thread_cnt = group->stripes_cnt / MAX_STRIPE_BATCH - 1;
206 /* wakeup more workers */
207 for (i = 1; i < conf->worker_cnt_per_group && thread_cnt > 0; i++) {
208 if (group->workers[i].working == false) {
209 group->workers[i].working = true;
210 queue_work_on(sh->cpu, raid5_wq,
211 &group->workers[i].work);
217 static void do_release_stripe(struct r5conf *conf, struct stripe_head *sh,
218 struct list_head *temp_inactive_list)
219 __must_hold(&conf->device_lock)
222 int injournal = 0; /* number of data pages with R5_InJournal */
224 BUG_ON(!list_empty(&sh->lru));
225 BUG_ON(atomic_read(&conf->active_stripes)==0);
227 if (r5c_is_writeback(conf->log))
228 for (i = sh->disks; i--; )
229 if (test_bit(R5_InJournal, &sh->dev[i].flags))
232 * In the following cases, the stripe cannot be released to cached
233 * lists. Therefore, we make the stripe write out and set STRIPE_HANDLE:
235 * 1. when the array is quiesced in r5c write back;
236 * 2. when resync is requested for the stripe.
238 if (test_bit(STRIPE_SYNC_REQUESTED, &sh->state) ||
239 (conf->quiesce && r5c_is_writeback(conf->log) &&
240 !test_bit(STRIPE_HANDLE, &sh->state) && injournal != 0)) {
241 if (test_bit(STRIPE_R5C_CACHING, &sh->state))
242 r5c_make_stripe_write_out(sh);
243 set_bit(STRIPE_HANDLE, &sh->state);
246 if (test_bit(STRIPE_HANDLE, &sh->state)) {
247 if (test_bit(STRIPE_DELAYED, &sh->state) &&
248 !test_bit(STRIPE_PREREAD_ACTIVE, &sh->state))
249 list_add_tail(&sh->lru, &conf->delayed_list);
250 else if (test_bit(STRIPE_BIT_DELAY, &sh->state) &&
251 sh->bm_seq - conf->seq_write > 0)
252 list_add_tail(&sh->lru, &conf->bitmap_list);
254 clear_bit(STRIPE_DELAYED, &sh->state);
255 clear_bit(STRIPE_BIT_DELAY, &sh->state);
256 if (conf->worker_cnt_per_group == 0) {
257 if (stripe_is_lowprio(sh))
258 list_add_tail(&sh->lru,
261 list_add_tail(&sh->lru,
264 raid5_wakeup_stripe_thread(sh);
268 md_wakeup_thread(conf->mddev->thread);
270 BUG_ON(stripe_operations_active(sh));
271 if (test_and_clear_bit(STRIPE_PREREAD_ACTIVE, &sh->state))
272 if (atomic_dec_return(&conf->preread_active_stripes)
274 md_wakeup_thread(conf->mddev->thread);
275 atomic_dec(&conf->active_stripes);
276 if (!test_bit(STRIPE_EXPANDING, &sh->state)) {
277 if (!r5c_is_writeback(conf->log))
278 list_add_tail(&sh->lru, temp_inactive_list);
280 WARN_ON(test_bit(R5_InJournal, &sh->dev[sh->pd_idx].flags));
282 list_add_tail(&sh->lru, temp_inactive_list);
283 else if (injournal == conf->raid_disks - conf->max_degraded) {
285 if (!test_and_set_bit(STRIPE_R5C_FULL_STRIPE, &sh->state))
286 atomic_inc(&conf->r5c_cached_full_stripes);
287 if (test_and_clear_bit(STRIPE_R5C_PARTIAL_STRIPE, &sh->state))
288 atomic_dec(&conf->r5c_cached_partial_stripes);
289 list_add_tail(&sh->lru, &conf->r5c_full_stripe_list);
290 r5c_check_cached_full_stripe(conf);
293 * STRIPE_R5C_PARTIAL_STRIPE is set in
294 * r5c_try_caching_write(). No need to
297 list_add_tail(&sh->lru, &conf->r5c_partial_stripe_list);
303 static void __release_stripe(struct r5conf *conf, struct stripe_head *sh,
304 struct list_head *temp_inactive_list)
305 __must_hold(&conf->device_lock)
307 if (atomic_dec_and_test(&sh->count))
308 do_release_stripe(conf, sh, temp_inactive_list);
312 * @hash could be NR_STRIPE_HASH_LOCKS, then we have a list of inactive_list
314 * Be careful: Only one task can add/delete stripes from temp_inactive_list at
315 * any given time. Adding stripes only takes the device lock, while deleting
316 * stripes only takes the hash lock.
318 static void release_inactive_stripe_list(struct r5conf *conf,
319 struct list_head *temp_inactive_list,
323 bool do_wakeup = false;
326 if (hash == NR_STRIPE_HASH_LOCKS) {
327 size = NR_STRIPE_HASH_LOCKS;
328 hash = NR_STRIPE_HASH_LOCKS - 1;
332 struct list_head *list = &temp_inactive_list[size - 1];
335 * We don't hold any lock here yet, raid5_get_active_stripe() might
336 * remove stripes from the list
338 if (!list_empty_careful(list)) {
339 spin_lock_irqsave(conf->hash_locks + hash, flags);
340 if (list_empty(conf->inactive_list + hash) &&
342 atomic_dec(&conf->empty_inactive_list_nr);
343 list_splice_tail_init(list, conf->inactive_list + hash);
345 spin_unlock_irqrestore(conf->hash_locks + hash, flags);
352 wake_up(&conf->wait_for_stripe);
353 if (atomic_read(&conf->active_stripes) == 0)
354 wake_up(&conf->wait_for_quiescent);
355 if (conf->retry_read_aligned)
356 md_wakeup_thread(conf->mddev->thread);
360 static int release_stripe_list(struct r5conf *conf,
361 struct list_head *temp_inactive_list)
362 __must_hold(&conf->device_lock)
364 struct stripe_head *sh, *t;
366 struct llist_node *head;
368 head = llist_del_all(&conf->released_stripes);
369 head = llist_reverse_order(head);
370 llist_for_each_entry_safe(sh, t, head, release_list) {
373 /* sh could be re-added after STRIPE_ON_RELEASE_LIST is cleared */
375 clear_bit(STRIPE_ON_RELEASE_LIST, &sh->state);
377 * Don't worry if the bit is set here, because if the bit is set
378 * again, the count is always > 1. The same is true for the
379 * STRIPE_ON_UNPLUG_LIST bit.
381 hash = sh->hash_lock_index;
382 __release_stripe(conf, sh, &temp_inactive_list[hash]);
389 void raid5_release_stripe(struct stripe_head *sh)
391 struct r5conf *conf = sh->raid_conf;
393 struct list_head list;
397 /* Avoid release_list until the last reference.
399 if (atomic_add_unless(&sh->count, -1, 1))
402 if (unlikely(!conf->mddev->thread) ||
403 test_and_set_bit(STRIPE_ON_RELEASE_LIST, &sh->state))
405 wakeup = llist_add(&sh->release_list, &conf->released_stripes);
407 md_wakeup_thread(conf->mddev->thread);
410 /* we are ok here whether STRIPE_ON_RELEASE_LIST is set or not */
411 if (atomic_dec_and_lock_irqsave(&sh->count, &conf->device_lock, flags)) {
412 INIT_LIST_HEAD(&list);
413 hash = sh->hash_lock_index;
414 do_release_stripe(conf, sh, &list);
415 spin_unlock_irqrestore(&conf->device_lock, flags);
416 release_inactive_stripe_list(conf, &list, hash);
420 static inline void remove_hash(struct stripe_head *sh)
422 pr_debug("remove_hash(), stripe %llu\n",
423 (unsigned long long)sh->sector);
425 hlist_del_init(&sh->hash);
428 static inline void insert_hash(struct r5conf *conf, struct stripe_head *sh)
430 struct hlist_head *hp = stripe_hash(conf, sh->sector);
432 pr_debug("insert_hash(), stripe %llu\n",
433 (unsigned long long)sh->sector);
435 hlist_add_head(&sh->hash, hp);
438 /* find an idle stripe, make sure it is unhashed, and return it. */
439 static struct stripe_head *get_free_stripe(struct r5conf *conf, int hash)
441 struct stripe_head *sh = NULL;
442 struct list_head *first;
444 if (list_empty(conf->inactive_list + hash))
446 first = (conf->inactive_list + hash)->next;
447 sh = list_entry(first, struct stripe_head, lru);
448 list_del_init(first);
450 atomic_inc(&conf->active_stripes);
451 BUG_ON(hash != sh->hash_lock_index);
452 if (list_empty(conf->inactive_list + hash))
453 atomic_inc(&conf->empty_inactive_list_nr);
458 #if PAGE_SIZE != DEFAULT_STRIPE_SIZE
459 static void free_stripe_pages(struct stripe_head *sh)
464 /* The page pool has not been allocated */
468 for (i = 0; i < sh->nr_pages; i++) {
476 static int alloc_stripe_pages(struct stripe_head *sh, gfp_t gfp)
481 for (i = 0; i < sh->nr_pages; i++) {
482 /* The page has already been allocated. */
488 free_stripe_pages(sh);
497 init_stripe_shared_pages(struct stripe_head *sh, struct r5conf *conf, int disks)
504 /* Each of the sh->dev[i] needs one conf->stripe_size */
505 cnt = PAGE_SIZE / conf->stripe_size;
506 nr_pages = (disks + cnt - 1) / cnt;
508 sh->pages = kcalloc(nr_pages, sizeof(struct page *), GFP_KERNEL);
511 sh->nr_pages = nr_pages;
512 sh->stripes_per_page = cnt;
517 static void shrink_buffers(struct stripe_head *sh)
520 int num = sh->raid_conf->pool_size;
522 #if PAGE_SIZE == DEFAULT_STRIPE_SIZE
523 for (i = 0; i < num ; i++) {
526 WARN_ON(sh->dev[i].page != sh->dev[i].orig_page);
530 sh->dev[i].page = NULL;
534 for (i = 0; i < num; i++)
535 sh->dev[i].page = NULL;
536 free_stripe_pages(sh); /* Free pages */
540 static int grow_buffers(struct stripe_head *sh, gfp_t gfp)
543 int num = sh->raid_conf->pool_size;
545 #if PAGE_SIZE == DEFAULT_STRIPE_SIZE
546 for (i = 0; i < num; i++) {
549 if (!(page = alloc_page(gfp))) {
552 sh->dev[i].page = page;
553 sh->dev[i].orig_page = page;
554 sh->dev[i].offset = 0;
557 if (alloc_stripe_pages(sh, gfp))
560 for (i = 0; i < num; i++) {
561 sh->dev[i].page = raid5_get_dev_page(sh, i);
562 sh->dev[i].orig_page = sh->dev[i].page;
563 sh->dev[i].offset = raid5_get_page_offset(sh, i);
569 static void stripe_set_idx(sector_t stripe, struct r5conf *conf, int previous,
570 struct stripe_head *sh);
572 static void init_stripe(struct stripe_head *sh, sector_t sector, int previous)
574 struct r5conf *conf = sh->raid_conf;
577 BUG_ON(atomic_read(&sh->count) != 0);
578 BUG_ON(test_bit(STRIPE_HANDLE, &sh->state));
579 BUG_ON(stripe_operations_active(sh));
580 BUG_ON(sh->batch_head);
582 pr_debug("init_stripe called, stripe %llu\n",
583 (unsigned long long)sector);
585 seq = read_seqcount_begin(&conf->gen_lock);
586 sh->generation = conf->generation - previous;
587 sh->disks = previous ? conf->previous_raid_disks : conf->raid_disks;
589 stripe_set_idx(sector, conf, previous, sh);
592 for (i = sh->disks; i--; ) {
593 struct r5dev *dev = &sh->dev[i];
595 if (dev->toread || dev->read || dev->towrite || dev->written ||
596 test_bit(R5_LOCKED, &dev->flags)) {
597 pr_err("sector=%llx i=%d %p %p %p %p %d\n",
598 (unsigned long long)sh->sector, i, dev->toread,
599 dev->read, dev->towrite, dev->written,
600 test_bit(R5_LOCKED, &dev->flags));
604 dev->sector = raid5_compute_blocknr(sh, i, previous);
606 if (read_seqcount_retry(&conf->gen_lock, seq))
608 sh->overwrite_disks = 0;
609 insert_hash(conf, sh);
610 sh->cpu = smp_processor_id();
611 set_bit(STRIPE_BATCH_READY, &sh->state);
614 static struct stripe_head *__find_stripe(struct r5conf *conf, sector_t sector,
617 struct stripe_head *sh;
619 pr_debug("__find_stripe, sector %llu\n", (unsigned long long)sector);
620 hlist_for_each_entry(sh, stripe_hash(conf, sector), hash)
621 if (sh->sector == sector && sh->generation == generation)
623 pr_debug("__stripe %llu not in cache\n", (unsigned long long)sector);
628 * Need to check if array has failed when deciding whether to:
630 * - remove non-faulty devices
633 * This determination is simple when no reshape is happening.
634 * However if there is a reshape, we need to carefully check
635 * both the before and after sections.
636 * This is because some failed devices may only affect one
637 * of the two sections, and some non-in_sync devices may
638 * be in_sync in the section most affected by failed devices.
640 * Most calls to this function hold &conf->device_lock. Calls
641 * in raid5_run() do not require the lock as no other threads
642 * have been started yet.
644 int raid5_calc_degraded(struct r5conf *conf)
646 int degraded, degraded2;
651 for (i = 0; i < conf->previous_raid_disks; i++) {
652 struct md_rdev *rdev = rcu_dereference(conf->disks[i].rdev);
653 if (rdev && test_bit(Faulty, &rdev->flags))
654 rdev = rcu_dereference(conf->disks[i].replacement);
655 if (!rdev || test_bit(Faulty, &rdev->flags))
657 else if (test_bit(In_sync, &rdev->flags))
660 /* not in-sync or faulty.
661 * If the reshape increases the number of devices,
662 * this is being recovered by the reshape, so
663 * this 'previous' section is not in_sync.
664 * If the number of devices is being reduced however,
665 * the device can only be part of the array if
666 * we are reverting a reshape, so this section will be in_sync.
669 if (conf->raid_disks >= conf->previous_raid_disks)
673 if (conf->raid_disks == conf->previous_raid_disks)
677 for (i = 0; i < conf->raid_disks; i++) {
678 struct md_rdev *rdev = rcu_dereference(conf->disks[i].rdev);
679 if (rdev && test_bit(Faulty, &rdev->flags))
680 rdev = rcu_dereference(conf->disks[i].replacement);
681 if (!rdev || test_bit(Faulty, &rdev->flags))
683 else if (test_bit(In_sync, &rdev->flags))
686 /* not in-sync or faulty.
687 * If reshape increases the number of devices, this
688 * section has already been recovered, else it
689 * almost certainly hasn't.
691 if (conf->raid_disks <= conf->previous_raid_disks)
695 if (degraded2 > degraded)
700 static bool has_failed(struct r5conf *conf)
702 int degraded = conf->mddev->degraded;
704 if (test_bit(MD_BROKEN, &conf->mddev->flags))
707 if (conf->mddev->reshape_position != MaxSector)
708 degraded = raid5_calc_degraded(conf);
710 return degraded > conf->max_degraded;
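/*
 * For example: with RAID-6 (max_degraded == 2) and no reshape pending, one or
 * two missing/not-in_sync devices leave raid5_calc_degraded() at 1 or 2 and
 * has_failed() still returns false, so the array keeps running in degraded
 * mode; a third failure pushes degraded above max_degraded and has_failed()
 * returns true. For RAID-4/5 (max_degraded == 1) the second failure already
 * does so.
 */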
714 raid5_get_active_stripe(struct r5conf *conf, sector_t sector,
715 int previous, int noblock, int noquiesce)
717 struct stripe_head *sh;
718 int hash = stripe_hash_locks_hash(conf, sector);
719 int inc_empty_inactive_list_flag;
721 pr_debug("get_stripe, sector %llu\n", (unsigned long long)sector);
723 spin_lock_irq(conf->hash_locks + hash);
726 wait_event_lock_irq(conf->wait_for_quiescent,
727 conf->quiesce == 0 || noquiesce,
728 *(conf->hash_locks + hash));
729 sh = __find_stripe(conf, sector, conf->generation - previous);
731 if (!test_bit(R5_INACTIVE_BLOCKED, &conf->cache_state)) {
732 sh = get_free_stripe(conf, hash);
733 if (!sh && !test_bit(R5_DID_ALLOC,
735 set_bit(R5_ALLOC_MORE,
738 if (noblock && sh == NULL)
741 r5c_check_stripe_cache_usage(conf);
743 set_bit(R5_INACTIVE_BLOCKED,
745 r5l_wake_reclaim(conf->log, 0);
747 conf->wait_for_stripe,
748 !list_empty(conf->inactive_list + hash) &&
749 (atomic_read(&conf->active_stripes)
750 < (conf->max_nr_stripes * 3 / 4)
751 || !test_bit(R5_INACTIVE_BLOCKED,
752 &conf->cache_state)),
753 *(conf->hash_locks + hash));
754 clear_bit(R5_INACTIVE_BLOCKED,
757 init_stripe(sh, sector, previous);
758 atomic_inc(&sh->count);
760 } else if (!atomic_inc_not_zero(&sh->count)) {
761 spin_lock(&conf->device_lock);
762 if (!atomic_read(&sh->count)) {
763 if (!test_bit(STRIPE_HANDLE, &sh->state))
764 atomic_inc(&conf->active_stripes);
765 BUG_ON(list_empty(&sh->lru) &&
766 !test_bit(STRIPE_EXPANDING, &sh->state));
767 inc_empty_inactive_list_flag = 0;
768 if (!list_empty(conf->inactive_list + hash))
769 inc_empty_inactive_list_flag = 1;
770 list_del_init(&sh->lru);
771 if (list_empty(conf->inactive_list + hash) && inc_empty_inactive_list_flag)
772 atomic_inc(&conf->empty_inactive_list_nr);
774 sh->group->stripes_cnt--;
778 atomic_inc(&sh->count);
779 spin_unlock(&conf->device_lock);
781 } while (sh == NULL);
783 spin_unlock_irq(conf->hash_locks + hash);
787 static bool is_full_stripe_write(struct stripe_head *sh)
789 BUG_ON(sh->overwrite_disks > (sh->disks - sh->raid_conf->max_degraded));
790 return sh->overwrite_disks == (sh->disks - sh->raid_conf->max_degraded);
793 static void lock_two_stripes(struct stripe_head *sh1, struct stripe_head *sh2)
794 __acquires(&sh1->stripe_lock)
795 __acquires(&sh2->stripe_lock)
798 spin_lock_irq(&sh2->stripe_lock);
799 spin_lock_nested(&sh1->stripe_lock, 1);
801 spin_lock_irq(&sh1->stripe_lock);
802 spin_lock_nested(&sh2->stripe_lock, 1);
806 static void unlock_two_stripes(struct stripe_head *sh1, struct stripe_head *sh2)
807 __releases(&sh1->stripe_lock)
808 __releases(&sh2->stripe_lock)
810 spin_unlock(&sh1->stripe_lock);
811 spin_unlock_irq(&sh2->stripe_lock);
814 /* Only a freshly created, full-stripe, normal write stripe can be added to a batch list */
815 static bool stripe_can_batch(struct stripe_head *sh)
817 struct r5conf *conf = sh->raid_conf;
819 if (raid5_has_log(conf) || raid5_has_ppl(conf))
821 return test_bit(STRIPE_BATCH_READY, &sh->state) &&
822 !test_bit(STRIPE_BITMAP_PENDING, &sh->state) &&
823 is_full_stripe_write(sh);
826 /* we only search backwards, toward the previous stripe in the chunk */
827 static void stripe_add_to_batch_list(struct r5conf *conf, struct stripe_head *sh)
829 struct stripe_head *head;
830 sector_t head_sector, tmp_sec;
833 int inc_empty_inactive_list_flag;
835 /* Don't cross chunks, so stripe pd_idx/qd_idx is the same */
836 tmp_sec = sh->sector;
837 if (!sector_div(tmp_sec, conf->chunk_sectors))
839 head_sector = sh->sector - RAID5_STRIPE_SECTORS(conf);
841 hash = stripe_hash_locks_hash(conf, head_sector);
842 spin_lock_irq(conf->hash_locks + hash);
843 head = __find_stripe(conf, head_sector, conf->generation);
844 if (head && !atomic_inc_not_zero(&head->count)) {
845 spin_lock(&conf->device_lock);
846 if (!atomic_read(&head->count)) {
847 if (!test_bit(STRIPE_HANDLE, &head->state))
848 atomic_inc(&conf->active_stripes);
849 BUG_ON(list_empty(&head->lru) &&
850 !test_bit(STRIPE_EXPANDING, &head->state));
851 inc_empty_inactive_list_flag = 0;
852 if (!list_empty(conf->inactive_list + hash))
853 inc_empty_inactive_list_flag = 1;
854 list_del_init(&head->lru);
855 if (list_empty(conf->inactive_list + hash) && inc_empty_inactive_list_flag)
856 atomic_inc(&conf->empty_inactive_list_nr);
858 head->group->stripes_cnt--;
862 atomic_inc(&head->count);
863 spin_unlock(&conf->device_lock);
865 spin_unlock_irq(conf->hash_locks + hash);
869 if (!stripe_can_batch(head))
872 lock_two_stripes(head, sh);
873 /* clear_batch_ready clears the flag */
874 if (!stripe_can_batch(head) || !stripe_can_batch(sh))
881 while (dd_idx == sh->pd_idx || dd_idx == sh->qd_idx)
883 if (head->dev[dd_idx].towrite->bi_opf != sh->dev[dd_idx].towrite->bi_opf ||
884 bio_op(head->dev[dd_idx].towrite) != bio_op(sh->dev[dd_idx].towrite))
887 if (head->batch_head) {
888 spin_lock(&head->batch_head->batch_lock);
889 /* This batch list is already running */
890 if (!stripe_can_batch(head)) {
891 spin_unlock(&head->batch_head->batch_lock);
895 * We must assign batch_head of this stripe while holding
896 * batch_lock; otherwise clear_batch_ready of the batch head
897 * stripe could clear the BATCH_READY bit of this stripe while
898 * this stripe->batch_head has not yet been assigned, which
899 * could confuse clear_batch_ready for this stripe.
901 sh->batch_head = head->batch_head;
904 * at this point, head's BATCH_READY could be cleared, but we
905 * can still add the stripe to batch list
907 list_add(&sh->batch_list, &head->batch_list);
908 spin_unlock(&head->batch_head->batch_lock);
910 head->batch_head = head;
911 sh->batch_head = head->batch_head;
912 spin_lock(&head->batch_lock);
913 list_add_tail(&sh->batch_list, &head->batch_list);
914 spin_unlock(&head->batch_lock);
917 if (test_and_clear_bit(STRIPE_PREREAD_ACTIVE, &sh->state))
918 if (atomic_dec_return(&conf->preread_active_stripes)
920 md_wakeup_thread(conf->mddev->thread);
922 if (test_and_clear_bit(STRIPE_BIT_DELAY, &sh->state)) {
923 int seq = sh->bm_seq;
924 if (test_bit(STRIPE_BIT_DELAY, &sh->batch_head->state) &&
925 sh->batch_head->bm_seq > seq)
926 seq = sh->batch_head->bm_seq;
927 set_bit(STRIPE_BIT_DELAY, &sh->batch_head->state);
928 sh->batch_head->bm_seq = seq;
931 atomic_inc(&sh->count);
933 unlock_two_stripes(head, sh);
935 raid5_release_stripe(head);
938 /* Determine if 'data_offset' or 'new_data_offset' should be used
939 * in this stripe_head.
941 static int use_new_offset(struct r5conf *conf, struct stripe_head *sh)
943 sector_t progress = conf->reshape_progress;
944 /* Need a memory barrier to make sure we see the value
945 * of conf->generation, or ->data_offset that was set before
946 * reshape_progress was updated.
949 if (progress == MaxSector)
951 if (sh->generation == conf->generation - 1)
953 /* We are in a reshape, and this is a new-generation stripe,
954 * so use new_data_offset.
959 static void dispatch_bio_list(struct bio_list *tmp)
963 while ((bio = bio_list_pop(tmp)))
964 submit_bio_noacct(bio);
967 static int cmp_stripe(void *priv, const struct list_head *a,
968 const struct list_head *b)
970 const struct r5pending_data *da = list_entry(a,
971 struct r5pending_data, sibling);
972 const struct r5pending_data *db = list_entry(b,
973 struct r5pending_data, sibling);
974 if (da->sector > db->sector)
976 if (da->sector < db->sector)
981 static void dispatch_defer_bios(struct r5conf *conf, int target,
982 struct bio_list *list)
984 struct r5pending_data *data;
985 struct list_head *first, *next = NULL;
988 if (conf->pending_data_cnt == 0)
991 list_sort(NULL, &conf->pending_list, cmp_stripe);
993 first = conf->pending_list.next;
995 /* temporarily move the head */
996 if (conf->next_pending_data)
997 list_move_tail(&conf->pending_list,
998 &conf->next_pending_data->sibling);
1000 while (!list_empty(&conf->pending_list)) {
1001 data = list_first_entry(&conf->pending_list,
1002 struct r5pending_data, sibling);
1003 if (&data->sibling == first)
1004 first = data->sibling.next;
1005 next = data->sibling.next;
1007 bio_list_merge(list, &data->bios);
1008 list_move(&data->sibling, &conf->free_list);
1013 conf->pending_data_cnt -= cnt;
1014 BUG_ON(conf->pending_data_cnt < 0 || cnt < target);
1016 if (next != &conf->pending_list)
1017 conf->next_pending_data = list_entry(next,
1018 struct r5pending_data, sibling);
1020 conf->next_pending_data = NULL;
1021 /* list isn't empty */
1022 if (first != &conf->pending_list)
1023 list_move_tail(&conf->pending_list, first);
1026 static void flush_deferred_bios(struct r5conf *conf)
1028 struct bio_list tmp = BIO_EMPTY_LIST;
1030 if (conf->pending_data_cnt == 0)
1033 spin_lock(&conf->pending_bios_lock);
1034 dispatch_defer_bios(conf, conf->pending_data_cnt, &tmp);
1035 BUG_ON(conf->pending_data_cnt != 0);
1036 spin_unlock(&conf->pending_bios_lock);
1038 dispatch_bio_list(&tmp);
1041 static void defer_issue_bios(struct r5conf *conf, sector_t sector,
1042 struct bio_list *bios)
1044 struct bio_list tmp = BIO_EMPTY_LIST;
1045 struct r5pending_data *ent;
1047 spin_lock(&conf->pending_bios_lock);
1048 ent = list_first_entry(&conf->free_list, struct r5pending_data,
1050 list_move_tail(&ent->sibling, &conf->pending_list);
1051 ent->sector = sector;
1052 bio_list_init(&ent->bios);
1053 bio_list_merge(&ent->bios, bios);
1054 conf->pending_data_cnt++;
1055 if (conf->pending_data_cnt >= PENDING_IO_MAX)
1056 dispatch_defer_bios(conf, PENDING_IO_ONE_FLUSH, &tmp);
1058 spin_unlock(&conf->pending_bios_lock);
1060 dispatch_bio_list(&tmp);
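/*
 * In short: defer_issue_bios() parks the write bios of one stripe in an
 * r5pending_data entry on conf->pending_list; once conf->pending_data_cnt
 * reaches PENDING_IO_MAX, dispatch_defer_bios() sorts the pending entries by
 * sector and issues a PENDING_IO_ONE_FLUSH sized batch, while
 * flush_deferred_bios() drains the whole list, so deferred writes reach the
 * lower layers in roughly ascending sector order.
 */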
1064 raid5_end_read_request(struct bio *bi);
1066 raid5_end_write_request(struct bio *bi);
1068 static void ops_run_io(struct stripe_head *sh, struct stripe_head_state *s)
1070 struct r5conf *conf = sh->raid_conf;
1071 int i, disks = sh->disks;
1072 struct stripe_head *head_sh = sh;
1073 struct bio_list pending_bios = BIO_EMPTY_LIST;
1079 if (log_stripe(sh, s) == 0)
1082 should_defer = conf->batch_bio_dispatch && conf->group_cnt;
1084 for (i = disks; i--; ) {
1085 int op, op_flags = 0;
1086 int replace_only = 0;
1087 struct bio *bi, *rbi;
1088 struct md_rdev *rdev, *rrdev = NULL;
1091 if (test_and_clear_bit(R5_Wantwrite, &sh->dev[i].flags)) {
1093 if (test_and_clear_bit(R5_WantFUA, &sh->dev[i].flags))
1095 if (test_bit(R5_Discard, &sh->dev[i].flags))
1096 op = REQ_OP_DISCARD;
1097 } else if (test_and_clear_bit(R5_Wantread, &sh->dev[i].flags))
1099 else if (test_and_clear_bit(R5_WantReplace,
1100 &sh->dev[i].flags)) {
1105 if (test_and_clear_bit(R5_SyncIO, &sh->dev[i].flags))
1106 op_flags |= REQ_SYNC;
1111 rbi = &dev->rreq; /* For writing to replacement */
1114 rrdev = rcu_dereference(conf->disks[i].replacement);
1115 smp_mb(); /* Ensure that if rrdev is NULL, rdev won't be */
1116 rdev = rcu_dereference(conf->disks[i].rdev);
1121 if (op_is_write(op)) {
1125 /* We raced and saw duplicates */
1128 if (test_bit(R5_ReadRepl, &head_sh->dev[i].flags) && rrdev)
1133 if (rdev && test_bit(Faulty, &rdev->flags))
1136 atomic_inc(&rdev->nr_pending);
1137 if (rrdev && test_bit(Faulty, &rrdev->flags))
1140 atomic_inc(&rrdev->nr_pending);
1143 /* We have already checked bad blocks for reads. Now we
1144 * need to check for writes. We never accept write errors
1145 * on the replacement, so we don't need to check rrdev.
1147 while (op_is_write(op) && rdev &&
1148 test_bit(WriteErrorSeen, &rdev->flags)) {
1151 int bad = is_badblock(rdev, sh->sector, RAID5_STRIPE_SECTORS(conf),
1152 &first_bad, &bad_sectors);
1157 set_bit(BlockedBadBlocks, &rdev->flags);
1158 if (!conf->mddev->external &&
1159 conf->mddev->sb_flags) {
1160 /* It is very unlikely, but we might
1161 * still need to write out the
1162 * bad block log - better give it a chance */
1164 md_check_recovery(conf->mddev);
1167 * Because md_wait_for_blocked_rdev
1168 * will dec nr_pending, we must
1169 * increment it first.
1171 atomic_inc(&rdev->nr_pending);
1172 md_wait_for_blocked_rdev(rdev, conf->mddev);
1174 /* Acknowledged bad block - skip the write */
1175 rdev_dec_pending(rdev, conf->mddev);
1181 if (s->syncing || s->expanding || s->expanded
1183 md_sync_acct(rdev->bdev, RAID5_STRIPE_SECTORS(conf));
1185 set_bit(STRIPE_IO_STARTED, &sh->state);
1187 bio_init(bi, rdev->bdev, &dev->vec, 1, op | op_flags);
1188 bi->bi_end_io = op_is_write(op)
1189 ? raid5_end_write_request
1190 : raid5_end_read_request;
1191 bi->bi_private = sh;
1193 pr_debug("%s: for %llu schedule op %d on disc %d\n",
1194 __func__, (unsigned long long)sh->sector,
1196 atomic_inc(&sh->count);
1198 atomic_inc(&head_sh->count);
1199 if (use_new_offset(conf, sh))
1200 bi->bi_iter.bi_sector = (sh->sector
1201 + rdev->new_data_offset);
1203 bi->bi_iter.bi_sector = (sh->sector
1204 + rdev->data_offset);
1205 if (test_bit(R5_ReadNoMerge, &head_sh->dev[i].flags))
1206 bi->bi_opf |= REQ_NOMERGE;
1208 if (test_bit(R5_SkipCopy, &sh->dev[i].flags))
1209 WARN_ON(test_bit(R5_UPTODATE, &sh->dev[i].flags));
1211 if (!op_is_write(op) &&
1212 test_bit(R5_InJournal, &sh->dev[i].flags))
1214 * issuing a read for a page in the journal; this
1215 * must be preparing for prexor in rmw, so read
1216 * the data into orig_page
1218 sh->dev[i].vec.bv_page = sh->dev[i].orig_page;
1220 sh->dev[i].vec.bv_page = sh->dev[i].page;
1222 bi->bi_io_vec[0].bv_len = RAID5_STRIPE_SIZE(conf);
1223 bi->bi_io_vec[0].bv_offset = sh->dev[i].offset;
1224 bi->bi_iter.bi_size = RAID5_STRIPE_SIZE(conf);
1226 * If this is a discard request, set bi_vcnt to 0. We don't
1227 * want to confuse SCSI because SCSI will replace the payload
1229 if (op == REQ_OP_DISCARD)
1232 set_bit(R5_DOUBLE_LOCKED, &sh->dev[i].flags);
1234 if (conf->mddev->gendisk)
1235 trace_block_bio_remap(bi,
1236 disk_devt(conf->mddev->gendisk),
1238 if (should_defer && op_is_write(op))
1239 bio_list_add(&pending_bios, bi);
1241 submit_bio_noacct(bi);
1244 if (s->syncing || s->expanding || s->expanded
1246 md_sync_acct(rrdev->bdev, RAID5_STRIPE_SECTORS(conf));
1248 set_bit(STRIPE_IO_STARTED, &sh->state);
1250 bio_init(rbi, rrdev->bdev, &dev->rvec, 1, op | op_flags);
1251 BUG_ON(!op_is_write(op));
1252 rbi->bi_end_io = raid5_end_write_request;
1253 rbi->bi_private = sh;
1255 pr_debug("%s: for %llu schedule op %d on "
1256 "replacement disc %d\n",
1257 __func__, (unsigned long long)sh->sector,
1259 atomic_inc(&sh->count);
1261 atomic_inc(&head_sh->count);
1262 if (use_new_offset(conf, sh))
1263 rbi->bi_iter.bi_sector = (sh->sector
1264 + rrdev->new_data_offset);
1266 rbi->bi_iter.bi_sector = (sh->sector
1267 + rrdev->data_offset);
1268 if (test_bit(R5_SkipCopy, &sh->dev[i].flags))
1269 WARN_ON(test_bit(R5_UPTODATE, &sh->dev[i].flags));
1270 sh->dev[i].rvec.bv_page = sh->dev[i].page;
1272 rbi->bi_io_vec[0].bv_len = RAID5_STRIPE_SIZE(conf);
1273 rbi->bi_io_vec[0].bv_offset = sh->dev[i].offset;
1274 rbi->bi_iter.bi_size = RAID5_STRIPE_SIZE(conf);
1276 * If this is a discard request, set bi_vcnt to 0. We don't
1277 * want to confuse SCSI because SCSI will replace the payload
1279 if (op == REQ_OP_DISCARD)
1281 if (conf->mddev->gendisk)
1282 trace_block_bio_remap(rbi,
1283 disk_devt(conf->mddev->gendisk),
1285 if (should_defer && op_is_write(op))
1286 bio_list_add(&pending_bios, rbi);
1288 submit_bio_noacct(rbi);
1290 if (!rdev && !rrdev) {
1291 if (op_is_write(op))
1292 set_bit(STRIPE_DEGRADED, &sh->state);
1293 pr_debug("skip op %d on disc %d for sector %llu\n",
1294 bi->bi_opf, i, (unsigned long long)sh->sector);
1295 clear_bit(R5_LOCKED, &sh->dev[i].flags);
1296 set_bit(STRIPE_HANDLE, &sh->state);
1299 if (!head_sh->batch_head)
1301 sh = list_first_entry(&sh->batch_list, struct stripe_head,
1307 if (should_defer && !bio_list_empty(&pending_bios))
1308 defer_issue_bios(conf, head_sh->sector, &pending_bios);
1311 static struct dma_async_tx_descriptor *
1312 async_copy_data(int frombio, struct bio *bio, struct page **page,
1313 unsigned int poff, sector_t sector, struct dma_async_tx_descriptor *tx,
1314 struct stripe_head *sh, int no_skipcopy)
1317 struct bvec_iter iter;
1318 struct page *bio_page;
1320 struct async_submit_ctl submit;
1321 enum async_tx_flags flags = 0;
1322 struct r5conf *conf = sh->raid_conf;
1324 if (bio->bi_iter.bi_sector >= sector)
1325 page_offset = (signed)(bio->bi_iter.bi_sector - sector) * 512;
1327 page_offset = (signed)(sector - bio->bi_iter.bi_sector) * -512;
1330 flags |= ASYNC_TX_FENCE;
1331 init_async_submit(&submit, flags, tx, NULL, NULL, NULL);
1333 bio_for_each_segment(bvl, bio, iter) {
1334 int len = bvl.bv_len;
1338 if (page_offset < 0) {
1339 b_offset = -page_offset;
1340 page_offset += b_offset;
1344 if (len > 0 && page_offset + len > RAID5_STRIPE_SIZE(conf))
1345 clen = RAID5_STRIPE_SIZE(conf) - page_offset;
1350 b_offset += bvl.bv_offset;
1351 bio_page = bvl.bv_page;
1353 if (conf->skip_copy &&
1354 b_offset == 0 && page_offset == 0 &&
1355 clen == RAID5_STRIPE_SIZE(conf) &&
1359 tx = async_memcpy(*page, bio_page, page_offset + poff,
1360 b_offset, clen, &submit);
1362 tx = async_memcpy(bio_page, *page, b_offset,
1363 page_offset + poff, clen, &submit);
1365 /* chain the operations */
1366 submit.depend_tx = tx;
1368 if (clen < len) /* hit end of page */
1376 static void ops_complete_biofill(void *stripe_head_ref)
1378 struct stripe_head *sh = stripe_head_ref;
1380 struct r5conf *conf = sh->raid_conf;
1382 pr_debug("%s: stripe %llu\n", __func__,
1383 (unsigned long long)sh->sector);
1385 /* clear completed biofills */
1386 for (i = sh->disks; i--; ) {
1387 struct r5dev *dev = &sh->dev[i];
1389 /* acknowledge completion of a biofill operation */
1390 /* and check if we need to reply to a read request,
1391 * new R5_Wantfill requests are held off until
1392 * !STRIPE_BIOFILL_RUN
1394 if (test_and_clear_bit(R5_Wantfill, &dev->flags)) {
1395 struct bio *rbi, *rbi2;
1400 while (rbi && rbi->bi_iter.bi_sector <
1401 dev->sector + RAID5_STRIPE_SECTORS(conf)) {
1402 rbi2 = r5_next_bio(conf, rbi, dev->sector);
1408 clear_bit(STRIPE_BIOFILL_RUN, &sh->state);
1410 set_bit(STRIPE_HANDLE, &sh->state);
1411 raid5_release_stripe(sh);
1414 static void ops_run_biofill(struct stripe_head *sh)
1416 struct dma_async_tx_descriptor *tx = NULL;
1417 struct async_submit_ctl submit;
1419 struct r5conf *conf = sh->raid_conf;
1421 BUG_ON(sh->batch_head);
1422 pr_debug("%s: stripe %llu\n", __func__,
1423 (unsigned long long)sh->sector);
1425 for (i = sh->disks; i--; ) {
1426 struct r5dev *dev = &sh->dev[i];
1427 if (test_bit(R5_Wantfill, &dev->flags)) {
1429 spin_lock_irq(&sh->stripe_lock);
1430 dev->read = rbi = dev->toread;
1432 spin_unlock_irq(&sh->stripe_lock);
1433 while (rbi && rbi->bi_iter.bi_sector <
1434 dev->sector + RAID5_STRIPE_SECTORS(conf)) {
1435 tx = async_copy_data(0, rbi, &dev->page,
1437 dev->sector, tx, sh, 0);
1438 rbi = r5_next_bio(conf, rbi, dev->sector);
1443 atomic_inc(&sh->count);
1444 init_async_submit(&submit, ASYNC_TX_ACK, tx, ops_complete_biofill, sh, NULL);
1445 async_trigger_callback(&submit);
1448 static void mark_target_uptodate(struct stripe_head *sh, int target)
1455 tgt = &sh->dev[target];
1456 set_bit(R5_UPTODATE, &tgt->flags);
1457 BUG_ON(!test_bit(R5_Wantcompute, &tgt->flags));
1458 clear_bit(R5_Wantcompute, &tgt->flags);
1461 static void ops_complete_compute(void *stripe_head_ref)
1463 struct stripe_head *sh = stripe_head_ref;
1465 pr_debug("%s: stripe %llu\n", __func__,
1466 (unsigned long long)sh->sector);
1468 /* mark the computed target(s) as uptodate */
1469 mark_target_uptodate(sh, sh->ops.target);
1470 mark_target_uptodate(sh, sh->ops.target2);
1472 clear_bit(STRIPE_COMPUTE_RUN, &sh->state);
1473 if (sh->check_state == check_state_compute_run)
1474 sh->check_state = check_state_compute_result;
1475 set_bit(STRIPE_HANDLE, &sh->state);
1476 raid5_release_stripe(sh);
1479 /* return a pointer to the page-pointer region of the scribble buffer */
1480 static struct page **to_addr_page(struct raid5_percpu *percpu, int i)
1482 return percpu->scribble + i * percpu->scribble_obj_size;
1485 /* return a pointer to the address conversion region of the scribble buffer */
1486 static addr_conv_t *to_addr_conv(struct stripe_head *sh,
1487 struct raid5_percpu *percpu, int i)
1489 return (void *) (to_addr_page(percpu, i) + sh->disks + 2);
1493 * Return a pointer to the recorded per-device page offsets in the scribble buffer.
1495 static unsigned int *
1496 to_addr_offs(struct stripe_head *sh, struct raid5_percpu *percpu)
1498 return (unsigned int *) (to_addr_conv(sh, percpu, 0) + sh->disks + 2);
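/*
 * The per-CPU scribble layout assumed by the three helpers above is, per
 * scribble object i, back to back:
 *
 *   struct page *[disks + 2]   returned by to_addr_page(percpu, i)
 *   addr_conv_t  [disks + 2]   returned by to_addr_conv(sh, percpu, i)
 *   unsigned int [disks + 2]   returned by to_addr_offs(sh, percpu)
 *                              (the offsets always live in object 0)
 *
 * i.e. one slot per data device plus the P and Q destinations.
 */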
1501 static struct dma_async_tx_descriptor *
1502 ops_run_compute5(struct stripe_head *sh, struct raid5_percpu *percpu)
1504 int disks = sh->disks;
1505 struct page **xor_srcs = to_addr_page(percpu, 0);
1506 unsigned int *off_srcs = to_addr_offs(sh, percpu);
1507 int target = sh->ops.target;
1508 struct r5dev *tgt = &sh->dev[target];
1509 struct page *xor_dest = tgt->page;
1510 unsigned int off_dest = tgt->offset;
1512 struct dma_async_tx_descriptor *tx;
1513 struct async_submit_ctl submit;
1516 BUG_ON(sh->batch_head);
1518 pr_debug("%s: stripe %llu block: %d\n",
1519 __func__, (unsigned long long)sh->sector, target);
1520 BUG_ON(!test_bit(R5_Wantcompute, &tgt->flags));
1522 for (i = disks; i--; ) {
1524 off_srcs[count] = sh->dev[i].offset;
1525 xor_srcs[count++] = sh->dev[i].page;
1529 atomic_inc(&sh->count);
1531 init_async_submit(&submit, ASYNC_TX_FENCE|ASYNC_TX_XOR_ZERO_DST, NULL,
1532 ops_complete_compute, sh, to_addr_conv(sh, percpu, 0));
1533 if (unlikely(count == 1))
1534 tx = async_memcpy(xor_dest, xor_srcs[0], off_dest, off_srcs[0],
1535 RAID5_STRIPE_SIZE(sh->raid_conf), &submit);
1537 tx = async_xor_offs(xor_dest, off_dest, xor_srcs, off_srcs, count,
1538 RAID5_STRIPE_SIZE(sh->raid_conf), &submit);
1543 /* set_syndrome_sources - populate source buffers for gen_syndrome
1544 * @srcs - (struct page *) array of size sh->disks
1545 * @offs - (unsigned int) array of offsets, one for each page
1546 * @sh - stripe_head to parse
1548 * Populates srcs in proper layout order for the stripe and returns the
1549 * 'count' of sources to be used in a call to async_gen_syndrome. The P
1550 * destination buffer is recorded in srcs[count] and the Q destination
1551 * is recorded in srcs[count+1].
1553 static int set_syndrome_sources(struct page **srcs,
1555 struct stripe_head *sh,
1558 int disks = sh->disks;
1559 int syndrome_disks = sh->ddf_layout ? disks : (disks - 2);
1560 int d0_idx = raid6_d0(sh);
1564 for (i = 0; i < disks; i++)
1570 int slot = raid6_idx_to_slot(i, sh, &count, syndrome_disks);
1571 struct r5dev *dev = &sh->dev[i];
1573 if (i == sh->qd_idx || i == sh->pd_idx ||
1574 (srctype == SYNDROME_SRC_ALL) ||
1575 (srctype == SYNDROME_SRC_WANT_DRAIN &&
1576 (test_bit(R5_Wantdrain, &dev->flags) ||
1577 test_bit(R5_InJournal, &dev->flags))) ||
1578 (srctype == SYNDROME_SRC_WRITTEN &&
1580 test_bit(R5_InJournal, &dev->flags)))) {
1581 if (test_bit(R5_InJournal, &dev->flags))
1582 srcs[slot] = sh->dev[i].orig_page;
1584 srcs[slot] = sh->dev[i].page;
1586 * For R5_InJournal, PAGE_SIZE must be 4KB and the page will
1587 * not be shared. In that case, dev[i].offset is 0.
1590 offs[slot] = sh->dev[i].offset;
1592 i = raid6_next_disk(i, disks);
1593 } while (i != d0_idx);
1595 return syndrome_disks;
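/*
 * For illustration, with a 6-device md-layout stripe (syndrome_disks == 4),
 *
 *   count = set_syndrome_sources(blocks, offs, sh, SYNDROME_SRC_ALL);
 *
 * returns count == 4 with blocks[0..3] holding the data pages in syndrome
 * order and blocks[4]/blocks[5] the P and Q pages, so callers can pass
 * blocks/offs straight to async_gen_syndrome(..., count + 2, ...), as
 * ops_run_compute6_1() below does.
 */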
1598 static struct dma_async_tx_descriptor *
1599 ops_run_compute6_1(struct stripe_head *sh, struct raid5_percpu *percpu)
1601 int disks = sh->disks;
1602 struct page **blocks = to_addr_page(percpu, 0);
1603 unsigned int *offs = to_addr_offs(sh, percpu);
1605 int qd_idx = sh->qd_idx;
1606 struct dma_async_tx_descriptor *tx;
1607 struct async_submit_ctl submit;
1610 unsigned int dest_off;
1614 BUG_ON(sh->batch_head);
1615 if (sh->ops.target < 0)
1616 target = sh->ops.target2;
1617 else if (sh->ops.target2 < 0)
1618 target = sh->ops.target;
1620 /* we should only have one valid target */
1623 pr_debug("%s: stripe %llu block: %d\n",
1624 __func__, (unsigned long long)sh->sector, target);
1626 tgt = &sh->dev[target];
1627 BUG_ON(!test_bit(R5_Wantcompute, &tgt->flags));
1629 dest_off = tgt->offset;
1631 atomic_inc(&sh->count);
1633 if (target == qd_idx) {
1634 count = set_syndrome_sources(blocks, offs, sh, SYNDROME_SRC_ALL);
1635 blocks[count] = NULL; /* regenerating p is not necessary */
1636 BUG_ON(blocks[count+1] != dest); /* q should already be set */
1637 init_async_submit(&submit, ASYNC_TX_FENCE, NULL,
1638 ops_complete_compute, sh,
1639 to_addr_conv(sh, percpu, 0));
1640 tx = async_gen_syndrome(blocks, offs, count+2,
1641 RAID5_STRIPE_SIZE(sh->raid_conf), &submit);
1643 /* Compute any data- or p-drive using XOR */
1645 for (i = disks; i-- ; ) {
1646 if (i == target || i == qd_idx)
1648 offs[count] = sh->dev[i].offset;
1649 blocks[count++] = sh->dev[i].page;
1652 init_async_submit(&submit, ASYNC_TX_FENCE|ASYNC_TX_XOR_ZERO_DST,
1653 NULL, ops_complete_compute, sh,
1654 to_addr_conv(sh, percpu, 0));
1655 tx = async_xor_offs(dest, dest_off, blocks, offs, count,
1656 RAID5_STRIPE_SIZE(sh->raid_conf), &submit);
1662 static struct dma_async_tx_descriptor *
1663 ops_run_compute6_2(struct stripe_head *sh, struct raid5_percpu *percpu)
1665 int i, count, disks = sh->disks;
1666 int syndrome_disks = sh->ddf_layout ? disks : disks-2;
1667 int d0_idx = raid6_d0(sh);
1668 int faila = -1, failb = -1;
1669 int target = sh->ops.target;
1670 int target2 = sh->ops.target2;
1671 struct r5dev *tgt = &sh->dev[target];
1672 struct r5dev *tgt2 = &sh->dev[target2];
1673 struct dma_async_tx_descriptor *tx;
1674 struct page **blocks = to_addr_page(percpu, 0);
1675 unsigned int *offs = to_addr_offs(sh, percpu);
1676 struct async_submit_ctl submit;
1678 BUG_ON(sh->batch_head);
1679 pr_debug("%s: stripe %llu block1: %d block2: %d\n",
1680 __func__, (unsigned long long)sh->sector, target, target2);
1681 BUG_ON(target < 0 || target2 < 0);
1682 BUG_ON(!test_bit(R5_Wantcompute, &tgt->flags));
1683 BUG_ON(!test_bit(R5_Wantcompute, &tgt2->flags));
1685 /* we need to open-code set_syndrome_sources to handle the
1686 * slot number conversion for 'faila' and 'failb'
1688 for (i = 0; i < disks ; i++) {
1695 int slot = raid6_idx_to_slot(i, sh, &count, syndrome_disks);
1697 offs[slot] = sh->dev[i].offset;
1698 blocks[slot] = sh->dev[i].page;
1704 i = raid6_next_disk(i, disks);
1705 } while (i != d0_idx);
1707 BUG_ON(faila == failb);
1710 pr_debug("%s: stripe: %llu faila: %d failb: %d\n",
1711 __func__, (unsigned long long)sh->sector, faila, failb);
1713 atomic_inc(&sh->count);
1715 if (failb == syndrome_disks+1) {
1716 /* Q disk is one of the missing disks */
1717 if (faila == syndrome_disks) {
1718 /* Missing P+Q, just recompute */
1719 init_async_submit(&submit, ASYNC_TX_FENCE, NULL,
1720 ops_complete_compute, sh,
1721 to_addr_conv(sh, percpu, 0));
1722 return async_gen_syndrome(blocks, offs, syndrome_disks+2,
1723 RAID5_STRIPE_SIZE(sh->raid_conf),
1727 unsigned int dest_off;
1729 int qd_idx = sh->qd_idx;
1731 /* Missing D+Q: recompute D from P, then recompute Q */
1732 if (target == qd_idx)
1733 data_target = target2;
1735 data_target = target;
1738 for (i = disks; i-- ; ) {
1739 if (i == data_target || i == qd_idx)
1741 offs[count] = sh->dev[i].offset;
1742 blocks[count++] = sh->dev[i].page;
1744 dest = sh->dev[data_target].page;
1745 dest_off = sh->dev[data_target].offset;
1746 init_async_submit(&submit,
1747 ASYNC_TX_FENCE|ASYNC_TX_XOR_ZERO_DST,
1749 to_addr_conv(sh, percpu, 0));
1750 tx = async_xor_offs(dest, dest_off, blocks, offs, count,
1751 RAID5_STRIPE_SIZE(sh->raid_conf),
1754 count = set_syndrome_sources(blocks, offs, sh, SYNDROME_SRC_ALL);
1755 init_async_submit(&submit, ASYNC_TX_FENCE, tx,
1756 ops_complete_compute, sh,
1757 to_addr_conv(sh, percpu, 0));
1758 return async_gen_syndrome(blocks, offs, count+2,
1759 RAID5_STRIPE_SIZE(sh->raid_conf),
1763 init_async_submit(&submit, ASYNC_TX_FENCE, NULL,
1764 ops_complete_compute, sh,
1765 to_addr_conv(sh, percpu, 0));
1766 if (failb == syndrome_disks) {
1767 /* We're missing D+P. */
1768 return async_raid6_datap_recov(syndrome_disks+2,
1769 RAID5_STRIPE_SIZE(sh->raid_conf),
1771 blocks, offs, &submit);
1773 /* We're missing D+D. */
1774 return async_raid6_2data_recov(syndrome_disks+2,
1775 RAID5_STRIPE_SIZE(sh->raid_conf),
1777 blocks, offs, &submit);
1782 static void ops_complete_prexor(void *stripe_head_ref)
1784 struct stripe_head *sh = stripe_head_ref;
1786 pr_debug("%s: stripe %llu\n", __func__,
1787 (unsigned long long)sh->sector);
1789 if (r5c_is_writeback(sh->raid_conf->log))
1791 * raid5-cache write back uses orig_page during prexor.
1792 * After prexor, it is time to free orig_page
1794 r5c_release_extra_page(sh);
1797 static struct dma_async_tx_descriptor *
1798 ops_run_prexor5(struct stripe_head *sh, struct raid5_percpu *percpu,
1799 struct dma_async_tx_descriptor *tx)
1801 int disks = sh->disks;
1802 struct page **xor_srcs = to_addr_page(percpu, 0);
1803 unsigned int *off_srcs = to_addr_offs(sh, percpu);
1804 int count = 0, pd_idx = sh->pd_idx, i;
1805 struct async_submit_ctl submit;
1807 /* existing parity data subtracted */
1808 unsigned int off_dest = off_srcs[count] = sh->dev[pd_idx].offset;
1809 struct page *xor_dest = xor_srcs[count++] = sh->dev[pd_idx].page;
1811 BUG_ON(sh->batch_head);
1812 pr_debug("%s: stripe %llu\n", __func__,
1813 (unsigned long long)sh->sector);
1815 for (i = disks; i--; ) {
1816 struct r5dev *dev = &sh->dev[i];
1817 /* Only process blocks that are known to be uptodate */
1818 if (test_bit(R5_InJournal, &dev->flags)) {
1820 * For this case, PAGE_SIZE must be equal to 4KB and
1821 * page offset is zero.
1823 off_srcs[count] = dev->offset;
1824 xor_srcs[count++] = dev->orig_page;
1825 } else if (test_bit(R5_Wantdrain, &dev->flags)) {
1826 off_srcs[count] = dev->offset;
1827 xor_srcs[count++] = dev->page;
1831 init_async_submit(&submit, ASYNC_TX_FENCE|ASYNC_TX_XOR_DROP_DST, tx,
1832 ops_complete_prexor, sh, to_addr_conv(sh, percpu, 0));
1833 tx = async_xor_offs(xor_dest, off_dest, xor_srcs, off_srcs, count,
1834 RAID5_STRIPE_SIZE(sh->raid_conf), &submit);
1839 static struct dma_async_tx_descriptor *
1840 ops_run_prexor6(struct stripe_head *sh, struct raid5_percpu *percpu,
1841 struct dma_async_tx_descriptor *tx)
1843 struct page **blocks = to_addr_page(percpu, 0);
1844 unsigned int *offs = to_addr_offs(sh, percpu);
1846 struct async_submit_ctl submit;
1848 pr_debug("%s: stripe %llu\n", __func__,
1849 (unsigned long long)sh->sector);
1851 count = set_syndrome_sources(blocks, offs, sh, SYNDROME_SRC_WANT_DRAIN);
1853 init_async_submit(&submit, ASYNC_TX_FENCE|ASYNC_TX_PQ_XOR_DST, tx,
1854 ops_complete_prexor, sh, to_addr_conv(sh, percpu, 0));
1855 tx = async_gen_syndrome(blocks, offs, count+2,
1856 RAID5_STRIPE_SIZE(sh->raid_conf), &submit);
1861 static struct dma_async_tx_descriptor *
1862 ops_run_biodrain(struct stripe_head *sh, struct dma_async_tx_descriptor *tx)
1864 struct r5conf *conf = sh->raid_conf;
1865 int disks = sh->disks;
1867 struct stripe_head *head_sh = sh;
1869 pr_debug("%s: stripe %llu\n", __func__,
1870 (unsigned long long)sh->sector);
1872 for (i = disks; i--; ) {
1877 if (test_and_clear_bit(R5_Wantdrain, &head_sh->dev[i].flags)) {
1883 * clear R5_InJournal, so when rewriting a page in
1884 * journal, it is not skipped by r5l_log_stripe()
1886 clear_bit(R5_InJournal, &dev->flags);
1887 spin_lock_irq(&sh->stripe_lock);
1888 chosen = dev->towrite;
1889 dev->towrite = NULL;
1890 sh->overwrite_disks = 0;
1891 BUG_ON(dev->written);
1892 wbi = dev->written = chosen;
1893 spin_unlock_irq(&sh->stripe_lock);
1894 WARN_ON(dev->page != dev->orig_page);
1896 while (wbi && wbi->bi_iter.bi_sector <
1897 dev->sector + RAID5_STRIPE_SECTORS(conf)) {
1898 if (wbi->bi_opf & REQ_FUA)
1899 set_bit(R5_WantFUA, &dev->flags);
1900 if (wbi->bi_opf & REQ_SYNC)
1901 set_bit(R5_SyncIO, &dev->flags);
1902 if (bio_op(wbi) == REQ_OP_DISCARD)
1903 set_bit(R5_Discard, &dev->flags);
1905 tx = async_copy_data(1, wbi, &dev->page,
1907 dev->sector, tx, sh,
1908 r5c_is_writeback(conf->log));
1909 if (dev->page != dev->orig_page &&
1910 !r5c_is_writeback(conf->log)) {
1911 set_bit(R5_SkipCopy, &dev->flags);
1912 clear_bit(R5_UPTODATE, &dev->flags);
1913 clear_bit(R5_OVERWRITE, &dev->flags);
1916 wbi = r5_next_bio(conf, wbi, dev->sector);
1919 if (head_sh->batch_head) {
1920 sh = list_first_entry(&sh->batch_list,
1933 static void ops_complete_reconstruct(void *stripe_head_ref)
1935 struct stripe_head *sh = stripe_head_ref;
1936 int disks = sh->disks;
1937 int pd_idx = sh->pd_idx;
1938 int qd_idx = sh->qd_idx;
1940 bool fua = false, sync = false, discard = false;
1942 pr_debug("%s: stripe %llu\n", __func__,
1943 (unsigned long long)sh->sector);
1945 for (i = disks; i--; ) {
1946 fua |= test_bit(R5_WantFUA, &sh->dev[i].flags);
1947 sync |= test_bit(R5_SyncIO, &sh->dev[i].flags);
1948 discard |= test_bit(R5_Discard, &sh->dev[i].flags);
1951 for (i = disks; i--; ) {
1952 struct r5dev *dev = &sh->dev[i];
1954 if (dev->written || i == pd_idx || i == qd_idx) {
1955 if (!discard && !test_bit(R5_SkipCopy, &dev->flags)) {
1956 set_bit(R5_UPTODATE, &dev->flags);
1957 if (test_bit(STRIPE_EXPAND_READY, &sh->state))
1958 set_bit(R5_Expanded, &dev->flags);
1961 set_bit(R5_WantFUA, &dev->flags);
1963 set_bit(R5_SyncIO, &dev->flags);
1967 if (sh->reconstruct_state == reconstruct_state_drain_run)
1968 sh->reconstruct_state = reconstruct_state_drain_result;
1969 else if (sh->reconstruct_state == reconstruct_state_prexor_drain_run)
1970 sh->reconstruct_state = reconstruct_state_prexor_drain_result;
1972 BUG_ON(sh->reconstruct_state != reconstruct_state_run);
1973 sh->reconstruct_state = reconstruct_state_result;
1976 set_bit(STRIPE_HANDLE, &sh->state);
1977 raid5_release_stripe(sh);
1981 ops_run_reconstruct5(struct stripe_head *sh, struct raid5_percpu *percpu,
1982 struct dma_async_tx_descriptor *tx)
1984 int disks = sh->disks;
1985 struct page **xor_srcs;
1986 unsigned int *off_srcs;
1987 struct async_submit_ctl submit;
1988 int count, pd_idx = sh->pd_idx, i;
1989 struct page *xor_dest;
1990 unsigned int off_dest;
1992 unsigned long flags;
1994 struct stripe_head *head_sh = sh;
1997 pr_debug("%s: stripe %llu\n", __func__,
1998 (unsigned long long)sh->sector);
2000 for (i = 0; i < sh->disks; i++) {
2003 if (!test_bit(R5_Discard, &sh->dev[i].flags))
2006 if (i >= sh->disks) {
2007 atomic_inc(&sh->count);
2008 set_bit(R5_Discard, &sh->dev[pd_idx].flags);
2009 ops_complete_reconstruct(sh);
2014 xor_srcs = to_addr_page(percpu, j);
2015 off_srcs = to_addr_offs(sh, percpu);
2016 /* check if prexor is active, which means we only process blocks
2017 * that are part of a read-modify-write (written)
2019 if (head_sh->reconstruct_state == reconstruct_state_prexor_drain_run) {
2021 off_dest = off_srcs[count] = sh->dev[pd_idx].offset;
2022 xor_dest = xor_srcs[count++] = sh->dev[pd_idx].page;
2023 for (i = disks; i--; ) {
2024 struct r5dev *dev = &sh->dev[i];
2025 if (head_sh->dev[i].written ||
2026 test_bit(R5_InJournal, &head_sh->dev[i].flags)) {
2027 off_srcs[count] = dev->offset;
2028 xor_srcs[count++] = dev->page;
2032 xor_dest = sh->dev[pd_idx].page;
2033 off_dest = sh->dev[pd_idx].offset;
2034 for (i = disks; i--; ) {
2035 struct r5dev *dev = &sh->dev[i];
2037 off_srcs[count] = dev->offset;
2038 xor_srcs[count++] = dev->page;
2043 /* 1/ if we prexor'd then the dest is reused as a source
2044 * 2/ if we did not prexor then we are redoing the parity
2045 * set ASYNC_TX_XOR_DROP_DST and ASYNC_TX_XOR_ZERO_DST
2046 * for the synchronous xor case
2048 last_stripe = !head_sh->batch_head ||
2049 list_first_entry(&sh->batch_list,
2050 struct stripe_head, batch_list) == head_sh;
2052 flags = ASYNC_TX_ACK |
2053 (prexor ? ASYNC_TX_XOR_DROP_DST : ASYNC_TX_XOR_ZERO_DST);
2055 atomic_inc(&head_sh->count);
2056 init_async_submit(&submit, flags, tx, ops_complete_reconstruct, head_sh,
2057 to_addr_conv(sh, percpu, j));
2059 flags = prexor ? ASYNC_TX_XOR_DROP_DST : ASYNC_TX_XOR_ZERO_DST;
2060 init_async_submit(&submit, flags, tx, NULL, NULL,
2061 to_addr_conv(sh, percpu, j));
2064 if (unlikely(count == 1))
2065 tx = async_memcpy(xor_dest, xor_srcs[0], off_dest, off_srcs[0],
2066 RAID5_STRIPE_SIZE(sh->raid_conf), &submit);
2068 tx = async_xor_offs(xor_dest, off_dest, xor_srcs, off_srcs, count,
2069 RAID5_STRIPE_SIZE(sh->raid_conf), &submit);
2072 sh = list_first_entry(&sh->batch_list, struct stripe_head,
2079 ops_run_reconstruct6(struct stripe_head *sh, struct raid5_percpu *percpu,
2080 struct dma_async_tx_descriptor *tx)
2082 struct async_submit_ctl submit;
2083 struct page **blocks;
2085 int count, i, j = 0;
2086 struct stripe_head *head_sh = sh;
2089 unsigned long txflags;
2091 pr_debug("%s: stripe %llu\n", __func__, (unsigned long long)sh->sector);
2093 for (i = 0; i < sh->disks; i++) {
2094 if (sh->pd_idx == i || sh->qd_idx == i)
2096 if (!test_bit(R5_Discard, &sh->dev[i].flags))
2099 if (i >= sh->disks) {
2100 atomic_inc(&sh->count);
2101 set_bit(R5_Discard, &sh->dev[sh->pd_idx].flags);
2102 set_bit(R5_Discard, &sh->dev[sh->qd_idx].flags);
2103 ops_complete_reconstruct(sh);
2108 blocks = to_addr_page(percpu, j);
2109 offs = to_addr_offs(sh, percpu);
2111 if (sh->reconstruct_state == reconstruct_state_prexor_drain_run) {
2112 synflags = SYNDROME_SRC_WRITTEN;
2113 txflags = ASYNC_TX_ACK | ASYNC_TX_PQ_XOR_DST;
2115 synflags = SYNDROME_SRC_ALL;
2116 txflags = ASYNC_TX_ACK;
2119 count = set_syndrome_sources(blocks, offs, sh, synflags);
2120 last_stripe = !head_sh->batch_head ||
2121 list_first_entry(&sh->batch_list,
2122 struct stripe_head, batch_list) == head_sh;
2125 atomic_inc(&head_sh->count);
2126 init_async_submit(&submit, txflags, tx, ops_complete_reconstruct,
2127 head_sh, to_addr_conv(sh, percpu, j));
2129 init_async_submit(&submit, 0, tx, NULL, NULL,
2130 to_addr_conv(sh, percpu, j));
2131 tx = async_gen_syndrome(blocks, offs, count+2,
2132 RAID5_STRIPE_SIZE(sh->raid_conf), &submit);
2135 sh = list_first_entry(&sh->batch_list, struct stripe_head,
2141 static void ops_complete_check(void *stripe_head_ref)
2143 struct stripe_head *sh = stripe_head_ref;
2145 pr_debug("%s: stripe %llu\n", __func__,
2146 (unsigned long long)sh->sector);
2148 sh->check_state = check_state_check_result;
2149 set_bit(STRIPE_HANDLE, &sh->state);
2150 raid5_release_stripe(sh);
2153 static void ops_run_check_p(struct stripe_head *sh, struct raid5_percpu *percpu)
2155 int disks = sh->disks;
2156 int pd_idx = sh->pd_idx;
2157 int qd_idx = sh->qd_idx;
2158 struct page *xor_dest;
2159 unsigned int off_dest;
2160 struct page **xor_srcs = to_addr_page(percpu, 0);
2161 unsigned int *off_srcs = to_addr_offs(sh, percpu);
2162 struct dma_async_tx_descriptor *tx;
2163 struct async_submit_ctl submit;
2167 pr_debug("%s: stripe %llu\n", __func__,
2168 (unsigned long long)sh->sector);
2170 BUG_ON(sh->batch_head);
2172 xor_dest = sh->dev[pd_idx].page;
2173 off_dest = sh->dev[pd_idx].offset;
2174 off_srcs[count] = off_dest;
2175 xor_srcs[count++] = xor_dest;
2176 for (i = disks; i--; ) {
2177 if (i == pd_idx || i == qd_idx)
2179 off_srcs[count] = sh->dev[i].offset;
2180 xor_srcs[count++] = sh->dev[i].page;
2183 init_async_submit(&submit, 0, NULL, NULL, NULL,
2184 to_addr_conv(sh, percpu, 0));
2185 tx = async_xor_val_offs(xor_dest, off_dest, xor_srcs, off_srcs, count,
2186 RAID5_STRIPE_SIZE(sh->raid_conf),
2187 &sh->ops.zero_sum_result, &submit);
2189 atomic_inc(&sh->count);
2190 init_async_submit(&submit, ASYNC_TX_ACK, tx, ops_complete_check, sh, NULL);
2191 tx = async_trigger_callback(&submit);
2194 static void ops_run_check_pq(struct stripe_head *sh, struct raid5_percpu *percpu, int checkp)
2196 struct page **srcs = to_addr_page(percpu, 0);
2197 unsigned int *offs = to_addr_offs(sh, percpu);
2198 struct async_submit_ctl submit;
2201 pr_debug("%s: stripe %llu checkp: %d\n", __func__,
2202 (unsigned long long)sh->sector, checkp);
2204 BUG_ON(sh->batch_head);
2205 count = set_syndrome_sources(srcs, offs, sh, SYNDROME_SRC_ALL);
2209 atomic_inc(&sh->count);
2210 init_async_submit(&submit, ASYNC_TX_ACK, NULL, ops_complete_check,
2211 sh, to_addr_conv(sh, percpu, 0));
2212 async_syndrome_val(srcs, offs, count+2,
2213 RAID5_STRIPE_SIZE(sh->raid_conf),
2214 &sh->ops.zero_sum_result, percpu->spare_page, 0, &submit);
2217 static void raid_run_ops(struct stripe_head *sh, unsigned long ops_request)
2219 int overlap_clear = 0, i, disks = sh->disks;
2220 struct dma_async_tx_descriptor *tx = NULL;
2221 struct r5conf *conf = sh->raid_conf;
2222 int level = conf->level;
2223 struct raid5_percpu *percpu;
2225 local_lock(&conf->percpu->lock);
2226 percpu = this_cpu_ptr(conf->percpu);
2227 if (test_bit(STRIPE_OP_BIOFILL, &ops_request)) {
2228 ops_run_biofill(sh);
2232 if (test_bit(STRIPE_OP_COMPUTE_BLK, &ops_request)) {
2234 tx = ops_run_compute5(sh, percpu);
2236 if (sh->ops.target2 < 0 || sh->ops.target < 0)
2237 tx = ops_run_compute6_1(sh, percpu);
2239 tx = ops_run_compute6_2(sh, percpu);
2241 /* terminate the chain if reconstruct is not set to be run */
2242 if (tx && !test_bit(STRIPE_OP_RECONSTRUCT, &ops_request))
2246 if (test_bit(STRIPE_OP_PREXOR, &ops_request)) {
2248 tx = ops_run_prexor5(sh, percpu, tx);
2250 tx = ops_run_prexor6(sh, percpu, tx);
2253 if (test_bit(STRIPE_OP_PARTIAL_PARITY, &ops_request))
2254 tx = ops_run_partial_parity(sh, percpu, tx);
2256 if (test_bit(STRIPE_OP_BIODRAIN, &ops_request)) {
2257 tx = ops_run_biodrain(sh, tx);
2261 if (test_bit(STRIPE_OP_RECONSTRUCT, &ops_request)) {
2263 ops_run_reconstruct5(sh, percpu, tx);
2265 ops_run_reconstruct6(sh, percpu, tx);
2268 if (test_bit(STRIPE_OP_CHECK, &ops_request)) {
2269 if (sh->check_state == check_state_run)
2270 ops_run_check_p(sh, percpu);
2271 else if (sh->check_state == check_state_run_q)
2272 ops_run_check_pq(sh, percpu, 0);
2273 else if (sh->check_state == check_state_run_pq)
2274 ops_run_check_pq(sh, percpu, 1);
2279 if (overlap_clear && !sh->batch_head) {
2280 for (i = disks; i--; ) {
2281 struct r5dev *dev = &sh->dev[i];
2282 if (test_and_clear_bit(R5_Overlap, &dev->flags))
2283 wake_up(&sh->raid_conf->wait_for_overlap);
2286 local_unlock(&conf->percpu->lock);
2289 static void free_stripe(struct kmem_cache *sc, struct stripe_head *sh)
2291 #if PAGE_SIZE != DEFAULT_STRIPE_SIZE
2295 __free_page(sh->ppl_page);
2296 kmem_cache_free(sc, sh);
2299 static struct stripe_head *alloc_stripe(struct kmem_cache *sc, gfp_t gfp,
2300 int disks, struct r5conf *conf)
2302 struct stripe_head *sh;
2304 sh = kmem_cache_zalloc(sc, gfp);
2306 spin_lock_init(&sh->stripe_lock);
2307 spin_lock_init(&sh->batch_lock);
2308 INIT_LIST_HEAD(&sh->batch_list);
2309 INIT_LIST_HEAD(&sh->lru);
2310 INIT_LIST_HEAD(&sh->r5c);
2311 INIT_LIST_HEAD(&sh->log_list);
2312 atomic_set(&sh->count, 1);
2313 sh->raid_conf = conf;
2314 sh->log_start = MaxSector;
2316 if (raid5_has_ppl(conf)) {
2317 sh->ppl_page = alloc_page(gfp);
2318 if (!sh->ppl_page) {
2319 free_stripe(sc, sh);
2323 #if PAGE_SIZE != DEFAULT_STRIPE_SIZE
2324 if (init_stripe_shared_pages(sh, conf, disks)) {
2325 free_stripe(sc, sh);
2332 static int grow_one_stripe(struct r5conf *conf, gfp_t gfp)
2334 struct stripe_head *sh;
2336 sh = alloc_stripe(conf->slab_cache, gfp, conf->pool_size, conf);
2340 if (grow_buffers(sh, gfp)) {
2342 free_stripe(conf->slab_cache, sh);
2345 sh->hash_lock_index =
2346 conf->max_nr_stripes % NR_STRIPE_HASH_LOCKS;
2347 /* we just created an active stripe so... */
2348 atomic_inc(&conf->active_stripes);
2350 raid5_release_stripe(sh);
2351 conf->max_nr_stripes++;
2355 static int grow_stripes(struct r5conf *conf, int num)
2357 struct kmem_cache *sc;
2358 size_t namelen = sizeof(conf->cache_name[0]);
2359 int devs = max(conf->raid_disks, conf->previous_raid_disks);
2361 if (conf->mddev->gendisk)
2362 snprintf(conf->cache_name[0], namelen,
2363 "raid%d-%s", conf->level, mdname(conf->mddev));
2365 snprintf(conf->cache_name[0], namelen,
2366 "raid%d-%p", conf->level, conf->mddev);
2367 snprintf(conf->cache_name[1], namelen, "%.27s-alt", conf->cache_name[0]);
2369 conf->active_name = 0;
2370 sc = kmem_cache_create(conf->cache_name[conf->active_name],
2371 sizeof(struct stripe_head)+(devs-1)*sizeof(struct r5dev),
2375 conf->slab_cache = sc;
2376 conf->pool_size = devs;
2378 if (!grow_one_stripe(conf, GFP_KERNEL))
2385 * scribble_alloc - allocate percpu scribble buffer for required size
2386 * of the scribble region
2387 * @percpu: from for_each_present_cpu() of the caller
2388 * @num: total number of disks in the array
2389 * @cnt: scribble objs count for required size of the scribble region
2391 * The scribble buffer size must be enough to contain:
2392 * 1/ a struct page pointer for each device in the array +2
2393 * 2/ room to convert each entry in (1) to its corresponding dma
2394 * (dma_map_page()) or page (page_address()) address.
2396 * Note: the +2 is for the destination buffers of the ddf/raid6 case where we
2397 * calculate over all devices (not just the data blocks), using zeros in place
2398 * of the P and Q blocks.
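 *
 * Rough size illustration (figures assumed purely for illustration): with
 * num = 6 devices on a 64-bit build where addr_conv_t is pointer sized,
 * each scribble object below is
 *   sizeof(struct page *) * 8 + sizeof(addr_conv_t) * 8 + sizeof(unsigned int) * 8
 *   = 64 + 64 + 32 = 160 bytes,
 * and kvmalloc_array(cnt, obj_size, ...) allocates cnt such objects.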
2400 static int scribble_alloc(struct raid5_percpu *percpu,
2404 sizeof(struct page *) * (num + 2) +
2405 sizeof(addr_conv_t) * (num + 2) +
2406 sizeof(unsigned int) * (num + 2);
2410	 * If we are in raid array suspend context, we are in memalloc noio
2411	 * context as well, so there is no risk of recursive memory-reclaim
2412	 * I/Os with the GFP_KERNEL flag.
2414 scribble = kvmalloc_array(cnt, obj_size, GFP_KERNEL);
2418 kvfree(percpu->scribble);
2420 percpu->scribble = scribble;
2421 percpu->scribble_obj_size = obj_size;
2425 static int resize_chunks(struct r5conf *conf, int new_disks, int new_sectors)
2431 * Never shrink. And mddev_suspend() could deadlock if this is called
2432 * from raid5d. In that case, scribble_disks and scribble_sectors
2433	 * should already equal new_disks and new_sectors
2435 if (conf->scribble_disks >= new_disks &&
2436 conf->scribble_sectors >= new_sectors)
2438 mddev_suspend(conf->mddev);
2441 for_each_present_cpu(cpu) {
2442 struct raid5_percpu *percpu;
2444 percpu = per_cpu_ptr(conf->percpu, cpu);
2445 err = scribble_alloc(percpu, new_disks,
2446 new_sectors / RAID5_STRIPE_SECTORS(conf));
2452 mddev_resume(conf->mddev);
2454 conf->scribble_disks = new_disks;
2455 conf->scribble_sectors = new_sectors;
2460 static int resize_stripes(struct r5conf *conf, int newsize)
2462 /* Make all the stripes able to hold 'newsize' devices.
2463 * New slots in each stripe get 'page' set to a new page.
2465 * This happens in stages:
2466 * 1/ create a new kmem_cache and allocate the required number of
2468 * 2/ gather all the old stripe_heads and transfer the pages across
2469 * to the new stripe_heads. This will have the side effect of
2470 * freezing the array as once all stripe_heads have been collected,
2471 * no IO will be possible. Old stripe heads are freed once their
2472 * pages have been transferred over, and the old kmem_cache is
2473 * freed when all stripes are done.
2474	 * 3/ reallocate conf->disks to be suitably bigger. If this fails,
2475	 * we simply return a failure status - no need to clean anything up.
2476 * 4/ allocate new pages for the new slots in the new stripe_heads.
2477	 * If this fails, we don't bother trying to shrink the
2478 * stripe_heads down again, we just leave them as they are.
2479 * As each stripe_head is processed the new one is released into
2482	 * Once step 2 is started, we cannot afford to wait for a write,
2483 * so we use GFP_NOIO allocations.
2485 struct stripe_head *osh, *nsh;
2486 LIST_HEAD(newstripes);
2487 struct disk_info *ndisks;
2489 struct kmem_cache *sc;
2493 md_allow_write(conf->mddev);
2496 sc = kmem_cache_create(conf->cache_name[1-conf->active_name],
2497 sizeof(struct stripe_head)+(newsize-1)*sizeof(struct r5dev),
2502 /* Need to ensure auto-resizing doesn't interfere */
2503 mutex_lock(&conf->cache_size_mutex);
2505 for (i = conf->max_nr_stripes; i; i--) {
2506 nsh = alloc_stripe(sc, GFP_KERNEL, newsize, conf);
2510 list_add(&nsh->lru, &newstripes);
2513 /* didn't get enough, give up */
2514 while (!list_empty(&newstripes)) {
2515 nsh = list_entry(newstripes.next, struct stripe_head, lru);
2516 list_del(&nsh->lru);
2517 free_stripe(sc, nsh);
2519 kmem_cache_destroy(sc);
2520 mutex_unlock(&conf->cache_size_mutex);
2523 /* Step 2 - Must use GFP_NOIO now.
2524 * OK, we have enough stripes, start collecting inactive
2525 * stripes and copying them over
2529 list_for_each_entry(nsh, &newstripes, lru) {
2530 lock_device_hash_lock(conf, hash);
2531 wait_event_cmd(conf->wait_for_stripe,
2532 !list_empty(conf->inactive_list + hash),
2533 unlock_device_hash_lock(conf, hash),
2534 lock_device_hash_lock(conf, hash));
2535 osh = get_free_stripe(conf, hash);
2536 unlock_device_hash_lock(conf, hash);
2538 #if PAGE_SIZE != DEFAULT_STRIPE_SIZE
2539 for (i = 0; i < osh->nr_pages; i++) {
2540 nsh->pages[i] = osh->pages[i];
2541 osh->pages[i] = NULL;
2544 for(i=0; i<conf->pool_size; i++) {
2545 nsh->dev[i].page = osh->dev[i].page;
2546 nsh->dev[i].orig_page = osh->dev[i].page;
2547 nsh->dev[i].offset = osh->dev[i].offset;
2549 nsh->hash_lock_index = hash;
2550 free_stripe(conf->slab_cache, osh);
2552 if (cnt >= conf->max_nr_stripes / NR_STRIPE_HASH_LOCKS +
2553 !!((conf->max_nr_stripes % NR_STRIPE_HASH_LOCKS) > hash)) {
2558 kmem_cache_destroy(conf->slab_cache);
2561 * At this point, we are holding all the stripes so the array
2562 * is completely stalled, so now is a good time to resize
2563 * conf->disks and the scribble region
2565 ndisks = kcalloc(newsize, sizeof(struct disk_info), GFP_NOIO);
2567 for (i = 0; i < conf->pool_size; i++)
2568 ndisks[i] = conf->disks[i];
2570 for (i = conf->pool_size; i < newsize; i++) {
2571 ndisks[i].extra_page = alloc_page(GFP_NOIO);
2572 if (!ndisks[i].extra_page)
2577 for (i = conf->pool_size; i < newsize; i++)
2578 if (ndisks[i].extra_page)
2579 put_page(ndisks[i].extra_page);
2583 conf->disks = ndisks;
2588 conf->slab_cache = sc;
2589 conf->active_name = 1-conf->active_name;
2591 /* Step 4, return new stripes to service */
2592 while(!list_empty(&newstripes)) {
2593 nsh = list_entry(newstripes.next, struct stripe_head, lru);
2594 list_del_init(&nsh->lru);
2596 #if PAGE_SIZE != DEFAULT_STRIPE_SIZE
2597 for (i = 0; i < nsh->nr_pages; i++) {
2600 nsh->pages[i] = alloc_page(GFP_NOIO);
2605 for (i = conf->raid_disks; i < newsize; i++) {
2606 if (nsh->dev[i].page)
2608 nsh->dev[i].page = raid5_get_dev_page(nsh, i);
2609 nsh->dev[i].orig_page = nsh->dev[i].page;
2610 nsh->dev[i].offset = raid5_get_page_offset(nsh, i);
2613 for (i=conf->raid_disks; i < newsize; i++)
2614 if (nsh->dev[i].page == NULL) {
2615 struct page *p = alloc_page(GFP_NOIO);
2616 nsh->dev[i].page = p;
2617 nsh->dev[i].orig_page = p;
2618 nsh->dev[i].offset = 0;
2623 raid5_release_stripe(nsh);
2625	/* critical section passed, GFP_NOIO no longer needed */
2628 conf->pool_size = newsize;
2629 mutex_unlock(&conf->cache_size_mutex);
2634 static int drop_one_stripe(struct r5conf *conf)
2636 struct stripe_head *sh;
2637 int hash = (conf->max_nr_stripes - 1) & STRIPE_HASH_LOCKS_MASK;
2639 spin_lock_irq(conf->hash_locks + hash);
2640 sh = get_free_stripe(conf, hash);
2641 spin_unlock_irq(conf->hash_locks + hash);
2644 BUG_ON(atomic_read(&sh->count));
2646 free_stripe(conf->slab_cache, sh);
2647 atomic_dec(&conf->active_stripes);
2648 conf->max_nr_stripes--;
2652 static void shrink_stripes(struct r5conf *conf)
2654 while (conf->max_nr_stripes &&
2655 drop_one_stripe(conf))
2658 kmem_cache_destroy(conf->slab_cache);
2659 conf->slab_cache = NULL;
2663 * This helper wraps rcu_dereference_protected() and can be used when
2664 * it is known that the nr_pending of the rdev is elevated.
2666 static struct md_rdev *rdev_pend_deref(struct md_rdev __rcu *rdev)
2668 return rcu_dereference_protected(rdev,
2669 atomic_read(&rcu_access_pointer(rdev)->nr_pending));
2673 * This helper wraps rcu_dereference_protected() and should be used
2674 * when it is known that the mddev_lock() is held. This is safe
2675	 * because raid5_remove_disk() holds the same lock.
2677 static struct md_rdev *rdev_mdlock_deref(struct mddev *mddev,
2678 struct md_rdev __rcu *rdev)
2680 return rcu_dereference_protected(rdev,
2681 lockdep_is_held(&mddev->reconfig_mutex));
2684 static void raid5_end_read_request(struct bio * bi)
2686 struct stripe_head *sh = bi->bi_private;
2687 struct r5conf *conf = sh->raid_conf;
2688 int disks = sh->disks, i;
2689 struct md_rdev *rdev = NULL;
2692 for (i=0 ; i<disks; i++)
2693 if (bi == &sh->dev[i].req)
2696 pr_debug("end_read_request %llu/%d, count: %d, error %d.\n",
2697 (unsigned long long)sh->sector, i, atomic_read(&sh->count),
2703 if (test_bit(R5_ReadRepl, &sh->dev[i].flags))
2704 /* If replacement finished while this request was outstanding,
2705 * 'replacement' might be NULL already.
2706 * In that case it moved down to 'rdev'.
2707 * rdev is not removed until all requests are finished.
2709 rdev = rdev_pend_deref(conf->disks[i].replacement);
2711 rdev = rdev_pend_deref(conf->disks[i].rdev);
2713 if (use_new_offset(conf, sh))
2714 s = sh->sector + rdev->new_data_offset;
2716 s = sh->sector + rdev->data_offset;
2717 if (!bi->bi_status) {
2718 set_bit(R5_UPTODATE, &sh->dev[i].flags);
2719 if (test_bit(R5_ReadError, &sh->dev[i].flags)) {
2720 /* Note that this cannot happen on a
2721 * replacement device. We just fail those on
2724 pr_info_ratelimited(
2725 "md/raid:%s: read error corrected (%lu sectors at %llu on %pg)\n",
2726 mdname(conf->mddev), RAID5_STRIPE_SECTORS(conf),
2727 (unsigned long long)s,
2729 atomic_add(RAID5_STRIPE_SECTORS(conf), &rdev->corrected_errors);
2730 clear_bit(R5_ReadError, &sh->dev[i].flags);
2731 clear_bit(R5_ReWrite, &sh->dev[i].flags);
2732 } else if (test_bit(R5_ReadNoMerge, &sh->dev[i].flags))
2733 clear_bit(R5_ReadNoMerge, &sh->dev[i].flags);
2735 if (test_bit(R5_InJournal, &sh->dev[i].flags))
2737 * end read for a page in journal, this
2738 * must be preparing for prexor in rmw
2740 set_bit(R5_OrigPageUPTDODATE, &sh->dev[i].flags);
2742 if (atomic_read(&rdev->read_errors))
2743 atomic_set(&rdev->read_errors, 0);
2748 clear_bit(R5_UPTODATE, &sh->dev[i].flags);
2749 if (!(bi->bi_status == BLK_STS_PROTECTION))
2750 atomic_inc(&rdev->read_errors);
2751 if (test_bit(R5_ReadRepl, &sh->dev[i].flags))
2752 pr_warn_ratelimited(
2753 "md/raid:%s: read error on replacement device (sector %llu on %pg).\n",
2754 mdname(conf->mddev),
2755 (unsigned long long)s,
2757 else if (conf->mddev->degraded >= conf->max_degraded) {
2759 pr_warn_ratelimited(
2760 "md/raid:%s: read error not correctable (sector %llu on %pg).\n",
2761 mdname(conf->mddev),
2762 (unsigned long long)s,
2764 } else if (test_bit(R5_ReWrite, &sh->dev[i].flags)) {
2767 pr_warn_ratelimited(
2768 "md/raid:%s: read error NOT corrected!! (sector %llu on %pg).\n",
2769 mdname(conf->mddev),
2770 (unsigned long long)s,
2772 } else if (atomic_read(&rdev->read_errors)
2773 > conf->max_nr_stripes) {
2774 if (!test_bit(Faulty, &rdev->flags)) {
2775 pr_warn("md/raid:%s: %d read_errors > %d stripes\n",
2776 mdname(conf->mddev),
2777 atomic_read(&rdev->read_errors),
2778 conf->max_nr_stripes);
2779 pr_warn("md/raid:%s: Too many read errors, failing device %pg.\n",
2780 mdname(conf->mddev), rdev->bdev);
2784 if (set_bad && test_bit(In_sync, &rdev->flags)
2785 && !test_bit(R5_ReadNoMerge, &sh->dev[i].flags))
2788 if (sh->qd_idx >= 0 && sh->pd_idx == i)
2789 set_bit(R5_ReadError, &sh->dev[i].flags);
2790 else if (test_bit(R5_ReadNoMerge, &sh->dev[i].flags)) {
2791 set_bit(R5_ReadError, &sh->dev[i].flags);
2792 clear_bit(R5_ReadNoMerge, &sh->dev[i].flags);
2794 set_bit(R5_ReadNoMerge, &sh->dev[i].flags);
2796 clear_bit(R5_ReadError, &sh->dev[i].flags);
2797 clear_bit(R5_ReWrite, &sh->dev[i].flags);
2799 && test_bit(In_sync, &rdev->flags)
2800 && rdev_set_badblocks(
2801 rdev, sh->sector, RAID5_STRIPE_SECTORS(conf), 0)))
2802 md_error(conf->mddev, rdev);
2805 rdev_dec_pending(rdev, conf->mddev);
2807 clear_bit(R5_LOCKED, &sh->dev[i].flags);
2808 set_bit(STRIPE_HANDLE, &sh->state);
2809 raid5_release_stripe(sh);
2812 static void raid5_end_write_request(struct bio *bi)
2814 struct stripe_head *sh = bi->bi_private;
2815 struct r5conf *conf = sh->raid_conf;
2816 int disks = sh->disks, i;
2817 struct md_rdev *rdev;
2820 int replacement = 0;
2822 for (i = 0 ; i < disks; i++) {
2823 if (bi == &sh->dev[i].req) {
2824 rdev = rdev_pend_deref(conf->disks[i].rdev);
2827 if (bi == &sh->dev[i].rreq) {
2828 rdev = rdev_pend_deref(conf->disks[i].replacement);
2832 /* rdev was removed and 'replacement'
2833 * replaced it. rdev is not removed
2834 * until all requests are finished.
2836 rdev = rdev_pend_deref(conf->disks[i].rdev);
2840 pr_debug("end_write_request %llu/%d, count %d, error: %d.\n",
2841 (unsigned long long)sh->sector, i, atomic_read(&sh->count),
2850 md_error(conf->mddev, rdev);
2851 else if (is_badblock(rdev, sh->sector,
2852 RAID5_STRIPE_SECTORS(conf),
2853 &first_bad, &bad_sectors))
2854 set_bit(R5_MadeGoodRepl, &sh->dev[i].flags);
2856 if (bi->bi_status) {
2857 set_bit(STRIPE_DEGRADED, &sh->state);
2858 set_bit(WriteErrorSeen, &rdev->flags);
2859 set_bit(R5_WriteError, &sh->dev[i].flags);
2860 if (!test_and_set_bit(WantReplacement, &rdev->flags))
2861 set_bit(MD_RECOVERY_NEEDED,
2862 &rdev->mddev->recovery);
2863 } else if (is_badblock(rdev, sh->sector,
2864 RAID5_STRIPE_SECTORS(conf),
2865 &first_bad, &bad_sectors)) {
2866 set_bit(R5_MadeGood, &sh->dev[i].flags);
2867 if (test_bit(R5_ReadError, &sh->dev[i].flags))
2868 /* That was a successful write so make
2869 * sure it looks like we already did
2872 set_bit(R5_ReWrite, &sh->dev[i].flags);
2875 rdev_dec_pending(rdev, conf->mddev);
2877 if (sh->batch_head && bi->bi_status && !replacement)
2878 set_bit(STRIPE_BATCH_ERR, &sh->batch_head->state);
2881 if (!test_and_clear_bit(R5_DOUBLE_LOCKED, &sh->dev[i].flags))
2882 clear_bit(R5_LOCKED, &sh->dev[i].flags);
2883 set_bit(STRIPE_HANDLE, &sh->state);
2884 raid5_release_stripe(sh);
2886 if (sh->batch_head && sh != sh->batch_head)
2887 raid5_release_stripe(sh->batch_head);
2890 static void raid5_error(struct mddev *mddev, struct md_rdev *rdev)
2892 struct r5conf *conf = mddev->private;
2893 unsigned long flags;
2894 pr_debug("raid456: error called\n");
2896 pr_crit("md/raid:%s: Disk failure on %pg, disabling device.\n",
2897 mdname(mddev), rdev->bdev);
2899 spin_lock_irqsave(&conf->device_lock, flags);
2900 set_bit(Faulty, &rdev->flags);
2901 clear_bit(In_sync, &rdev->flags);
2902 mddev->degraded = raid5_calc_degraded(conf);
2904 if (has_failed(conf)) {
2905 set_bit(MD_BROKEN, &conf->mddev->flags);
2906 conf->recovery_disabled = mddev->recovery_disabled;
2908 pr_crit("md/raid:%s: Cannot continue operation (%d/%d failed).\n",
2909 mdname(mddev), mddev->degraded, conf->raid_disks);
2911 pr_crit("md/raid:%s: Operation continuing on %d devices.\n",
2912 mdname(mddev), conf->raid_disks - mddev->degraded);
2915 spin_unlock_irqrestore(&conf->device_lock, flags);
2916 set_bit(MD_RECOVERY_INTR, &mddev->recovery);
2918 set_bit(Blocked, &rdev->flags);
2919 set_mask_bits(&mddev->sb_flags, 0,
2920 BIT(MD_SB_CHANGE_DEVS) | BIT(MD_SB_CHANGE_PENDING));
2921 r5c_update_on_rdev_error(mddev, rdev);
2925 * Input: a 'big' sector number,
2926 * Output: index of the data and parity disk, and the sector # in them.
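 *
 * Worked example (illustrative numbers only, assuming the default
 * ALGORITHM_LEFT_SYMMETRIC layout): with 4 devices (3 data + 1 parity) and
 * 128-sector chunks, r_sector 1000 gives chunk_offset 104 and chunk_number 7,
 * so *dd_idx starts as 7 % 3 = 1 and stripe = 2; pd_idx = 3 - (2 % 4) = 1 and
 * *dd_idx becomes (1 + 1 + 1) % 4 = 3, i.e. the data lives on device 3 at
 * sector 2 * 128 + 104 = 360, with parity on device 1.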
2928 sector_t raid5_compute_sector(struct r5conf *conf, sector_t r_sector,
2929 int previous, int *dd_idx,
2930 struct stripe_head *sh)
2932 sector_t stripe, stripe2;
2933 sector_t chunk_number;
2934 unsigned int chunk_offset;
2937 sector_t new_sector;
2938 int algorithm = previous ? conf->prev_algo
2940 int sectors_per_chunk = previous ? conf->prev_chunk_sectors
2941 : conf->chunk_sectors;
2942 int raid_disks = previous ? conf->previous_raid_disks
2944 int data_disks = raid_disks - conf->max_degraded;
2946 /* First compute the information on this sector */
2949 * Compute the chunk number and the sector offset inside the chunk
2951 chunk_offset = sector_div(r_sector, sectors_per_chunk);
2952 chunk_number = r_sector;
2955 * Compute the stripe number
2957 stripe = chunk_number;
2958 *dd_idx = sector_div(stripe, data_disks);
2961 * Select the parity disk based on the user selected algorithm.
2963 pd_idx = qd_idx = -1;
2964 switch(conf->level) {
2966 pd_idx = data_disks;
2969 switch (algorithm) {
2970 case ALGORITHM_LEFT_ASYMMETRIC:
2971 pd_idx = data_disks - sector_div(stripe2, raid_disks);
2972 if (*dd_idx >= pd_idx)
2975 case ALGORITHM_RIGHT_ASYMMETRIC:
2976 pd_idx = sector_div(stripe2, raid_disks);
2977 if (*dd_idx >= pd_idx)
2980 case ALGORITHM_LEFT_SYMMETRIC:
2981 pd_idx = data_disks - sector_div(stripe2, raid_disks);
2982 *dd_idx = (pd_idx + 1 + *dd_idx) % raid_disks;
2984 case ALGORITHM_RIGHT_SYMMETRIC:
2985 pd_idx = sector_div(stripe2, raid_disks);
2986 *dd_idx = (pd_idx + 1 + *dd_idx) % raid_disks;
2988 case ALGORITHM_PARITY_0:
2992 case ALGORITHM_PARITY_N:
2993 pd_idx = data_disks;
3001 switch (algorithm) {
3002 case ALGORITHM_LEFT_ASYMMETRIC:
3003 pd_idx = raid_disks - 1 - sector_div(stripe2, raid_disks);
3004 qd_idx = pd_idx + 1;
3005 if (pd_idx == raid_disks-1) {
3006 (*dd_idx)++; /* Q D D D P */
3008 } else if (*dd_idx >= pd_idx)
3009 (*dd_idx) += 2; /* D D P Q D */
3011 case ALGORITHM_RIGHT_ASYMMETRIC:
3012 pd_idx = sector_div(stripe2, raid_disks);
3013 qd_idx = pd_idx + 1;
3014 if (pd_idx == raid_disks-1) {
3015 (*dd_idx)++; /* Q D D D P */
3017 } else if (*dd_idx >= pd_idx)
3018 (*dd_idx) += 2; /* D D P Q D */
3020 case ALGORITHM_LEFT_SYMMETRIC:
3021 pd_idx = raid_disks - 1 - sector_div(stripe2, raid_disks);
3022 qd_idx = (pd_idx + 1) % raid_disks;
3023 *dd_idx = (pd_idx + 2 + *dd_idx) % raid_disks;
3025 case ALGORITHM_RIGHT_SYMMETRIC:
3026 pd_idx = sector_div(stripe2, raid_disks);
3027 qd_idx = (pd_idx + 1) % raid_disks;
3028 *dd_idx = (pd_idx + 2 + *dd_idx) % raid_disks;
3031 case ALGORITHM_PARITY_0:
3036 case ALGORITHM_PARITY_N:
3037 pd_idx = data_disks;
3038 qd_idx = data_disks + 1;
3041 case ALGORITHM_ROTATING_ZERO_RESTART:
3042			/* Exactly the same as RIGHT_ASYMMETRIC, but the order
3043			 * of blocks for computing Q is different.
3045 pd_idx = sector_div(stripe2, raid_disks);
3046 qd_idx = pd_idx + 1;
3047 if (pd_idx == raid_disks-1) {
3048 (*dd_idx)++; /* Q D D D P */
3050 } else if (*dd_idx >= pd_idx)
3051 (*dd_idx) += 2; /* D D P Q D */
3055 case ALGORITHM_ROTATING_N_RESTART:
3056			/* Same as left_asymmetric, but the first stripe is
3057 * D D D P Q rather than
3061 pd_idx = raid_disks - 1 - sector_div(stripe2, raid_disks);
3062 qd_idx = pd_idx + 1;
3063 if (pd_idx == raid_disks-1) {
3064 (*dd_idx)++; /* Q D D D P */
3066 } else if (*dd_idx >= pd_idx)
3067 (*dd_idx) += 2; /* D D P Q D */
3071 case ALGORITHM_ROTATING_N_CONTINUE:
3072 /* Same as left_symmetric but Q is before P */
3073 pd_idx = raid_disks - 1 - sector_div(stripe2, raid_disks);
3074 qd_idx = (pd_idx + raid_disks - 1) % raid_disks;
3075 *dd_idx = (pd_idx + 1 + *dd_idx) % raid_disks;
3079 case ALGORITHM_LEFT_ASYMMETRIC_6:
3080 /* RAID5 left_asymmetric, with Q on last device */
3081 pd_idx = data_disks - sector_div(stripe2, raid_disks-1);
3082 if (*dd_idx >= pd_idx)
3084 qd_idx = raid_disks - 1;
3087 case ALGORITHM_RIGHT_ASYMMETRIC_6:
3088 pd_idx = sector_div(stripe2, raid_disks-1);
3089 if (*dd_idx >= pd_idx)
3091 qd_idx = raid_disks - 1;
3094 case ALGORITHM_LEFT_SYMMETRIC_6:
3095 pd_idx = data_disks - sector_div(stripe2, raid_disks-1);
3096 *dd_idx = (pd_idx + 1 + *dd_idx) % (raid_disks-1);
3097 qd_idx = raid_disks - 1;
3100 case ALGORITHM_RIGHT_SYMMETRIC_6:
3101 pd_idx = sector_div(stripe2, raid_disks-1);
3102 *dd_idx = (pd_idx + 1 + *dd_idx) % (raid_disks-1);
3103 qd_idx = raid_disks - 1;
3106 case ALGORITHM_PARITY_0_6:
3109 qd_idx = raid_disks - 1;
3119 sh->pd_idx = pd_idx;
3120 sh->qd_idx = qd_idx;
3121 sh->ddf_layout = ddf_layout;
3124 * Finally, compute the new sector number
3126 new_sector = (sector_t)stripe * sectors_per_chunk + chunk_offset;
3130 sector_t raid5_compute_blocknr(struct stripe_head *sh, int i, int previous)
3132 struct r5conf *conf = sh->raid_conf;
3133 int raid_disks = sh->disks;
3134 int data_disks = raid_disks - conf->max_degraded;
3135 sector_t new_sector = sh->sector, check;
3136 int sectors_per_chunk = previous ? conf->prev_chunk_sectors
3137 : conf->chunk_sectors;
3138 int algorithm = previous ? conf->prev_algo
3142 sector_t chunk_number;
3143 int dummy1, dd_idx = i;
3145 struct stripe_head sh2;
3147 chunk_offset = sector_div(new_sector, sectors_per_chunk);
3148 stripe = new_sector;
3150 if (i == sh->pd_idx)
3152 switch(conf->level) {
3155 switch (algorithm) {
3156 case ALGORITHM_LEFT_ASYMMETRIC:
3157 case ALGORITHM_RIGHT_ASYMMETRIC:
3161 case ALGORITHM_LEFT_SYMMETRIC:
3162 case ALGORITHM_RIGHT_SYMMETRIC:
3165 i -= (sh->pd_idx + 1);
3167 case ALGORITHM_PARITY_0:
3170 case ALGORITHM_PARITY_N:
3177 if (i == sh->qd_idx)
3178 return 0; /* It is the Q disk */
3179 switch (algorithm) {
3180 case ALGORITHM_LEFT_ASYMMETRIC:
3181 case ALGORITHM_RIGHT_ASYMMETRIC:
3182 case ALGORITHM_ROTATING_ZERO_RESTART:
3183 case ALGORITHM_ROTATING_N_RESTART:
3184 if (sh->pd_idx == raid_disks-1)
3185 i--; /* Q D D D P */
3186 else if (i > sh->pd_idx)
3187 i -= 2; /* D D P Q D */
3189 case ALGORITHM_LEFT_SYMMETRIC:
3190 case ALGORITHM_RIGHT_SYMMETRIC:
3191 if (sh->pd_idx == raid_disks-1)
3192 i--; /* Q D D D P */
3197 i -= (sh->pd_idx + 2);
3200 case ALGORITHM_PARITY_0:
3203 case ALGORITHM_PARITY_N:
3205 case ALGORITHM_ROTATING_N_CONTINUE:
3206 /* Like left_symmetric, but P is before Q */
3207 if (sh->pd_idx == 0)
3208 i--; /* P D D D Q */
3213 i -= (sh->pd_idx + 1);
3216 case ALGORITHM_LEFT_ASYMMETRIC_6:
3217 case ALGORITHM_RIGHT_ASYMMETRIC_6:
3221 case ALGORITHM_LEFT_SYMMETRIC_6:
3222 case ALGORITHM_RIGHT_SYMMETRIC_6:
3224 i += data_disks + 1;
3225 i -= (sh->pd_idx + 1);
3227 case ALGORITHM_PARITY_0_6:
3236 chunk_number = stripe * data_disks + i;
3237 r_sector = chunk_number * sectors_per_chunk + chunk_offset;
3239 check = raid5_compute_sector(conf, r_sector,
3240 previous, &dummy1, &sh2);
3241 if (check != sh->sector || dummy1 != dd_idx || sh2.pd_idx != sh->pd_idx
3242 || sh2.qd_idx != sh->qd_idx) {
3243 pr_warn("md/raid:%s: compute_blocknr: map not correct\n",
3244 mdname(conf->mddev));
3251 * There are cases where we want handle_stripe_dirtying() and
3252 * schedule_reconstruction() to delay towrite to some dev of a stripe.
3254 * This function checks whether we want to delay the towrite. Specifically,
3255 * we delay the towrite when:
3257 * 1. degraded stripe has a non-overwrite to the missing dev, AND this
3258 * stripe has data in journal (for other devices).
3260 * In this case, when reading data for the non-overwrite dev, it is
3261 * necessary to handle complex rmw of write back cache (prexor with
3262 * orig_page, and xor with page). To keep read path simple, we would
3263 * like to flush data in journal to RAID disks first, so complex rmw
3264 * is handled in the write path (handle_stripe_dirtying).
3266 * 2. when journal space is critical (R5C_LOG_CRITICAL=1)
3268 * It is important to be able to flush all stripes in raid5-cache.
3269 * Therefore, we need reserve some space on the journal device for
3270 * these flushes. If flush operation includes pending writes to the
3271 * stripe, we need to reserve (conf->raid_disks + 1) pages per stripe
3272 * for the flush out. If we exclude these pending writes from flush
3273 * operation, we only need (conf->max_degraded + 1) pages per stripe.
3274 * Therefore, excluding pending writes in these cases enables more
3275 * efficient use of the journal device.
3277 * Note: To make sure the stripe makes progress, we only delay
3278 * towrite for stripes with data already in journal (injournal > 0).
3279 * When LOG_CRITICAL, stripes with injournal == 0 will be sent to
3280 * no_space_stripes list.
3282 * 3. during journal failure
3283 * In journal failure, we try to flush all cached data to raid disks
3284 * based on data in stripe cache. The array is read-only to upper
3285 * layers, so we would skip all pending writes.
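 *
 * In short, paraphrasing the three cases above: the towrite is delayed when
 * the stripe already has data in the journal (injournal > 0) and either the
 * dev is a non-overwrite on a device that is not R5_Insync, the journal space
 * is critical (R5C_LOG_CRITICAL), or the log has failed.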
3288 static inline bool delay_towrite(struct r5conf *conf,
3290 struct stripe_head_state *s)
3293 if (!test_bit(R5_OVERWRITE, &dev->flags) &&
3294 !test_bit(R5_Insync, &dev->flags) && s->injournal)
3297 if (test_bit(R5C_LOG_CRITICAL, &conf->cache_state) &&
3301 if (s->log_failed && s->injournal)
3307 schedule_reconstruction(struct stripe_head *sh, struct stripe_head_state *s,
3308 int rcw, int expand)
3310 int i, pd_idx = sh->pd_idx, qd_idx = sh->qd_idx, disks = sh->disks;
3311 struct r5conf *conf = sh->raid_conf;
3312 int level = conf->level;
3316	 * In some cases, handle_stripe_dirtying initially decides to
3317	 * run rmw and allocates an extra page for prexor. However, rcw
3318	 * turns out to be cheaper later on. We need to free the extra page now,
3319 * because we won't be able to do that in ops_complete_prexor().
3321 r5c_release_extra_page(sh);
3323 for (i = disks; i--; ) {
3324 struct r5dev *dev = &sh->dev[i];
3326 if (dev->towrite && !delay_towrite(conf, dev, s)) {
3327 set_bit(R5_LOCKED, &dev->flags);
3328 set_bit(R5_Wantdrain, &dev->flags);
3330 clear_bit(R5_UPTODATE, &dev->flags);
3332 } else if (test_bit(R5_InJournal, &dev->flags)) {
3333 set_bit(R5_LOCKED, &dev->flags);
3337 /* if we are not expanding this is a proper write request, and
3338 * there will be bios with new data to be drained into the
3343 /* False alarm, nothing to do */
3345 sh->reconstruct_state = reconstruct_state_drain_run;
3346 set_bit(STRIPE_OP_BIODRAIN, &s->ops_request);
3348 sh->reconstruct_state = reconstruct_state_run;
3350 set_bit(STRIPE_OP_RECONSTRUCT, &s->ops_request);
3352 if (s->locked + conf->max_degraded == disks)
3353 if (!test_and_set_bit(STRIPE_FULL_WRITE, &sh->state))
3354 atomic_inc(&conf->pending_full_writes);
3356 BUG_ON(!(test_bit(R5_UPTODATE, &sh->dev[pd_idx].flags) ||
3357 test_bit(R5_Wantcompute, &sh->dev[pd_idx].flags)));
3358 BUG_ON(level == 6 &&
3359 (!(test_bit(R5_UPTODATE, &sh->dev[qd_idx].flags) ||
3360 test_bit(R5_Wantcompute, &sh->dev[qd_idx].flags))));
3362 for (i = disks; i--; ) {
3363 struct r5dev *dev = &sh->dev[i];
3364 if (i == pd_idx || i == qd_idx)
3368 (test_bit(R5_UPTODATE, &dev->flags) ||
3369 test_bit(R5_Wantcompute, &dev->flags))) {
3370 set_bit(R5_Wantdrain, &dev->flags);
3371 set_bit(R5_LOCKED, &dev->flags);
3372 clear_bit(R5_UPTODATE, &dev->flags);
3374 } else if (test_bit(R5_InJournal, &dev->flags)) {
3375 set_bit(R5_LOCKED, &dev->flags);
3380 /* False alarm - nothing to do */
3382 sh->reconstruct_state = reconstruct_state_prexor_drain_run;
3383 set_bit(STRIPE_OP_PREXOR, &s->ops_request);
3384 set_bit(STRIPE_OP_BIODRAIN, &s->ops_request);
3385 set_bit(STRIPE_OP_RECONSTRUCT, &s->ops_request);
3388 /* keep the parity disk(s) locked while asynchronous operations
3391 set_bit(R5_LOCKED, &sh->dev[pd_idx].flags);
3392 clear_bit(R5_UPTODATE, &sh->dev[pd_idx].flags);
3396 int qd_idx = sh->qd_idx;
3397 struct r5dev *dev = &sh->dev[qd_idx];
3399 set_bit(R5_LOCKED, &dev->flags);
3400 clear_bit(R5_UPTODATE, &dev->flags);
3404 if (raid5_has_ppl(sh->raid_conf) && sh->ppl_page &&
3405 test_bit(STRIPE_OP_BIODRAIN, &s->ops_request) &&
3406 !test_bit(STRIPE_FULL_WRITE, &sh->state) &&
3407 test_bit(R5_Insync, &sh->dev[pd_idx].flags))
3408 set_bit(STRIPE_OP_PARTIAL_PARITY, &s->ops_request);
3410 pr_debug("%s: stripe %llu locked: %d ops_request: %lx\n",
3411 __func__, (unsigned long long)sh->sector,
3412 s->locked, s->ops_request);
3416 * Each stripe/dev can have one or more bion attached.
3417 * toread/towrite point to the first in a chain.
3418 * The bi_next chain must be in order.
3420 static int add_stripe_bio(struct stripe_head *sh, struct bio *bi, int dd_idx,
3421 int forwrite, int previous)
3424 struct r5conf *conf = sh->raid_conf;
3427 pr_debug("adding bi b#%llu to stripe s#%llu\n",
3428 (unsigned long long)bi->bi_iter.bi_sector,
3429 (unsigned long long)sh->sector);
3431 spin_lock_irq(&sh->stripe_lock);
3432	/* Don't allow new IO to be added to stripes in the batch list */
3436 bip = &sh->dev[dd_idx].towrite;
3440 bip = &sh->dev[dd_idx].toread;
3441 while (*bip && (*bip)->bi_iter.bi_sector < bi->bi_iter.bi_sector) {
3442 if (bio_end_sector(*bip) > bi->bi_iter.bi_sector)
3444 bip = & (*bip)->bi_next;
3446 if (*bip && (*bip)->bi_iter.bi_sector < bio_end_sector(bi))
3449 if (forwrite && raid5_has_ppl(conf)) {
3451 * With PPL only writes to consecutive data chunks within a
3452 * stripe are allowed because for a single stripe_head we can
3453 * only have one PPL entry at a time, which describes one data
3454 * range. Not really an overlap, but wait_for_overlap can be
3455 * used to handle this.
3463 for (i = 0; i < sh->disks; i++) {
3464 if (i != sh->pd_idx &&
3465 (i == dd_idx || sh->dev[i].towrite)) {
3466 sector = sh->dev[i].sector;
3467 if (count == 0 || sector < first)
3475 if (first + conf->chunk_sectors * (count - 1) != last)
3479 if (!forwrite || previous)
3480 clear_bit(STRIPE_BATCH_READY, &sh->state);
3482 BUG_ON(*bip && bi->bi_next && (*bip) != bi->bi_next);
3486 bio_inc_remaining(bi);
3487 md_write_inc(conf->mddev, bi);
3490 /* check if page is covered */
3491 sector_t sector = sh->dev[dd_idx].sector;
3492 for (bi=sh->dev[dd_idx].towrite;
3493 sector < sh->dev[dd_idx].sector + RAID5_STRIPE_SECTORS(conf) &&
3494 bi && bi->bi_iter.bi_sector <= sector;
3495 bi = r5_next_bio(conf, bi, sh->dev[dd_idx].sector)) {
3496 if (bio_end_sector(bi) >= sector)
3497 sector = bio_end_sector(bi);
3499 if (sector >= sh->dev[dd_idx].sector + RAID5_STRIPE_SECTORS(conf))
3500 if (!test_and_set_bit(R5_OVERWRITE, &sh->dev[dd_idx].flags))
3501 sh->overwrite_disks++;
3504 pr_debug("added bi b#%llu to stripe s#%llu, disk %d.\n",
3505 (unsigned long long)(*bip)->bi_iter.bi_sector,
3506 (unsigned long long)sh->sector, dd_idx);
3508 if (conf->mddev->bitmap && firstwrite) {
3509 /* Cannot hold spinlock over bitmap_startwrite,
3510 * but must ensure this isn't added to a batch until
3511 * we have added to the bitmap and set bm_seq.
3512 * So set STRIPE_BITMAP_PENDING to prevent
3514 * If multiple add_stripe_bio() calls race here they
3515		 * must all set STRIPE_BITMAP_PENDING. So only the first one
3516 * to complete "bitmap_startwrite" gets to set
3517 * STRIPE_BIT_DELAY. This is important as once a stripe
3518 * is added to a batch, STRIPE_BIT_DELAY cannot be changed
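		 * The sequence below therefore is: set STRIPE_BITMAP_PENDING,
		 * drop stripe_lock, call md_bitmap_startwrite(), retake the
		 * lock, clear the pending bit, and, if the stripe has not been
		 * batched in the meantime, record bm_seq and set
		 * STRIPE_BIT_DELAY.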
3521 set_bit(STRIPE_BITMAP_PENDING, &sh->state);
3522 spin_unlock_irq(&sh->stripe_lock);
3523 md_bitmap_startwrite(conf->mddev->bitmap, sh->sector,
3524 RAID5_STRIPE_SECTORS(conf), 0);
3525 spin_lock_irq(&sh->stripe_lock);
3526 clear_bit(STRIPE_BITMAP_PENDING, &sh->state);
3527 if (!sh->batch_head) {
3528 sh->bm_seq = conf->seq_flush+1;
3529 set_bit(STRIPE_BIT_DELAY, &sh->state);
3532 spin_unlock_irq(&sh->stripe_lock);
3534 if (stripe_can_batch(sh))
3535 stripe_add_to_batch_list(conf, sh);
3539 set_bit(R5_Overlap, &sh->dev[dd_idx].flags);
3540 spin_unlock_irq(&sh->stripe_lock);
3544 static void end_reshape(struct r5conf *conf);
3546 static void stripe_set_idx(sector_t stripe, struct r5conf *conf, int previous,
3547 struct stripe_head *sh)
3549 int sectors_per_chunk =
3550 previous ? conf->prev_chunk_sectors : conf->chunk_sectors;
3552 int chunk_offset = sector_div(stripe, sectors_per_chunk);
3553 int disks = previous ? conf->previous_raid_disks : conf->raid_disks;
3555 raid5_compute_sector(conf,
3556 stripe * (disks - conf->max_degraded)
3557 *sectors_per_chunk + chunk_offset,
3563 handle_failed_stripe(struct r5conf *conf, struct stripe_head *sh,
3564 struct stripe_head_state *s, int disks)
3567 BUG_ON(sh->batch_head);
3568 for (i = disks; i--; ) {
3572 if (test_bit(R5_ReadError, &sh->dev[i].flags)) {
3573 struct md_rdev *rdev;
3575 rdev = rcu_dereference(conf->disks[i].rdev);
3576 if (rdev && test_bit(In_sync, &rdev->flags) &&
3577 !test_bit(Faulty, &rdev->flags))
3578 atomic_inc(&rdev->nr_pending);
3583 if (!rdev_set_badblocks(
3586 RAID5_STRIPE_SECTORS(conf), 0))
3587 md_error(conf->mddev, rdev);
3588 rdev_dec_pending(rdev, conf->mddev);
3591 spin_lock_irq(&sh->stripe_lock);
3592 /* fail all writes first */
3593 bi = sh->dev[i].towrite;
3594 sh->dev[i].towrite = NULL;
3595 sh->overwrite_disks = 0;
3596 spin_unlock_irq(&sh->stripe_lock);
3600 log_stripe_write_finished(sh);
3602 if (test_and_clear_bit(R5_Overlap, &sh->dev[i].flags))
3603 wake_up(&conf->wait_for_overlap);
3605 while (bi && bi->bi_iter.bi_sector <
3606 sh->dev[i].sector + RAID5_STRIPE_SECTORS(conf)) {
3607 struct bio *nextbi = r5_next_bio(conf, bi, sh->dev[i].sector);
3609 md_write_end(conf->mddev);
3614 md_bitmap_endwrite(conf->mddev->bitmap, sh->sector,
3615 RAID5_STRIPE_SECTORS(conf), 0, 0);
3617 /* and fail all 'written' */
3618 bi = sh->dev[i].written;
3619 sh->dev[i].written = NULL;
3620 if (test_and_clear_bit(R5_SkipCopy, &sh->dev[i].flags)) {
3621 WARN_ON(test_bit(R5_UPTODATE, &sh->dev[i].flags));
3622 sh->dev[i].page = sh->dev[i].orig_page;
3625 if (bi) bitmap_end = 1;
3626 while (bi && bi->bi_iter.bi_sector <
3627 sh->dev[i].sector + RAID5_STRIPE_SECTORS(conf)) {
3628 struct bio *bi2 = r5_next_bio(conf, bi, sh->dev[i].sector);
3630 md_write_end(conf->mddev);
3635 /* fail any reads if this device is non-operational and
3636 * the data has not reached the cache yet.
3638 if (!test_bit(R5_Wantfill, &sh->dev[i].flags) &&
3639 s->failed > conf->max_degraded &&
3640 (!test_bit(R5_Insync, &sh->dev[i].flags) ||
3641 test_bit(R5_ReadError, &sh->dev[i].flags))) {
3642 spin_lock_irq(&sh->stripe_lock);
3643 bi = sh->dev[i].toread;
3644 sh->dev[i].toread = NULL;
3645 spin_unlock_irq(&sh->stripe_lock);
3646 if (test_and_clear_bit(R5_Overlap, &sh->dev[i].flags))
3647 wake_up(&conf->wait_for_overlap);
3650 while (bi && bi->bi_iter.bi_sector <
3651 sh->dev[i].sector + RAID5_STRIPE_SECTORS(conf)) {
3652 struct bio *nextbi =
3653 r5_next_bio(conf, bi, sh->dev[i].sector);
3660 md_bitmap_endwrite(conf->mddev->bitmap, sh->sector,
3661 RAID5_STRIPE_SECTORS(conf), 0, 0);
3662 /* If we were in the middle of a write the parity block might
3663 * still be locked - so just clear all R5_LOCKED flags
3665 clear_bit(R5_LOCKED, &sh->dev[i].flags);
3670 if (test_and_clear_bit(STRIPE_FULL_WRITE, &sh->state))
3671 if (atomic_dec_and_test(&conf->pending_full_writes))
3672 md_wakeup_thread(conf->mddev->thread);
3676 handle_failed_sync(struct r5conf *conf, struct stripe_head *sh,
3677 struct stripe_head_state *s)
3682 BUG_ON(sh->batch_head);
3683 clear_bit(STRIPE_SYNCING, &sh->state);
3684 if (test_and_clear_bit(R5_Overlap, &sh->dev[sh->pd_idx].flags))
3685 wake_up(&conf->wait_for_overlap);
3688 /* There is nothing more to do for sync/check/repair.
3689 * Don't even need to abort as that is handled elsewhere
3690 * if needed, and not always wanted e.g. if there is a known
3692 * For recover/replace we need to record a bad block on all
3693 * non-sync devices, or abort the recovery
3695 if (test_bit(MD_RECOVERY_RECOVER, &conf->mddev->recovery)) {
3696 /* During recovery devices cannot be removed, so
3697 * locking and refcounting of rdevs is not needed
3700 for (i = 0; i < conf->raid_disks; i++) {
3701 struct md_rdev *rdev = rcu_dereference(conf->disks[i].rdev);
3703 && !test_bit(Faulty, &rdev->flags)
3704 && !test_bit(In_sync, &rdev->flags)
3705 && !rdev_set_badblocks(rdev, sh->sector,
3706 RAID5_STRIPE_SECTORS(conf), 0))
3708 rdev = rcu_dereference(conf->disks[i].replacement);
3710 && !test_bit(Faulty, &rdev->flags)
3711 && !test_bit(In_sync, &rdev->flags)
3712 && !rdev_set_badblocks(rdev, sh->sector,
3713 RAID5_STRIPE_SECTORS(conf), 0))
3718 conf->recovery_disabled =
3719 conf->mddev->recovery_disabled;
3721 md_done_sync(conf->mddev, RAID5_STRIPE_SECTORS(conf), !abort);
3724 static int want_replace(struct stripe_head *sh, int disk_idx)
3726 struct md_rdev *rdev;
3730 rdev = rcu_dereference(sh->raid_conf->disks[disk_idx].replacement);
3732 && !test_bit(Faulty, &rdev->flags)
3733 && !test_bit(In_sync, &rdev->flags)
3734 && (rdev->recovery_offset <= sh->sector
3735 || rdev->mddev->recovery_cp <= sh->sector))
3741 static int need_this_block(struct stripe_head *sh, struct stripe_head_state *s,
3742 int disk_idx, int disks)
3744 struct r5dev *dev = &sh->dev[disk_idx];
3745 struct r5dev *fdev[2] = { &sh->dev[s->failed_num[0]],
3746 &sh->dev[s->failed_num[1]] };
3748 bool force_rcw = (sh->raid_conf->rmw_level == PARITY_DISABLE_RMW);
3751 if (test_bit(R5_LOCKED, &dev->flags) ||
3752 test_bit(R5_UPTODATE, &dev->flags))
3753 /* No point reading this as we already have it or have
3754 * decided to get it.
3759 (dev->towrite && !test_bit(R5_OVERWRITE, &dev->flags)))
3760 /* We need this block to directly satisfy a request */
3763 if (s->syncing || s->expanding ||
3764 (s->replacing && want_replace(sh, disk_idx)))
3765		 * When syncing or expanding, we read everything.
3766 * When replacing, we need the replaced block.
3770 if ((s->failed >= 1 && fdev[0]->toread) ||
3771 (s->failed >= 2 && fdev[1]->toread))
3772 /* If we want to read from a failed device, then
3773 * we need to actually read every other device.
3777 /* Sometimes neither read-modify-write nor reconstruct-write
3778 * cycles can work. In those cases we read every block we
3779 * can. Then the parity-update is certain to have enough to
3781 * This can only be a problem when we need to write something,
3782 * and some device has failed. If either of those tests
3783	 * fails, we need look no further.
3785 if (!s->failed || !s->to_write)
3788 if (test_bit(R5_Insync, &dev->flags) &&
3789 !test_bit(STRIPE_PREREAD_ACTIVE, &sh->state))
3790		/* Pre-reads are not permitted until after a short delay
3791		 * to gather multiple requests. However, if this
3792		 * device is not Insync, the block could only be computed
3793		 * and there is no need to delay that.
3797 for (i = 0; i < s->failed && i < 2; i++) {
3798 if (fdev[i]->towrite &&
3799 !test_bit(R5_UPTODATE, &fdev[i]->flags) &&
3800 !test_bit(R5_OVERWRITE, &fdev[i]->flags))
3801 /* If we have a partial write to a failed
3802 * device, then we will need to reconstruct
3803 * the content of that device, so all other
3804 * devices must be read.
3808 if (s->failed >= 2 &&
3809 (fdev[i]->towrite ||
3810 s->failed_num[i] == sh->pd_idx ||
3811 s->failed_num[i] == sh->qd_idx) &&
3812 !test_bit(R5_UPTODATE, &fdev[i]->flags))
3813			/* In max degraded raid6, if the failed disk is P, Q,
3814 * or we want to read the failed disk, we need to do
3815 * reconstruct-write.
3820 /* If we are forced to do a reconstruct-write, because parity
3821 * cannot be trusted and we are currently recovering it, there
3822 * is extra need to be careful.
3823 * If one of the devices that we would need to read, because
3824 * it is not being overwritten (and maybe not written at all)
3825 * is missing/faulty, then we need to read everything we can.
3828 sh->sector < sh->raid_conf->mddev->recovery_cp)
3829 /* reconstruct-write isn't being forced */
3831 for (i = 0; i < s->failed && i < 2; i++) {
3832 if (s->failed_num[i] != sh->pd_idx &&
3833 s->failed_num[i] != sh->qd_idx &&
3834 !test_bit(R5_UPTODATE, &fdev[i]->flags) &&
3835 !test_bit(R5_OVERWRITE, &fdev[i]->flags))
3842 /* fetch_block - checks the given member device to see if its data needs
3843 * to be read or computed to satisfy a request.
3845 * Returns 1 when no more member devices need to be checked, otherwise returns
3846 * 0 to tell the loop in handle_stripe_fill to continue
3848 static int fetch_block(struct stripe_head *sh, struct stripe_head_state *s,
3849 int disk_idx, int disks)
3851 struct r5dev *dev = &sh->dev[disk_idx];
3853 /* is the data in this block needed, and can we get it? */
3854 if (need_this_block(sh, s, disk_idx, disks)) {
3855 /* we would like to get this block, possibly by computing it,
3856 * otherwise read it if the backing disk is insync
3858 BUG_ON(test_bit(R5_Wantcompute, &dev->flags));
3859 BUG_ON(test_bit(R5_Wantread, &dev->flags));
3860 BUG_ON(sh->batch_head);
3863 * In the raid6 case if the only non-uptodate disk is P
3864 * then we already trusted P to compute the other failed
3865 * drives. It is safe to compute rather than re-read P.
3866 * In other cases we only compute blocks from failed
3867 * devices, otherwise check/repair might fail to detect
3868 * a real inconsistency.
3871 if ((s->uptodate == disks - 1) &&
3872 ((sh->qd_idx >= 0 && sh->pd_idx == disk_idx) ||
3873 (s->failed && (disk_idx == s->failed_num[0] ||
3874 disk_idx == s->failed_num[1])))) {
3875 /* have disk failed, and we're requested to fetch it;
3878 pr_debug("Computing stripe %llu block %d\n",
3879 (unsigned long long)sh->sector, disk_idx);
3880 set_bit(STRIPE_COMPUTE_RUN, &sh->state);
3881 set_bit(STRIPE_OP_COMPUTE_BLK, &s->ops_request);
3882 set_bit(R5_Wantcompute, &dev->flags);
3883 sh->ops.target = disk_idx;
3884 sh->ops.target2 = -1; /* no 2nd target */
3886 /* Careful: from this point on 'uptodate' is in the eye
3887 * of raid_run_ops which services 'compute' operations
3888 * before writes. R5_Wantcompute flags a block that will
3889 * be R5_UPTODATE by the time it is needed for a
3890 * subsequent operation.
3894 } else if (s->uptodate == disks-2 && s->failed >= 2) {
3895 /* Computing 2-failure is *very* expensive; only
3896 * do it if failed >= 2
3899 for (other = disks; other--; ) {
3900 if (other == disk_idx)
3902 if (!test_bit(R5_UPTODATE,
3903 &sh->dev[other].flags))
3907 pr_debug("Computing stripe %llu blocks %d,%d\n",
3908 (unsigned long long)sh->sector,
3910 set_bit(STRIPE_COMPUTE_RUN, &sh->state);
3911 set_bit(STRIPE_OP_COMPUTE_BLK, &s->ops_request);
3912 set_bit(R5_Wantcompute, &sh->dev[disk_idx].flags);
3913 set_bit(R5_Wantcompute, &sh->dev[other].flags);
3914 sh->ops.target = disk_idx;
3915 sh->ops.target2 = other;
3919 } else if (test_bit(R5_Insync, &dev->flags)) {
3920 set_bit(R5_LOCKED, &dev->flags);
3921 set_bit(R5_Wantread, &dev->flags);
3923 pr_debug("Reading block %d (sync=%d)\n",
3924 disk_idx, s->syncing);
3932 * handle_stripe_fill - read or compute data to satisfy pending requests.
3934 static void handle_stripe_fill(struct stripe_head *sh,
3935 struct stripe_head_state *s,
3940 /* look for blocks to read/compute, skip this if a compute
3941 * is already in flight, or if the stripe contents are in the
3942 * midst of changing due to a write
3944 if (!test_bit(STRIPE_COMPUTE_RUN, &sh->state) && !sh->check_state &&
3945 !sh->reconstruct_state) {
3948 * For degraded stripe with data in journal, do not handle
3949 * read requests yet, instead, flush the stripe to raid
3950 * disks first, this avoids handling complex rmw of write
3951 * back cache (prexor with orig_page, and then xor with
3952 * page) in the read path
3954 if (s->injournal && s->failed) {
3955 if (test_bit(STRIPE_R5C_CACHING, &sh->state))
3956 r5c_make_stripe_write_out(sh);
3960 for (i = disks; i--; )
3961 if (fetch_block(sh, s, i, disks))
3965 set_bit(STRIPE_HANDLE, &sh->state);
3968 static void break_stripe_batch_list(struct stripe_head *head_sh,
3969 unsigned long handle_flags);
3970 /* handle_stripe_clean_event
3971 * any written block on an uptodate or failed drive can be returned.
3972 * Note that if we 'wrote' to a failed drive, it will be UPTODATE, but
3973 * never LOCKED, so we don't need to test 'failed' directly.
3975 static void handle_stripe_clean_event(struct r5conf *conf,
3976 struct stripe_head *sh, int disks)
3980 int discard_pending = 0;
3981 struct stripe_head *head_sh = sh;
3982 bool do_endio = false;
3984 for (i = disks; i--; )
3985 if (sh->dev[i].written) {
3987 if (!test_bit(R5_LOCKED, &dev->flags) &&
3988 (test_bit(R5_UPTODATE, &dev->flags) ||
3989 test_bit(R5_Discard, &dev->flags) ||
3990 test_bit(R5_SkipCopy, &dev->flags))) {
3991 /* We can return any write requests */
3992 struct bio *wbi, *wbi2;
3993 pr_debug("Return write for disc %d\n", i);
3994 if (test_and_clear_bit(R5_Discard, &dev->flags))
3995 clear_bit(R5_UPTODATE, &dev->flags);
3996 if (test_and_clear_bit(R5_SkipCopy, &dev->flags)) {
3997 WARN_ON(test_bit(R5_UPTODATE, &dev->flags));
4002 dev->page = dev->orig_page;
4004 dev->written = NULL;
4005 while (wbi && wbi->bi_iter.bi_sector <
4006 dev->sector + RAID5_STRIPE_SECTORS(conf)) {
4007 wbi2 = r5_next_bio(conf, wbi, dev->sector);
4008 md_write_end(conf->mddev);
4012 md_bitmap_endwrite(conf->mddev->bitmap, sh->sector,
4013 RAID5_STRIPE_SECTORS(conf),
4014 !test_bit(STRIPE_DEGRADED, &sh->state),
4016 if (head_sh->batch_head) {
4017 sh = list_first_entry(&sh->batch_list,
4020 if (sh != head_sh) {
4027 } else if (test_bit(R5_Discard, &dev->flags))
4028 discard_pending = 1;
4031 log_stripe_write_finished(sh);
4033 if (!discard_pending &&
4034 test_bit(R5_Discard, &sh->dev[sh->pd_idx].flags)) {
4036 clear_bit(R5_Discard, &sh->dev[sh->pd_idx].flags);
4037 clear_bit(R5_UPTODATE, &sh->dev[sh->pd_idx].flags);
4038 if (sh->qd_idx >= 0) {
4039 clear_bit(R5_Discard, &sh->dev[sh->qd_idx].flags);
4040 clear_bit(R5_UPTODATE, &sh->dev[sh->qd_idx].flags);
4042 /* now that discard is done we can proceed with any sync */
4043 clear_bit(STRIPE_DISCARD, &sh->state);
4045 * SCSI discard will change some bio fields and the stripe has
4046 * no updated data, so remove it from hash list and the stripe
4047 * will be reinitialized
4050 hash = sh->hash_lock_index;
4051 spin_lock_irq(conf->hash_locks + hash);
4053 spin_unlock_irq(conf->hash_locks + hash);
4054 if (head_sh->batch_head) {
4055 sh = list_first_entry(&sh->batch_list,
4056 struct stripe_head, batch_list);
4062 if (test_bit(STRIPE_SYNC_REQUESTED, &sh->state))
4063 set_bit(STRIPE_HANDLE, &sh->state);
4067 if (test_and_clear_bit(STRIPE_FULL_WRITE, &sh->state))
4068 if (atomic_dec_and_test(&conf->pending_full_writes))
4069 md_wakeup_thread(conf->mddev->thread);
4071 if (head_sh->batch_head && do_endio)
4072 break_stripe_batch_list(head_sh, STRIPE_EXPAND_SYNC_FLAGS);
4076 * For RMW in write back cache, we need extra page in prexor to store the
4077 * old data. This page is stored in dev->orig_page.
4079 * This function checks whether we have data for prexor. The exact logic
4081 * R5_UPTODATE && (!R5_InJournal || R5_OrigPageUPTDODATE)
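 *
 * i.e. a device can serve as a prexor source when its data is up to date
 * and, if that data lives in the journal, the old copy in orig_page has
 * also been read back (R5_OrigPageUPTDODATE).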
4083 static inline bool uptodate_for_rmw(struct r5dev *dev)
4085 return (test_bit(R5_UPTODATE, &dev->flags)) &&
4086 (!test_bit(R5_InJournal, &dev->flags) ||
4087 test_bit(R5_OrigPageUPTDODATE, &dev->flags));
4090 static int handle_stripe_dirtying(struct r5conf *conf,
4091 struct stripe_head *sh,
4092 struct stripe_head_state *s,
4095 int rmw = 0, rcw = 0, i;
4096 sector_t recovery_cp = conf->mddev->recovery_cp;
4098 /* Check whether resync is now happening or should start.
4099 * If yes, then the array is dirty (after unclean shutdown or
4100 * initial creation), so parity in some stripes might be inconsistent.
4101 * In this case, we need to always do reconstruct-write, to ensure
4102 * that in case of drive failure or read-error correction, we
4103 * generate correct data from the parity.
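	 *
	 * Rough cost illustration (numbers assumed purely for illustration):
	 * for an 8-device RAID5 stripe where only 2 data blocks are fully
	 * overwritten, read-modify-write has to read the 2 old data blocks
	 * plus parity (rmw counts 3), while reconstruct-write has to read the
	 * 5 untouched data blocks (rcw counts 5), so the loops below would
	 * prefer rmw; the balance flips as more of the stripe is overwritten.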
4105 if (conf->rmw_level == PARITY_DISABLE_RMW ||
4106 (recovery_cp < MaxSector && sh->sector >= recovery_cp &&
4108 /* Calculate the real rcw later - for now make it
4109 * look like rcw is cheaper
4112 pr_debug("force RCW rmw_level=%u, recovery_cp=%llu sh->sector=%llu\n",
4113 conf->rmw_level, (unsigned long long)recovery_cp,
4114 (unsigned long long)sh->sector);
4115 } else for (i = disks; i--; ) {
4116 /* would I have to read this buffer for read_modify_write */
4117 struct r5dev *dev = &sh->dev[i];
4118 if (((dev->towrite && !delay_towrite(conf, dev, s)) ||
4119 i == sh->pd_idx || i == sh->qd_idx ||
4120 test_bit(R5_InJournal, &dev->flags)) &&
4121 !test_bit(R5_LOCKED, &dev->flags) &&
4122 !(uptodate_for_rmw(dev) ||
4123 test_bit(R5_Wantcompute, &dev->flags))) {
4124 if (test_bit(R5_Insync, &dev->flags))
4127 rmw += 2*disks; /* cannot read it */
4129 /* Would I have to read this buffer for reconstruct_write */
4130 if (!test_bit(R5_OVERWRITE, &dev->flags) &&
4131 i != sh->pd_idx && i != sh->qd_idx &&
4132 !test_bit(R5_LOCKED, &dev->flags) &&
4133 !(test_bit(R5_UPTODATE, &dev->flags) ||
4134 test_bit(R5_Wantcompute, &dev->flags))) {
4135 if (test_bit(R5_Insync, &dev->flags))
4142 pr_debug("for sector %llu state 0x%lx, rmw=%d rcw=%d\n",
4143 (unsigned long long)sh->sector, sh->state, rmw, rcw);
4144 set_bit(STRIPE_HANDLE, &sh->state);
4145 if ((rmw < rcw || (rmw == rcw && conf->rmw_level == PARITY_PREFER_RMW)) && rmw > 0) {
4146 /* prefer read-modify-write, but need to get some data */
4147 if (conf->mddev->queue)
4148 blk_add_trace_msg(conf->mddev->queue,
4149 "raid5 rmw %llu %d",
4150 (unsigned long long)sh->sector, rmw);
4151 for (i = disks; i--; ) {
4152 struct r5dev *dev = &sh->dev[i];
4153 if (test_bit(R5_InJournal, &dev->flags) &&
4154 dev->page == dev->orig_page &&
4155 !test_bit(R5_LOCKED, &sh->dev[sh->pd_idx].flags)) {
4156 /* alloc page for prexor */
4157 struct page *p = alloc_page(GFP_NOIO);
4165					 * alloc_page() failed, try to use
4166 * disk_info->extra_page
4168 if (!test_and_set_bit(R5C_EXTRA_PAGE_IN_USE,
4169 &conf->cache_state)) {
4170 r5c_use_extra_page(sh);
4174 /* extra_page in use, add to delayed_list */
4175 set_bit(STRIPE_DELAYED, &sh->state);
4176 s->waiting_extra_page = 1;
4181 for (i = disks; i--; ) {
4182 struct r5dev *dev = &sh->dev[i];
4183 if (((dev->towrite && !delay_towrite(conf, dev, s)) ||
4184 i == sh->pd_idx || i == sh->qd_idx ||
4185 test_bit(R5_InJournal, &dev->flags)) &&
4186 !test_bit(R5_LOCKED, &dev->flags) &&
4187 !(uptodate_for_rmw(dev) ||
4188 test_bit(R5_Wantcompute, &dev->flags)) &&
4189 test_bit(R5_Insync, &dev->flags)) {
4190 if (test_bit(STRIPE_PREREAD_ACTIVE,
4192 pr_debug("Read_old block %d for r-m-w\n",
4194 set_bit(R5_LOCKED, &dev->flags);
4195 set_bit(R5_Wantread, &dev->flags);
4198 set_bit(STRIPE_DELAYED, &sh->state);
4202 if ((rcw < rmw || (rcw == rmw && conf->rmw_level != PARITY_PREFER_RMW)) && rcw > 0) {
4203 /* want reconstruct write, but need to get some data */
4206 for (i = disks; i--; ) {
4207 struct r5dev *dev = &sh->dev[i];
4208 if (!test_bit(R5_OVERWRITE, &dev->flags) &&
4209 i != sh->pd_idx && i != sh->qd_idx &&
4210 !test_bit(R5_LOCKED, &dev->flags) &&
4211 !(test_bit(R5_UPTODATE, &dev->flags) ||
4212 test_bit(R5_Wantcompute, &dev->flags))) {
4214 if (test_bit(R5_Insync, &dev->flags) &&
4215 test_bit(STRIPE_PREREAD_ACTIVE,
4217 pr_debug("Read_old block "
4218 "%d for Reconstruct\n", i);
4219 set_bit(R5_LOCKED, &dev->flags);
4220 set_bit(R5_Wantread, &dev->flags);
4224 set_bit(STRIPE_DELAYED, &sh->state);
4227 if (rcw && conf->mddev->queue)
4228 blk_add_trace_msg(conf->mddev->queue, "raid5 rcw %llu %d %d %d",
4229 (unsigned long long)sh->sector,
4230 rcw, qread, test_bit(STRIPE_DELAYED, &sh->state));
4233 if (rcw > disks && rmw > disks &&
4234 !test_bit(STRIPE_PREREAD_ACTIVE, &sh->state))
4235 set_bit(STRIPE_DELAYED, &sh->state);
4237 /* now if nothing is locked, and if we have enough data,
4238 * we can start a write request
4240 /* since handle_stripe can be called at any time we need to handle the
4241 * case where a compute block operation has been submitted and then a
4242 * subsequent call wants to start a write request. raid_run_ops only
4243 * handles the case where compute block and reconstruct are requested
4244 * simultaneously. If this is not the case then new writes need to be
4245 * held off until the compute completes.
4247 if ((s->req_compute || !test_bit(STRIPE_COMPUTE_RUN, &sh->state)) &&
4248 (s->locked == 0 && (rcw == 0 || rmw == 0) &&
4249 !test_bit(STRIPE_BIT_DELAY, &sh->state)))
4250 schedule_reconstruction(sh, s, rcw == 0, 0);
4254 static void handle_parity_checks5(struct r5conf *conf, struct stripe_head *sh,
4255 struct stripe_head_state *s, int disks)
4257 struct r5dev *dev = NULL;
4259 BUG_ON(sh->batch_head);
4260 set_bit(STRIPE_HANDLE, &sh->state);
4262 switch (sh->check_state) {
4263 case check_state_idle:
4264 /* start a new check operation if there are no failures */
4265 if (s->failed == 0) {
4266 BUG_ON(s->uptodate != disks);
4267 sh->check_state = check_state_run;
4268 set_bit(STRIPE_OP_CHECK, &s->ops_request);
4269 clear_bit(R5_UPTODATE, &sh->dev[sh->pd_idx].flags);
4273 dev = &sh->dev[s->failed_num[0]];
4275 case check_state_compute_result:
4276 sh->check_state = check_state_idle;
4278 dev = &sh->dev[sh->pd_idx];
4280 /* check that a write has not made the stripe insync */
4281 if (test_bit(STRIPE_INSYNC, &sh->state))
4284 /* either failed parity check, or recovery is happening */
4285 BUG_ON(!test_bit(R5_UPTODATE, &dev->flags));
4286 BUG_ON(s->uptodate != disks);
4288 set_bit(R5_LOCKED, &dev->flags);
4290 set_bit(R5_Wantwrite, &dev->flags);
4292 clear_bit(STRIPE_DEGRADED, &sh->state);
4293 set_bit(STRIPE_INSYNC, &sh->state);
4295 case check_state_run:
4296 break; /* we will be called again upon completion */
4297 case check_state_check_result:
4298 sh->check_state = check_state_idle;
4300 /* if a failure occurred during the check operation, leave
4301 * STRIPE_INSYNC not set and let the stripe be handled again
4306 /* handle a successful check operation, if parity is correct
4307 * we are done. Otherwise update the mismatch count and repair
4308 * parity if !MD_RECOVERY_CHECK
4310 if ((sh->ops.zero_sum_result & SUM_CHECK_P_RESULT) == 0)
4311 /* parity is correct (on disc,
4312 * not in buffer any more)
4314 set_bit(STRIPE_INSYNC, &sh->state);
4316 atomic64_add(RAID5_STRIPE_SECTORS(conf), &conf->mddev->resync_mismatches);
4317 if (test_bit(MD_RECOVERY_CHECK, &conf->mddev->recovery)) {
4318 /* don't try to repair!! */
4319 set_bit(STRIPE_INSYNC, &sh->state);
4320 pr_warn_ratelimited("%s: mismatch sector in range %llu-%llu\n",
4321 mdname(conf->mddev),
4322 (unsigned long long) sh->sector,
4323 (unsigned long long) sh->sector + RAID5_STRIPE_SECTORS(conf));
4326 sh->check_state = check_state_compute_run;
4327 set_bit(STRIPE_COMPUTE_RUN, &sh->state);
4328 set_bit(STRIPE_OP_COMPUTE_BLK, &s->ops_request);
4329 set_bit(R5_Wantcompute,
4330 &sh->dev[sh->pd_idx].flags);
4331 sh->ops.target = sh->pd_idx;
4332 sh->ops.target2 = -1;
4337 case check_state_compute_run:
4340 pr_err("%s: unknown check_state: %d sector: %llu\n",
4341 __func__, sh->check_state,
4342 (unsigned long long) sh->sector);
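/*
 * Not part of the driver -- a condensed view of the check_state machine
 * walked by handle_parity_checks5() above for a fully uptodate stripe:
 *
 *	check_state_idle           -> queue STRIPE_OP_CHECK        (-> run)
 *	check_state_run            -> async xor zero-sum completes (-> check_result)
 *	check_state_check_result   -> parity ok: set STRIPE_INSYNC
 *	                              mismatch:  count it in resync_mismatches;
 *	                                         unless MD_RECOVERY_CHECK,
 *	                                         recompute parity  (-> compute_run)
 *	check_state_compute_run    -> compute completes            (-> compute_result)
 *	check_state_compute_result -> write the repaired parity block back
 */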
4347 static void handle_parity_checks6(struct r5conf *conf, struct stripe_head *sh,
4348 struct stripe_head_state *s,
4351 int pd_idx = sh->pd_idx;
4352 int qd_idx = sh->qd_idx;
4355 BUG_ON(sh->batch_head);
4356 set_bit(STRIPE_HANDLE, &sh->state);
4358 BUG_ON(s->failed > 2);
4360 /* Want to check and possibly repair P and Q.
4361 * However there could be one 'failed' device, in which
4362 * case we can only check one of them, possibly using the
4363 * other to generate missing data
4366 switch (sh->check_state) {
4367 case check_state_idle:
4368 /* start a new check operation if there are < 2 failures */
4369 if (s->failed == s->q_failed) {
4370 /* The only possible failed device holds Q, so it
4371 * makes sense to check P (if anything else had failed,
4372 * we would have used P to recreate it).
4374 sh->check_state = check_state_run;
4376 if (!s->q_failed && s->failed < 2) {
4377 /* Q is not failed, and we didn't use it to generate
4378 * anything, so it makes sense to check it
4380 if (sh->check_state == check_state_run)
4381 sh->check_state = check_state_run_pq;
4383 sh->check_state = check_state_run_q;
4386 /* discard potentially stale zero_sum_result */
4387 sh->ops.zero_sum_result = 0;
4389 if (sh->check_state == check_state_run) {
4390 /* async_xor_zero_sum destroys the contents of P */
4391 clear_bit(R5_UPTODATE, &sh->dev[pd_idx].flags);
4394 if (sh->check_state >= check_state_run &&
4395 sh->check_state <= check_state_run_pq) {
4396 /* async_syndrome_zero_sum preserves P and Q, so
4397 * no need to mark them !uptodate here
4399 set_bit(STRIPE_OP_CHECK, &s->ops_request);
4403 /* we have 2-disk failure */
4404 BUG_ON(s->failed != 2);
4406 case check_state_compute_result:
4407 sh->check_state = check_state_idle;
4409 /* check that a write has not made the stripe insync */
4410 if (test_bit(STRIPE_INSYNC, &sh->state))
4413 /* now write out any block on a failed drive,
4414 * or P or Q if they were recomputed
4417 if (s->failed == 2) {
4418 dev = &sh->dev[s->failed_num[1]];
4420 set_bit(R5_LOCKED, &dev->flags);
4421 set_bit(R5_Wantwrite, &dev->flags);
4423 if (s->failed >= 1) {
4424 dev = &sh->dev[s->failed_num[0]];
4426 set_bit(R5_LOCKED, &dev->flags);
4427 set_bit(R5_Wantwrite, &dev->flags);
4429 if (sh->ops.zero_sum_result & SUM_CHECK_P_RESULT) {
4430 dev = &sh->dev[pd_idx];
4432 set_bit(R5_LOCKED, &dev->flags);
4433 set_bit(R5_Wantwrite, &dev->flags);
4435 if (sh->ops.zero_sum_result & SUM_CHECK_Q_RESULT) {
4436 dev = &sh->dev[qd_idx];
4438 set_bit(R5_LOCKED, &dev->flags);
4439 set_bit(R5_Wantwrite, &dev->flags);
4441 if (WARN_ONCE(dev && !test_bit(R5_UPTODATE, &dev->flags),
4442 "%s: disk%td not up to date\n",
4443 mdname(conf->mddev),
4444 dev - (struct r5dev *) &sh->dev)) {
4445 clear_bit(R5_LOCKED, &dev->flags);
4446 clear_bit(R5_Wantwrite, &dev->flags);
4449 clear_bit(STRIPE_DEGRADED, &sh->state);
4451 set_bit(STRIPE_INSYNC, &sh->state);
4453 case check_state_run:
4454 case check_state_run_q:
4455 case check_state_run_pq:
4456 break; /* we will be called again upon completion */
4457 case check_state_check_result:
4458 sh->check_state = check_state_idle;
4460 /* handle a successful check operation, if parity is correct
4461 * we are done. Otherwise update the mismatch count and repair
4462 * parity if !MD_RECOVERY_CHECK
4464 if (sh->ops.zero_sum_result == 0) {
4465 /* both parities are correct */
4467 set_bit(STRIPE_INSYNC, &sh->state);
4469 /* in contrast to the raid5 case we can validate
4470 * parity, but still have a failure to write
4473 sh->check_state = check_state_compute_result;
4474 /* Returning at this point means that we may go
4475 * off and bring p and/or q uptodate again so
4476 * we make sure to check zero_sum_result again
4477 * to verify if p or q need writeback
4481 atomic64_add(RAID5_STRIPE_SECTORS(conf), &conf->mddev->resync_mismatches);
4482 if (test_bit(MD_RECOVERY_CHECK, &conf->mddev->recovery)) {
4483 /* don't try to repair!! */
4484 set_bit(STRIPE_INSYNC, &sh->state);
4485 pr_warn_ratelimited("%s: mismatch sector in range %llu-%llu\n",
4486 mdname(conf->mddev),
4487 (unsigned long long) sh->sector,
4488 (unsigned long long) sh->sector + RAID5_STRIPE_SECTORS(conf));
4491 int *target = &sh->ops.target;
4493 sh->ops.target = -1;
4494 sh->ops.target2 = -1;
4495 sh->check_state = check_state_compute_run;
4496 set_bit(STRIPE_COMPUTE_RUN, &sh->state);
4497 set_bit(STRIPE_OP_COMPUTE_BLK, &s->ops_request);
4498 if (sh->ops.zero_sum_result & SUM_CHECK_P_RESULT) {
4499 set_bit(R5_Wantcompute,
4500 &sh->dev[pd_idx].flags);
4502 target = &sh->ops.target2;
4505 if (sh->ops.zero_sum_result & SUM_CHECK_Q_RESULT) {
4506 set_bit(R5_Wantcompute,
4507 &sh->dev[qd_idx].flags);
4514 case check_state_compute_run:
4517 pr_warn("%s: unknown check_state: %d sector: %llu\n",
4518 __func__, sh->check_state,
4519 (unsigned long long) sh->sector);
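/*
 * Not part of the driver -- how handle_parity_checks6() above picks what
 * to check, depending on which (if any) failed device is Q:
 *
 *	failed == 0               -> check_state_run_pq  (check P and Q)
 *	failed == 1, Q is failed  -> check_state_run     (check P only)
 *	failed == 1, Q is healthy -> check_state_run_q   (check Q only)
 *	failed == 2               -> nothing left to check; recompute instead
 *
 * SUM_CHECK_P_RESULT / SUM_CHECK_Q_RESULT in sh->ops.zero_sum_result then
 * indicate which parity block(s) must be recomputed and rewritten.
 */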
4524 static void handle_stripe_expansion(struct r5conf *conf, struct stripe_head *sh)
4528 /* We have read all the blocks in this stripe and now we need to
4529 * copy some of them into a target stripe for expand.
4531 struct dma_async_tx_descriptor *tx = NULL;
4532 BUG_ON(sh->batch_head);
4533 clear_bit(STRIPE_EXPAND_SOURCE, &sh->state);
4534 for (i = 0; i < sh->disks; i++)
4535 if (i != sh->pd_idx && i != sh->qd_idx) {
4537 struct stripe_head *sh2;
4538 struct async_submit_ctl submit;
4540 sector_t bn = raid5_compute_blocknr(sh, i, 1);
4541 sector_t s = raid5_compute_sector(conf, bn, 0,
4543 sh2 = raid5_get_active_stripe(conf, s, 0, 1, 1);
4545 /* so far only the early blocks of this stripe
4546 * have been requested. When later blocks
4547 * get requested, we will try again
4550 if (!test_bit(STRIPE_EXPANDING, &sh2->state) ||
4551 test_bit(R5_Expanded, &sh2->dev[dd_idx].flags)) {
4552 /* must have already done this block */
4553 raid5_release_stripe(sh2);
4557 /* place all the copies on one channel */
4558 init_async_submit(&submit, 0, tx, NULL, NULL, NULL);
4559 tx = async_memcpy(sh2->dev[dd_idx].page,
4560 sh->dev[i].page, sh2->dev[dd_idx].offset,
4561 sh->dev[i].offset, RAID5_STRIPE_SIZE(conf),
4564 set_bit(R5_Expanded, &sh2->dev[dd_idx].flags);
4565 set_bit(R5_UPTODATE, &sh2->dev[dd_idx].flags);
4566 for (j = 0; j < conf->raid_disks; j++)
4567 if (j != sh2->pd_idx &&
4569 !test_bit(R5_Expanded, &sh2->dev[j].flags))
4571 if (j == conf->raid_disks) {
4572 set_bit(STRIPE_EXPAND_READY, &sh2->state);
4573 set_bit(STRIPE_HANDLE, &sh2->state);
4575 raid5_release_stripe(sh2);
4578 /* done submitting copies, wait for them to complete */
4579 async_tx_quiesce(&tx);
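/*
 * Illustrative sketch (not driver code) of the remapping performed for
 * each data block copied above:
 *
 *	bn = raid5_compute_blocknr(sh, i, 1);	logical block number under
 *						the previous layout
 *	s  = raid5_compute_sector(conf, bn, 0, &dd_idx, NULL);
 *						same block under the new layout
 *
 * i.e. (old stripe, device) -> logical block -> (new stripe, device); the
 * page is then async_memcpy()d into the destination stripe and marked
 * R5_Expanded / R5_UPTODATE.
 */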
4583 * handle_stripe - do things to a stripe.
4585 * We lock the stripe by setting STRIPE_ACTIVE and then examine the
4586 * state of various bits to see what needs to be done.
4588 * return some read requests which now have data
4589 * return some write requests which are safely on storage
4590 * schedule a read on some buffers
4591 * schedule a write of some buffers
4592 * return confirmation of parity correctness
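/*
 * Rough, non-authoritative ordering of the main phases performed by
 * handle_stripe() further below:
 *
 *	analyse_stripe()            - gather per-device state into *s
 *	handle_failed_stripe/sync() - give up if too many devices are gone
 *	handle_stripe_clean_event() - retire writes whose parity is safe
 *	handle_stripe_fill()        - schedule any reads that are needed
 *	handle_stripe_dirtying()    - schedule rmw/rcw for new writes
 *	handle_parity_checks5/6()   - check (and maybe repair) parity
 *	raid_run_ops() / ops_run_io() - kick off the async ops and the I/O
 */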
4596 static void analyse_stripe(struct stripe_head *sh, struct stripe_head_state *s)
4598 struct r5conf *conf = sh->raid_conf;
4599 int disks = sh->disks;
4602 int do_recovery = 0;
4604 memset(s, 0, sizeof(*s));
4606 s->expanding = test_bit(STRIPE_EXPAND_SOURCE, &sh->state) && !sh->batch_head;
4607 s->expanded = test_bit(STRIPE_EXPAND_READY, &sh->state) && !sh->batch_head;
4608 s->failed_num[0] = -1;
4609 s->failed_num[1] = -1;
4610 s->log_failed = r5l_log_disk_error(conf);
4612 /* Now to look around and see what can be done */
4614 for (i=disks; i--; ) {
4615 struct md_rdev *rdev;
4622 pr_debug("check %d: state 0x%lx read %p write %p written %p\n",
4624 dev->toread, dev->towrite, dev->written);
4625 /* maybe we can reply to a read
4627 * new wantfill requests are only permitted while
4628 * ops_complete_biofill is guaranteed to be inactive
4630 if (test_bit(R5_UPTODATE, &dev->flags) && dev->toread &&
4631 !test_bit(STRIPE_BIOFILL_RUN, &sh->state))
4632 set_bit(R5_Wantfill, &dev->flags);
4634 /* now count some things */
4635 if (test_bit(R5_LOCKED, &dev->flags))
4637 if (test_bit(R5_UPTODATE, &dev->flags))
4639 if (test_bit(R5_Wantcompute, &dev->flags)) {
4641 BUG_ON(s->compute > 2);
4644 if (test_bit(R5_Wantfill, &dev->flags))
4646 else if (dev->toread)
4650 if (!test_bit(R5_OVERWRITE, &dev->flags))
4655 /* Prefer to use the replacement for reads, but only
4656 * if it is recovered enough and has no bad blocks.
4658 rdev = rcu_dereference(conf->disks[i].replacement);
4659 if (rdev && !test_bit(Faulty, &rdev->flags) &&
4660 rdev->recovery_offset >= sh->sector + RAID5_STRIPE_SECTORS(conf) &&
4661 !is_badblock(rdev, sh->sector, RAID5_STRIPE_SECTORS(conf),
4662 &first_bad, &bad_sectors))
4663 set_bit(R5_ReadRepl, &dev->flags);
4665 if (rdev && !test_bit(Faulty, &rdev->flags))
4666 set_bit(R5_NeedReplace, &dev->flags);
4668 clear_bit(R5_NeedReplace, &dev->flags);
4669 rdev = rcu_dereference(conf->disks[i].rdev);
4670 clear_bit(R5_ReadRepl, &dev->flags);
4672 if (rdev && test_bit(Faulty, &rdev->flags))
4675 is_bad = is_badblock(rdev, sh->sector, RAID5_STRIPE_SECTORS(conf),
4676 &first_bad, &bad_sectors);
4677 if (s->blocked_rdev == NULL
4678 && (test_bit(Blocked, &rdev->flags)
4681 set_bit(BlockedBadBlocks,
4683 s->blocked_rdev = rdev;
4684 atomic_inc(&rdev->nr_pending);
4687 clear_bit(R5_Insync, &dev->flags);
4691 /* also not in-sync */
4692 if (!test_bit(WriteErrorSeen, &rdev->flags) &&
4693 test_bit(R5_UPTODATE, &dev->flags)) {
4694 /* treat as in-sync, but with a read error
4695 * which we can now try to correct
4697 set_bit(R5_Insync, &dev->flags);
4698 set_bit(R5_ReadError, &dev->flags);
4700 } else if (test_bit(In_sync, &rdev->flags))
4701 set_bit(R5_Insync, &dev->flags);
4702 else if (sh->sector + RAID5_STRIPE_SECTORS(conf) <= rdev->recovery_offset)
4703 /* in sync if before recovery_offset */
4704 set_bit(R5_Insync, &dev->flags);
4705 else if (test_bit(R5_UPTODATE, &dev->flags) &&
4706 test_bit(R5_Expanded, &dev->flags))
4707 /* If we've reshaped into here, we assume it is Insync.
4708 * We will shortly update recovery_offset to make
4711 set_bit(R5_Insync, &dev->flags);
4713 if (test_bit(R5_WriteError, &dev->flags)) {
4714 /* This flag does not apply to '.replacement'
4715 * only to .rdev, so make sure to check that */
4716 struct md_rdev *rdev2 = rcu_dereference(
4717 conf->disks[i].rdev);
4719 clear_bit(R5_Insync, &dev->flags);
4720 if (rdev2 && !test_bit(Faulty, &rdev2->flags)) {
4721 s->handle_bad_blocks = 1;
4722 atomic_inc(&rdev2->nr_pending);
4724 clear_bit(R5_WriteError, &dev->flags);
4726 if (test_bit(R5_MadeGood, &dev->flags)) {
4727 /* This flag does not apply to '.replacement'
4728 * only to .rdev, so make sure to check that */
4729 struct md_rdev *rdev2 = rcu_dereference(
4730 conf->disks[i].rdev);
4731 if (rdev2 && !test_bit(Faulty, &rdev2->flags)) {
4732 s->handle_bad_blocks = 1;
4733 atomic_inc(&rdev2->nr_pending);
4735 clear_bit(R5_MadeGood, &dev->flags);
4737 if (test_bit(R5_MadeGoodRepl, &dev->flags)) {
4738 struct md_rdev *rdev2 = rcu_dereference(
4739 conf->disks[i].replacement);
4740 if (rdev2 && !test_bit(Faulty, &rdev2->flags)) {
4741 s->handle_bad_blocks = 1;
4742 atomic_inc(&rdev2->nr_pending);
4744 clear_bit(R5_MadeGoodRepl, &dev->flags);
4746 if (!test_bit(R5_Insync, &dev->flags)) {
4747 /* The ReadError flag will just be confusing now */
4748 clear_bit(R5_ReadError, &dev->flags);
4749 clear_bit(R5_ReWrite, &dev->flags);
4751 if (test_bit(R5_ReadError, &dev->flags))
4752 clear_bit(R5_Insync, &dev->flags);
4753 if (!test_bit(R5_Insync, &dev->flags)) {
4755 s->failed_num[s->failed] = i;
4757 if (rdev && !test_bit(Faulty, &rdev->flags))
4760 rdev = rcu_dereference(
4761 conf->disks[i].replacement);
4762 if (rdev && !test_bit(Faulty, &rdev->flags))
4767 if (test_bit(R5_InJournal, &dev->flags))
4769 if (test_bit(R5_InJournal, &dev->flags) && dev->written)
4772 if (test_bit(STRIPE_SYNCING, &sh->state)) {
4773 /* If there is a failed device being replaced,
4774 * we must be recovering.
4775 * else if we are after recovery_cp, we must be syncing
4776 * else if MD_RECOVERY_REQUESTED is set, we are also syncing.
4777 * else we can only be replacing
4778 * sync and recovery both need to read all devices, and so
4779 * use the same flag.
4782 sh->sector >= conf->mddev->recovery_cp ||
4783 test_bit(MD_RECOVERY_REQUESTED, &(conf->mddev->recovery)))
4792 * Return '1' if this is a member of a batch, or '0' if it is a lone stripe or
4793 * a head which can now be handled.
4795 static int clear_batch_ready(struct stripe_head *sh)
4797 struct stripe_head *tmp;
4798 if (!test_and_clear_bit(STRIPE_BATCH_READY, &sh->state))
4799 return (sh->batch_head && sh->batch_head != sh);
4800 spin_lock(&sh->stripe_lock);
4801 if (!sh->batch_head) {
4802 spin_unlock(&sh->stripe_lock);
4807 * this stripe could be added to a batch list before we check
4808 * BATCH_READY, so skip it
4810 if (sh->batch_head != sh) {
4811 spin_unlock(&sh->stripe_lock);
4814 spin_lock(&sh->batch_lock);
4815 list_for_each_entry(tmp, &sh->batch_list, batch_list)
4816 clear_bit(STRIPE_BATCH_READY, &tmp->state);
4817 spin_unlock(&sh->batch_lock);
4818 spin_unlock(&sh->stripe_lock);
4821 * BATCH_READY is cleared, no new stripes can be added.
4822 * batch_list can be accessed without lock
4827 static void break_stripe_batch_list(struct stripe_head *head_sh,
4828 unsigned long handle_flags)
4830 struct stripe_head *sh, *next;
4834 list_for_each_entry_safe(sh, next, &head_sh->batch_list, batch_list) {
4836 list_del_init(&sh->batch_list);
4838 WARN_ONCE(sh->state & ((1 << STRIPE_ACTIVE) |
4839 (1 << STRIPE_SYNCING) |
4840 (1 << STRIPE_REPLACED) |
4841 (1 << STRIPE_DELAYED) |
4842 (1 << STRIPE_BIT_DELAY) |
4843 (1 << STRIPE_FULL_WRITE) |
4844 (1 << STRIPE_BIOFILL_RUN) |
4845 (1 << STRIPE_COMPUTE_RUN) |
4846 (1 << STRIPE_DISCARD) |
4847 (1 << STRIPE_BATCH_READY) |
4848 (1 << STRIPE_BATCH_ERR) |
4849 (1 << STRIPE_BITMAP_PENDING)),
4850 "stripe state: %lx\n", sh->state);
4851 WARN_ONCE(head_sh->state & ((1 << STRIPE_DISCARD) |
4852 (1 << STRIPE_REPLACED)),
4853 "head stripe state: %lx\n", head_sh->state);
4855 set_mask_bits(&sh->state, ~(STRIPE_EXPAND_SYNC_FLAGS |
4856 (1 << STRIPE_PREREAD_ACTIVE) |
4857 (1 << STRIPE_DEGRADED) |
4858 (1 << STRIPE_ON_UNPLUG_LIST)),
4859 head_sh->state & (1 << STRIPE_INSYNC));
4861 sh->check_state = head_sh->check_state;
4862 sh->reconstruct_state = head_sh->reconstruct_state;
4863 spin_lock_irq(&sh->stripe_lock);
4864 sh->batch_head = NULL;
4865 spin_unlock_irq(&sh->stripe_lock);
4866 for (i = 0; i < sh->disks; i++) {
4867 if (test_and_clear_bit(R5_Overlap, &sh->dev[i].flags))
4869 sh->dev[i].flags = head_sh->dev[i].flags &
4870 (~((1 << R5_WriteError) | (1 << R5_Overlap)));
4872 if (handle_flags == 0 ||
4873 sh->state & handle_flags)
4874 set_bit(STRIPE_HANDLE, &sh->state);
4875 raid5_release_stripe(sh);
4877 spin_lock_irq(&head_sh->stripe_lock);
4878 head_sh->batch_head = NULL;
4879 spin_unlock_irq(&head_sh->stripe_lock);
4880 for (i = 0; i < head_sh->disks; i++)
4881 if (test_and_clear_bit(R5_Overlap, &head_sh->dev[i].flags))
4883 if (head_sh->state & handle_flags)
4884 set_bit(STRIPE_HANDLE, &head_sh->state);
4887 wake_up(&head_sh->raid_conf->wait_for_overlap);
4890 static void handle_stripe(struct stripe_head *sh)
4892 struct stripe_head_state s;
4893 struct r5conf *conf = sh->raid_conf;
4896 int disks = sh->disks;
4897 struct r5dev *pdev, *qdev;
4899 clear_bit(STRIPE_HANDLE, &sh->state);
4902 * handle_stripe should not continue to handle a batched stripe; only
4903 * the head of a batch list or a lone stripe can continue. Otherwise we
4904 * could see break_stripe_batch_list warn that STRIPE_ACTIVE
4905 * is set for a batched stripe.
4907 if (clear_batch_ready(sh))
4910 if (test_and_set_bit_lock(STRIPE_ACTIVE, &sh->state)) {
4911 /* already being handled, ensure it gets handled
4912 * again when the current action finishes */
4913 set_bit(STRIPE_HANDLE, &sh->state);
4917 if (test_and_clear_bit(STRIPE_BATCH_ERR, &sh->state))
4918 break_stripe_batch_list(sh, 0);
4920 if (test_bit(STRIPE_SYNC_REQUESTED, &sh->state) && !sh->batch_head) {
4921 spin_lock(&sh->stripe_lock);
4923 * Cannot process 'sync' concurrently with 'discard'.
4924 * Flush data in r5cache before 'sync'.
4926 if (!test_bit(STRIPE_R5C_PARTIAL_STRIPE, &sh->state) &&
4927 !test_bit(STRIPE_R5C_FULL_STRIPE, &sh->state) &&
4928 !test_bit(STRIPE_DISCARD, &sh->state) &&
4929 test_and_clear_bit(STRIPE_SYNC_REQUESTED, &sh->state)) {
4930 set_bit(STRIPE_SYNCING, &sh->state);
4931 clear_bit(STRIPE_INSYNC, &sh->state);
4932 clear_bit(STRIPE_REPLACED, &sh->state);
4934 spin_unlock(&sh->stripe_lock);
4936 clear_bit(STRIPE_DELAYED, &sh->state);
4938 pr_debug("handling stripe %llu, state=%#lx cnt=%d, "
4939 "pd_idx=%d, qd_idx=%d\n, check:%d, reconstruct:%d\n",
4940 (unsigned long long)sh->sector, sh->state,
4941 atomic_read(&sh->count), sh->pd_idx, sh->qd_idx,
4942 sh->check_state, sh->reconstruct_state);
4944 analyse_stripe(sh, &s);
4946 if (test_bit(STRIPE_LOG_TRAPPED, &sh->state))
4949 if (s.handle_bad_blocks ||
4950 test_bit(MD_SB_CHANGE_PENDING, &conf->mddev->sb_flags)) {
4951 set_bit(STRIPE_HANDLE, &sh->state);
4955 if (unlikely(s.blocked_rdev)) {
4956 if (s.syncing || s.expanding || s.expanded ||
4957 s.replacing || s.to_write || s.written) {
4958 set_bit(STRIPE_HANDLE, &sh->state);
4961 /* There is nothing for the blocked_rdev to block */
4962 rdev_dec_pending(s.blocked_rdev, conf->mddev);
4963 s.blocked_rdev = NULL;
4966 if (s.to_fill && !test_bit(STRIPE_BIOFILL_RUN, &sh->state)) {
4967 set_bit(STRIPE_OP_BIOFILL, &s.ops_request);
4968 set_bit(STRIPE_BIOFILL_RUN, &sh->state);
4971 pr_debug("locked=%d uptodate=%d to_read=%d"
4972 " to_write=%d failed=%d failed_num=%d,%d\n",
4973 s.locked, s.uptodate, s.to_read, s.to_write, s.failed,
4974 s.failed_num[0], s.failed_num[1]);
4976 * check if the array has lost more than max_degraded devices and,
4977 * if so, some requests might need to be failed.
4979 * When the journal device has failed (log_failed), we only process
4980 * the stripe if there is data that needs to be written to the raid disks
4982 if (s.failed > conf->max_degraded ||
4983 (s.log_failed && s.injournal == 0)) {
4984 sh->check_state = 0;
4985 sh->reconstruct_state = 0;
4986 break_stripe_batch_list(sh, 0);
4987 if (s.to_read+s.to_write+s.written)
4988 handle_failed_stripe(conf, sh, &s, disks);
4989 if (s.syncing + s.replacing)
4990 handle_failed_sync(conf, sh, &s);
4993 /* Now we check to see if any write operations have recently
4997 if (sh->reconstruct_state == reconstruct_state_prexor_drain_result)
4999 if (sh->reconstruct_state == reconstruct_state_drain_result ||
5000 sh->reconstruct_state == reconstruct_state_prexor_drain_result) {
5001 sh->reconstruct_state = reconstruct_state_idle;
5003 /* All the 'written' buffers and the parity block are ready to
5004 * be written back to disk
5006 BUG_ON(!test_bit(R5_UPTODATE, &sh->dev[sh->pd_idx].flags) &&
5007 !test_bit(R5_Discard, &sh->dev[sh->pd_idx].flags));
5008 BUG_ON(sh->qd_idx >= 0 &&
5009 !test_bit(R5_UPTODATE, &sh->dev[sh->qd_idx].flags) &&
5010 !test_bit(R5_Discard, &sh->dev[sh->qd_idx].flags));
5011 for (i = disks; i--; ) {
5012 struct r5dev *dev = &sh->dev[i];
5013 if (test_bit(R5_LOCKED, &dev->flags) &&
5014 (i == sh->pd_idx || i == sh->qd_idx ||
5015 dev->written || test_bit(R5_InJournal,
5017 pr_debug("Writing block %d\n", i);
5018 set_bit(R5_Wantwrite, &dev->flags);
5023 if (!test_bit(R5_Insync, &dev->flags) ||
5024 ((i == sh->pd_idx || i == sh->qd_idx) &&
5026 set_bit(STRIPE_INSYNC, &sh->state);
5029 if (test_and_clear_bit(STRIPE_PREREAD_ACTIVE, &sh->state))
5030 s.dec_preread_active = 1;
5034 * might be able to return some write requests if the parity blocks
5035 * are safe, or on a failed drive
5037 pdev = &sh->dev[sh->pd_idx];
5038 s.p_failed = (s.failed >= 1 && s.failed_num[0] == sh->pd_idx)
5039 || (s.failed >= 2 && s.failed_num[1] == sh->pd_idx);
5040 qdev = &sh->dev[sh->qd_idx];
5041 s.q_failed = (s.failed >= 1 && s.failed_num[0] == sh->qd_idx)
5042 || (s.failed >= 2 && s.failed_num[1] == sh->qd_idx)
5046 (s.p_failed || ((test_bit(R5_Insync, &pdev->flags)
5047 && !test_bit(R5_LOCKED, &pdev->flags)
5048 && (test_bit(R5_UPTODATE, &pdev->flags) ||
5049 test_bit(R5_Discard, &pdev->flags))))) &&
5050 (s.q_failed || ((test_bit(R5_Insync, &qdev->flags)
5051 && !test_bit(R5_LOCKED, &qdev->flags)
5052 && (test_bit(R5_UPTODATE, &qdev->flags) ||
5053 test_bit(R5_Discard, &qdev->flags))))))
5054 handle_stripe_clean_event(conf, sh, disks);
5057 r5c_handle_cached_data_endio(conf, sh, disks);
5058 log_stripe_write_finished(sh);
5060 /* Now we might consider reading some blocks, either to check/generate
5061 * parity, or to satisfy requests
5062 * or to load a block that is being partially written.
5064 if (s.to_read || s.non_overwrite
5065 || (s.to_write && s.failed)
5066 || (s.syncing && (s.uptodate + s.compute < disks))
5069 handle_stripe_fill(sh, &s, disks);
5072 * When the stripe finishes a full journal write cycle (write to journal
5073 * and raid disk), this is the clean-up procedure so it is ready for
5076 r5c_finish_stripe_write_out(conf, sh, &s);
5079 * Now to consider new write requests, cache write back and what else,
5080 * if anything, should be read. We do not handle new writes when:
5081 * 1/ A 'write' operation (copy+xor) is already in flight.
5082 * 2/ A 'check' operation is in flight, as it may clobber the parity
5084 * 3/ A r5c cache log write is in flight.
5087 if (!sh->reconstruct_state && !sh->check_state && !sh->log_io) {
5088 if (!r5c_is_writeback(conf->log)) {
5090 handle_stripe_dirtying(conf, sh, &s, disks);
5091 } else { /* write back cache */
5094 /* First, try handle writes in caching phase */
5096 ret = r5c_try_caching_write(conf, sh, &s,
5099 * If caching phase failed: ret == -EAGAIN
5101 * stripe under reclaim: !caching && injournal
5103 * fall back to handle_stripe_dirtying()
5105 if (ret == -EAGAIN ||
5106 /* stripe under reclaim: !caching && injournal */
5107 (!test_bit(STRIPE_R5C_CACHING, &sh->state) &&
5109 ret = handle_stripe_dirtying(conf, sh, &s,
5117 /* maybe we need to check and possibly fix the parity for this stripe
5118 * Any reads will already have been scheduled, so we just see if enough
5119 * data is available. The parity check is held off while parity
5120 * dependent operations are in flight.
5122 if (sh->check_state ||
5123 (s.syncing && s.locked == 0 &&
5124 !test_bit(STRIPE_COMPUTE_RUN, &sh->state) &&
5125 !test_bit(STRIPE_INSYNC, &sh->state))) {
5126 if (conf->level == 6)
5127 handle_parity_checks6(conf, sh, &s, disks);
5129 handle_parity_checks5(conf, sh, &s, disks);
5132 if ((s.replacing || s.syncing) && s.locked == 0
5133 && !test_bit(STRIPE_COMPUTE_RUN, &sh->state)
5134 && !test_bit(STRIPE_REPLACED, &sh->state)) {
5135 /* Write out to replacement devices where possible */
5136 for (i = 0; i < conf->raid_disks; i++)
5137 if (test_bit(R5_NeedReplace, &sh->dev[i].flags)) {
5138 WARN_ON(!test_bit(R5_UPTODATE, &sh->dev[i].flags));
5139 set_bit(R5_WantReplace, &sh->dev[i].flags);
5140 set_bit(R5_LOCKED, &sh->dev[i].flags);
5144 set_bit(STRIPE_INSYNC, &sh->state);
5145 set_bit(STRIPE_REPLACED, &sh->state);
5147 if ((s.syncing || s.replacing) && s.locked == 0 &&
5148 !test_bit(STRIPE_COMPUTE_RUN, &sh->state) &&
5149 test_bit(STRIPE_INSYNC, &sh->state)) {
5150 md_done_sync(conf->mddev, RAID5_STRIPE_SECTORS(conf), 1);
5151 clear_bit(STRIPE_SYNCING, &sh->state);
5152 if (test_and_clear_bit(R5_Overlap, &sh->dev[sh->pd_idx].flags))
5153 wake_up(&conf->wait_for_overlap);
5156 /* If the failed drives are just a ReadError, then we might need
5157 * to progress the repair/check process
5159 if (s.failed <= conf->max_degraded && !conf->mddev->ro)
5160 for (i = 0; i < s.failed; i++) {
5161 struct r5dev *dev = &sh->dev[s.failed_num[i]];
5162 if (test_bit(R5_ReadError, &dev->flags)
5163 && !test_bit(R5_LOCKED, &dev->flags)
5164 && test_bit(R5_UPTODATE, &dev->flags)
5166 if (!test_bit(R5_ReWrite, &dev->flags)) {
5167 set_bit(R5_Wantwrite, &dev->flags);
5168 set_bit(R5_ReWrite, &dev->flags);
5170 /* let's read it back */
5171 set_bit(R5_Wantread, &dev->flags);
5172 set_bit(R5_LOCKED, &dev->flags);
5177 /* Finish reconstruct operations initiated by the expansion process */
5178 if (sh->reconstruct_state == reconstruct_state_result) {
5179 struct stripe_head *sh_src
5180 = raid5_get_active_stripe(conf, sh->sector, 1, 1, 1);
5181 if (sh_src && test_bit(STRIPE_EXPAND_SOURCE, &sh_src->state)) {
5182 /* sh cannot be written until sh_src has been read.
5183 * so arrange for sh to be delayed a little
5185 set_bit(STRIPE_DELAYED, &sh->state);
5186 set_bit(STRIPE_HANDLE, &sh->state);
5187 if (!test_and_set_bit(STRIPE_PREREAD_ACTIVE,
5189 atomic_inc(&conf->preread_active_stripes);
5190 raid5_release_stripe(sh_src);
5194 raid5_release_stripe(sh_src);
5196 sh->reconstruct_state = reconstruct_state_idle;
5197 clear_bit(STRIPE_EXPANDING, &sh->state);
5198 for (i = conf->raid_disks; i--; ) {
5199 set_bit(R5_Wantwrite, &sh->dev[i].flags);
5200 set_bit(R5_LOCKED, &sh->dev[i].flags);
5205 if (s.expanded && test_bit(STRIPE_EXPANDING, &sh->state) &&
5206 !sh->reconstruct_state) {
5207 /* Need to write out all blocks after computing parity */
5208 sh->disks = conf->raid_disks;
5209 stripe_set_idx(sh->sector, conf, 0, sh);
5210 schedule_reconstruction(sh, &s, 1, 1);
5211 } else if (s.expanded && !sh->reconstruct_state && s.locked == 0) {
5212 clear_bit(STRIPE_EXPAND_READY, &sh->state);
5213 atomic_dec(&conf->reshape_stripes);
5214 wake_up(&conf->wait_for_overlap);
5215 md_done_sync(conf->mddev, RAID5_STRIPE_SECTORS(conf), 1);
5218 if (s.expanding && s.locked == 0 &&
5219 !test_bit(STRIPE_COMPUTE_RUN, &sh->state))
5220 handle_stripe_expansion(conf, sh);
5223 /* wait for this device to become unblocked */
5224 if (unlikely(s.blocked_rdev)) {
5225 if (conf->mddev->external)
5226 md_wait_for_blocked_rdev(s.blocked_rdev,
5229 /* Internal metadata will immediately
5230 * be written by raid5d, so we don't
5231 * need to wait here.
5233 rdev_dec_pending(s.blocked_rdev,
5237 if (s.handle_bad_blocks)
5238 for (i = disks; i--; ) {
5239 struct md_rdev *rdev;
5240 struct r5dev *dev = &sh->dev[i];
5241 if (test_and_clear_bit(R5_WriteError, &dev->flags)) {
5242 /* We own a safe reference to the rdev */
5243 rdev = rdev_pend_deref(conf->disks[i].rdev);
5244 if (!rdev_set_badblocks(rdev, sh->sector,
5245 RAID5_STRIPE_SECTORS(conf), 0))
5246 md_error(conf->mddev, rdev);
5247 rdev_dec_pending(rdev, conf->mddev);
5249 if (test_and_clear_bit(R5_MadeGood, &dev->flags)) {
5250 rdev = rdev_pend_deref(conf->disks[i].rdev);
5251 rdev_clear_badblocks(rdev, sh->sector,
5252 RAID5_STRIPE_SECTORS(conf), 0);
5253 rdev_dec_pending(rdev, conf->mddev);
5255 if (test_and_clear_bit(R5_MadeGoodRepl, &dev->flags)) {
5256 rdev = rdev_pend_deref(conf->disks[i].replacement);
5258 /* rdev has been moved down */
5259 rdev = rdev_pend_deref(conf->disks[i].rdev);
5260 rdev_clear_badblocks(rdev, sh->sector,
5261 RAID5_STRIPE_SECTORS(conf), 0);
5262 rdev_dec_pending(rdev, conf->mddev);
5267 raid_run_ops(sh, s.ops_request);
5271 if (s.dec_preread_active) {
5272 /* We delay this until after ops_run_io so that if make_request
5273 * is waiting on a flush, it won't continue until the writes
5274 * have actually been submitted.
5276 atomic_dec(&conf->preread_active_stripes);
5277 if (atomic_read(&conf->preread_active_stripes) <
5279 md_wakeup_thread(conf->mddev->thread);
5282 clear_bit_unlock(STRIPE_ACTIVE, &sh->state);
5285 static void raid5_activate_delayed(struct r5conf *conf)
5286 __must_hold(&conf->device_lock)
5288 if (atomic_read(&conf->preread_active_stripes) < IO_THRESHOLD) {
5289 while (!list_empty(&conf->delayed_list)) {
5290 struct list_head *l = conf->delayed_list.next;
5291 struct stripe_head *sh;
5292 sh = list_entry(l, struct stripe_head, lru);
5294 clear_bit(STRIPE_DELAYED, &sh->state);
5295 if (!test_and_set_bit(STRIPE_PREREAD_ACTIVE, &sh->state))
5296 atomic_inc(&conf->preread_active_stripes);
5297 list_add_tail(&sh->lru, &conf->hold_list);
5298 raid5_wakeup_stripe_thread(sh);
5303 static void activate_bit_delay(struct r5conf *conf,
5304 struct list_head *temp_inactive_list)
5305 __must_hold(&conf->device_lock)
5307 struct list_head head;
5308 list_add(&head, &conf->bitmap_list);
5309 list_del_init(&conf->bitmap_list);
5310 while (!list_empty(&head)) {
5311 struct stripe_head *sh = list_entry(head.next, struct stripe_head, lru);
5313 list_del_init(&sh->lru);
5314 atomic_inc(&sh->count);
5315 hash = sh->hash_lock_index;
5316 __release_stripe(conf, sh, &temp_inactive_list[hash]);
5320 static int in_chunk_boundary(struct mddev *mddev, struct bio *bio)
5322 struct r5conf *conf = mddev->private;
5323 sector_t sector = bio->bi_iter.bi_sector;
5324 unsigned int chunk_sectors;
5325 unsigned int bio_sectors = bio_sectors(bio);
5327 chunk_sectors = min(conf->chunk_sectors, conf->prev_chunk_sectors);
5328 return chunk_sectors >=
5329 ((sector & (chunk_sectors - 1)) + bio_sectors);
5333 * add bio to the retry LIFO (in O(1) ... we are in interrupt context)
5334 * later sampled by raid5d.
5336 static void add_bio_to_retry(struct bio *bi,struct r5conf *conf)
5338 unsigned long flags;
5340 spin_lock_irqsave(&conf->device_lock, flags);
5342 bi->bi_next = conf->retry_read_aligned_list;
5343 conf->retry_read_aligned_list = bi;
5345 spin_unlock_irqrestore(&conf->device_lock, flags);
5346 md_wakeup_thread(conf->mddev->thread);
5349 static struct bio *remove_bio_from_retry(struct r5conf *conf,
5350 unsigned int *offset)
5354 bi = conf->retry_read_aligned;
5356 *offset = conf->retry_read_offset;
5357 conf->retry_read_aligned = NULL;
5360 bi = conf->retry_read_aligned_list;
5362 conf->retry_read_aligned_list = bi->bi_next;
5371 * The "raid5_align_endio" should check if the read succeeded and if it
5372 * did, call bio_endio on the original bio (having bio_put the new bio
5374 * If the read failed..
5376 static void raid5_align_endio(struct bio *bi)
5378 struct md_io_acct *md_io_acct = bi->bi_private;
5379 struct bio *raid_bi = md_io_acct->orig_bio;
5380 struct mddev *mddev;
5381 struct r5conf *conf;
5382 struct md_rdev *rdev;
5383 blk_status_t error = bi->bi_status;
5384 unsigned long start_time = md_io_acct->start_time;
5388 rdev = (void*)raid_bi->bi_next;
5389 raid_bi->bi_next = NULL;
5390 mddev = rdev->mddev;
5391 conf = mddev->private;
5393 rdev_dec_pending(rdev, conf->mddev);
5396 if (blk_queue_io_stat(raid_bi->bi_bdev->bd_disk->queue))
5397 bio_end_io_acct(raid_bi, start_time);
5399 if (atomic_dec_and_test(&conf->active_aligned_reads))
5400 wake_up(&conf->wait_for_quiescent);
5404 pr_debug("raid5_align_endio : io error...handing IO for a retry\n");
5406 add_bio_to_retry(raid_bi, conf);
5409 static int raid5_read_one_chunk(struct mddev *mddev, struct bio *raid_bio)
5411 struct r5conf *conf = mddev->private;
5412 struct bio *align_bio;
5413 struct md_rdev *rdev;
5414 sector_t sector, end_sector, first_bad;
5415 int bad_sectors, dd_idx;
5416 struct md_io_acct *md_io_acct;
5419 if (!in_chunk_boundary(mddev, raid_bio)) {
5420 pr_debug("%s: non aligned\n", __func__);
5424 sector = raid5_compute_sector(conf, raid_bio->bi_iter.bi_sector, 0,
5426 end_sector = bio_end_sector(raid_bio);
5429 if (r5c_big_stripe_cached(conf, sector))
5430 goto out_rcu_unlock;
5432 rdev = rcu_dereference(conf->disks[dd_idx].replacement);
5433 if (!rdev || test_bit(Faulty, &rdev->flags) ||
5434 rdev->recovery_offset < end_sector) {
5435 rdev = rcu_dereference(conf->disks[dd_idx].rdev);
5437 goto out_rcu_unlock;
5438 if (test_bit(Faulty, &rdev->flags) ||
5439 !(test_bit(In_sync, &rdev->flags) ||
5440 rdev->recovery_offset >= end_sector))
5441 goto out_rcu_unlock;
5444 atomic_inc(&rdev->nr_pending);
5447 if (is_badblock(rdev, sector, bio_sectors(raid_bio), &first_bad,
5450 rdev_dec_pending(rdev, mddev);
5454 align_bio = bio_alloc_clone(rdev->bdev, raid_bio, GFP_NOIO,
5455 &mddev->io_acct_set);
5456 md_io_acct = container_of(align_bio, struct md_io_acct, bio_clone);
5457 raid_bio->bi_next = (void *)rdev;
5458 if (blk_queue_io_stat(raid_bio->bi_bdev->bd_disk->queue))
5459 md_io_acct->start_time = bio_start_io_acct(raid_bio);
5460 md_io_acct->orig_bio = raid_bio;
5462 align_bio->bi_end_io = raid5_align_endio;
5463 align_bio->bi_private = md_io_acct;
5464 align_bio->bi_iter.bi_sector = sector;
5466 /* No reshape active, so we can trust rdev->data_offset */
5467 align_bio->bi_iter.bi_sector += rdev->data_offset;
5470 if (conf->quiesce == 0) {
5471 atomic_inc(&conf->active_aligned_reads);
5474 /* need a memory barrier to detect the race with raid5_quiesce() */
5475 if (!did_inc || smp_load_acquire(&conf->quiesce) != 0) {
5476 /* quiesce is in progress, so we need to undo io activation and wait
5479 if (did_inc && atomic_dec_and_test(&conf->active_aligned_reads))
5480 wake_up(&conf->wait_for_quiescent);
5481 spin_lock_irq(&conf->device_lock);
5482 wait_event_lock_irq(conf->wait_for_quiescent, conf->quiesce == 0,
5484 atomic_inc(&conf->active_aligned_reads);
5485 spin_unlock_irq(&conf->device_lock);
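/*
 * Non-authoritative note on the did_inc dance above: we optimistically
 * bump active_aligned_reads while conf->quiesce looks clear, then re-read
 * quiesce with acquire semantics.  If a quiesce snuck in, the increment is
 * undone, we wait under device_lock for quiesce to drop back to 0, and
 * only then count this bio as an active aligned read again.
 */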
5489 trace_block_bio_remap(align_bio, disk_devt(mddev->gendisk),
5490 raid_bio->bi_iter.bi_sector);
5491 submit_bio_noacct(align_bio);
5499 static struct bio *chunk_aligned_read(struct mddev *mddev, struct bio *raid_bio)
5502 sector_t sector = raid_bio->bi_iter.bi_sector;
5503 unsigned chunk_sects = mddev->chunk_sectors;
5504 unsigned sectors = chunk_sects - (sector & (chunk_sects-1));
5506 if (sectors < bio_sectors(raid_bio)) {
5507 struct r5conf *conf = mddev->private;
5508 split = bio_split(raid_bio, sectors, GFP_NOIO, &conf->bio_split);
5509 bio_chain(split, raid_bio);
5510 submit_bio_noacct(raid_bio);
5514 if (!raid5_read_one_chunk(mddev, raid_bio))
5520 /* __get_priority_stripe - get the next stripe to process
5522 * Full stripe writes are allowed to pass preread active stripes up until
5523 * the bypass_threshold is exceeded. In general the bypass_count
5524 * increments when the handle_list is handled before the hold_list; however, it
5525 * will not be incremented when STRIPE_IO_STARTED is sampled as set, signifying a
5526 * stripe with in-flight i/o. The bypass_count will be reset when the
5527 * head of the hold_list has changed, i.e. the head was promoted to the
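/*
 * Worked example (illustrative, not normative): with bypass_threshold == 1,
 * servicing two handle_list stripes in a row while a full-stripe write sits
 * on hold_list drives bypass_count to 2; since 2 > bypass_threshold, the
 * next call picks the hold_list entry instead and bypass_count is reduced
 * by bypass_threshold.
 */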
5530 static struct stripe_head *__get_priority_stripe(struct r5conf *conf, int group)
5531 __must_hold(&conf->device_lock)
5533 struct stripe_head *sh, *tmp;
5534 struct list_head *handle_list = NULL;
5535 struct r5worker_group *wg;
5536 bool second_try = !r5c_is_writeback(conf->log) &&
5537 !r5l_log_disk_error(conf);
5538 bool try_loprio = test_bit(R5C_LOG_TIGHT, &conf->cache_state) ||
5539 r5l_log_disk_error(conf);
5544 if (conf->worker_cnt_per_group == 0) {
5545 handle_list = try_loprio ? &conf->loprio_list :
5547 } else if (group != ANY_GROUP) {
5548 handle_list = try_loprio ? &conf->worker_groups[group].loprio_list :
5549 &conf->worker_groups[group].handle_list;
5550 wg = &conf->worker_groups[group];
5553 for (i = 0; i < conf->group_cnt; i++) {
5554 handle_list = try_loprio ? &conf->worker_groups[i].loprio_list :
5555 &conf->worker_groups[i].handle_list;
5556 wg = &conf->worker_groups[i];
5557 if (!list_empty(handle_list))
5562 pr_debug("%s: handle: %s hold: %s full_writes: %d bypass_count: %d\n",
5564 list_empty(handle_list) ? "empty" : "busy",
5565 list_empty(&conf->hold_list) ? "empty" : "busy",
5566 atomic_read(&conf->pending_full_writes), conf->bypass_count);
5568 if (!list_empty(handle_list)) {
5569 sh = list_entry(handle_list->next, typeof(*sh), lru);
5571 if (list_empty(&conf->hold_list))
5572 conf->bypass_count = 0;
5573 else if (!test_bit(STRIPE_IO_STARTED, &sh->state)) {
5574 if (conf->hold_list.next == conf->last_hold)
5575 conf->bypass_count++;
5577 conf->last_hold = conf->hold_list.next;
5578 conf->bypass_count -= conf->bypass_threshold;
5579 if (conf->bypass_count < 0)
5580 conf->bypass_count = 0;
5583 } else if (!list_empty(&conf->hold_list) &&
5584 ((conf->bypass_threshold &&
5585 conf->bypass_count > conf->bypass_threshold) ||
5586 atomic_read(&conf->pending_full_writes) == 0)) {
5588 list_for_each_entry(tmp, &conf->hold_list, lru) {
5589 if (conf->worker_cnt_per_group == 0 ||
5590 group == ANY_GROUP ||
5591 !cpu_online(tmp->cpu) ||
5592 cpu_to_group(tmp->cpu) == group) {
5599 conf->bypass_count -= conf->bypass_threshold;
5600 if (conf->bypass_count < 0)
5601 conf->bypass_count = 0;
5610 try_loprio = !try_loprio;
5618 list_del_init(&sh->lru);
5619 BUG_ON(atomic_inc_return(&sh->count) != 1);
5623 struct raid5_plug_cb {
5624 struct blk_plug_cb cb;
5625 struct list_head list;
5626 struct list_head temp_inactive_list[NR_STRIPE_HASH_LOCKS];
5629 static void raid5_unplug(struct blk_plug_cb *blk_cb, bool from_schedule)
5631 struct raid5_plug_cb *cb = container_of(
5632 blk_cb, struct raid5_plug_cb, cb);
5633 struct stripe_head *sh;
5634 struct mddev *mddev = cb->cb.data;
5635 struct r5conf *conf = mddev->private;
5639 if (cb->list.next && !list_empty(&cb->list)) {
5640 spin_lock_irq(&conf->device_lock);
5641 while (!list_empty(&cb->list)) {
5642 sh = list_first_entry(&cb->list, struct stripe_head, lru);
5643 list_del_init(&sh->lru);
5645 * avoid the race where release_stripe_plug() sees
5646 * STRIPE_ON_UNPLUG_LIST clear but the stripe
5647 * is still on our list
5649 smp_mb__before_atomic();
5650 clear_bit(STRIPE_ON_UNPLUG_LIST, &sh->state);
5652 * STRIPE_ON_RELEASE_LIST could be set here. In that
5653 * case, the count is always > 1 here
5655 hash = sh->hash_lock_index;
5656 __release_stripe(conf, sh, &cb->temp_inactive_list[hash]);
5659 spin_unlock_irq(&conf->device_lock);
5661 release_inactive_stripe_list(conf, cb->temp_inactive_list,
5662 NR_STRIPE_HASH_LOCKS);
5664 trace_block_unplug(mddev->queue, cnt, !from_schedule);
5668 static void release_stripe_plug(struct mddev *mddev,
5669 struct stripe_head *sh)
5671 struct blk_plug_cb *blk_cb = blk_check_plugged(
5672 raid5_unplug, mddev,
5673 sizeof(struct raid5_plug_cb));
5674 struct raid5_plug_cb *cb;
5677 raid5_release_stripe(sh);
5681 cb = container_of(blk_cb, struct raid5_plug_cb, cb);
5683 if (cb->list.next == NULL) {
5685 INIT_LIST_HEAD(&cb->list);
5686 for (i = 0; i < NR_STRIPE_HASH_LOCKS; i++)
5687 INIT_LIST_HEAD(cb->temp_inactive_list + i);
5690 if (!test_and_set_bit(STRIPE_ON_UNPLUG_LIST, &sh->state))
5691 list_add_tail(&sh->lru, &cb->list);
5693 raid5_release_stripe(sh);
5696 static void make_discard_request(struct mddev *mddev, struct bio *bi)
5698 struct r5conf *conf = mddev->private;
5699 sector_t logical_sector, last_sector;
5700 struct stripe_head *sh;
5703 /* We need to handle this when io_uring supports discard/trim */
5704 if (WARN_ON_ONCE(bi->bi_opf & REQ_NOWAIT))
5707 if (mddev->reshape_position != MaxSector)
5708 /* Skip discard while reshape is happening */
5711 logical_sector = bi->bi_iter.bi_sector & ~((sector_t)RAID5_STRIPE_SECTORS(conf)-1);
5712 last_sector = bio_end_sector(bi);
5716 stripe_sectors = conf->chunk_sectors *
5717 (conf->raid_disks - conf->max_degraded);
5718 logical_sector = DIV_ROUND_UP_SECTOR_T(logical_sector,
5720 sector_div(last_sector, stripe_sectors);
5722 logical_sector *= conf->chunk_sectors;
5723 last_sector *= conf->chunk_sectors;
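/*
 * Worked example (illustrative only): with 4 data disks and 512-sector
 * chunks, stripe_sectors == 2048.  A discard of array sectors 3000..9000
 * is trimmed to the data-stripes it fully covers, i.e. array sectors
 * 4096..8191 (per-device chunk offsets 1024..2047); the partially covered
 * stripes at either end are simply not discarded.
 */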
5725 for (; logical_sector < last_sector;
5726 logical_sector += RAID5_STRIPE_SECTORS(conf)) {
5730 sh = raid5_get_active_stripe(conf, logical_sector, 0, 0, 0);
5731 prepare_to_wait(&conf->wait_for_overlap, &w,
5732 TASK_UNINTERRUPTIBLE);
5733 set_bit(R5_Overlap, &sh->dev[sh->pd_idx].flags);
5734 if (test_bit(STRIPE_SYNCING, &sh->state)) {
5735 raid5_release_stripe(sh);
5739 clear_bit(R5_Overlap, &sh->dev[sh->pd_idx].flags);
5740 spin_lock_irq(&sh->stripe_lock);
5741 for (d = 0; d < conf->raid_disks; d++) {
5742 if (d == sh->pd_idx || d == sh->qd_idx)
5744 if (sh->dev[d].towrite || sh->dev[d].toread) {
5745 set_bit(R5_Overlap, &sh->dev[d].flags);
5746 spin_unlock_irq(&sh->stripe_lock);
5747 raid5_release_stripe(sh);
5752 set_bit(STRIPE_DISCARD, &sh->state);
5753 finish_wait(&conf->wait_for_overlap, &w);
5754 sh->overwrite_disks = 0;
5755 for (d = 0; d < conf->raid_disks; d++) {
5756 if (d == sh->pd_idx || d == sh->qd_idx)
5758 sh->dev[d].towrite = bi;
5759 set_bit(R5_OVERWRITE, &sh->dev[d].flags);
5760 bio_inc_remaining(bi);
5761 md_write_inc(mddev, bi);
5762 sh->overwrite_disks++;
5764 spin_unlock_irq(&sh->stripe_lock);
5765 if (conf->mddev->bitmap) {
5767 d < conf->raid_disks - conf->max_degraded;
5769 md_bitmap_startwrite(mddev->bitmap,
5771 RAID5_STRIPE_SECTORS(conf),
5773 sh->bm_seq = conf->seq_flush + 1;
5774 set_bit(STRIPE_BIT_DELAY, &sh->state);
5777 set_bit(STRIPE_HANDLE, &sh->state);
5778 clear_bit(STRIPE_DELAYED, &sh->state);
5779 if (!test_and_set_bit(STRIPE_PREREAD_ACTIVE, &sh->state))
5780 atomic_inc(&conf->preread_active_stripes);
5781 release_stripe_plug(mddev, sh);
5787 static bool raid5_make_request(struct mddev *mddev, struct bio * bi)
5789 struct r5conf *conf = mddev->private;
5791 sector_t new_sector;
5792 sector_t logical_sector, last_sector;
5793 struct stripe_head *sh;
5794 const int rw = bio_data_dir(bi);
5797 bool do_flush = false;
5799 if (unlikely(bi->bi_opf & REQ_PREFLUSH)) {
5800 int ret = log_handle_flush_request(conf, bi);
5804 if (ret == -ENODEV) {
5805 if (md_flush_request(mddev, bi))
5808 /* ret == -EAGAIN, fallback */
5810 * if r5l_handle_flush_request() didn't clear REQ_PREFLUSH,
5811 * we need to flush journal device
5813 do_flush = bi->bi_opf & REQ_PREFLUSH;
5816 if (!md_write_start(mddev, bi))
5819 * If array is degraded, better not do chunk aligned read because
5820 * later we might have to read it again in order to reconstruct
5821 * data on failed drives.
5823 if (rw == READ && mddev->degraded == 0 &&
5824 mddev->reshape_position == MaxSector) {
5825 bi = chunk_aligned_read(mddev, bi);
5830 if (unlikely(bio_op(bi) == REQ_OP_DISCARD)) {
5831 make_discard_request(mddev, bi);
5832 md_write_end(mddev);
5836 logical_sector = bi->bi_iter.bi_sector & ~((sector_t)RAID5_STRIPE_SECTORS(conf)-1);
5837 last_sector = bio_end_sector(bi);
5840 /* Bail out if conflicts with reshape and REQ_NOWAIT is set */
5841 if ((bi->bi_opf & REQ_NOWAIT) &&
5842 (conf->reshape_progress != MaxSector) &&
5843 (mddev->reshape_backwards
5844 ? (logical_sector > conf->reshape_progress && logical_sector <= conf->reshape_safe)
5845 : (logical_sector >= conf->reshape_safe && logical_sector < conf->reshape_progress))) {
5846 bio_wouldblock_error(bi);
5848 md_write_end(mddev);
5851 md_account_bio(mddev, &bi);
5852 prepare_to_wait(&conf->wait_for_overlap, &w, TASK_UNINTERRUPTIBLE);
5853 for (; logical_sector < last_sector; logical_sector += RAID5_STRIPE_SECTORS(conf)) {
5859 seq = read_seqcount_begin(&conf->gen_lock);
5862 prepare_to_wait(&conf->wait_for_overlap, &w,
5863 TASK_UNINTERRUPTIBLE);
5864 if (unlikely(conf->reshape_progress != MaxSector)) {
5865 /* spinlock is needed as reshape_progress may be
5866 * 64bit on a 32bit platform, and so it might be
5867 * possible to see a half-updated value
5868 * Of course reshape_progress could change after
5869 * the lock is dropped, so once we get a reference
5870 * to the stripe that we think it is, we will have
5873 spin_lock_irq(&conf->device_lock);
5874 if (mddev->reshape_backwards
5875 ? logical_sector < conf->reshape_progress
5876 : logical_sector >= conf->reshape_progress) {
5879 if (mddev->reshape_backwards
5880 ? logical_sector < conf->reshape_safe
5881 : logical_sector >= conf->reshape_safe) {
5882 spin_unlock_irq(&conf->device_lock);
5888 spin_unlock_irq(&conf->device_lock);
5891 new_sector = raid5_compute_sector(conf, logical_sector,
5894 pr_debug("raid456: raid5_make_request, sector %llu logical %llu\n",
5895 (unsigned long long)new_sector,
5896 (unsigned long long)logical_sector);
5898 sh = raid5_get_active_stripe(conf, new_sector, previous,
5899 (bi->bi_opf & REQ_RAHEAD), 0);
5901 if (unlikely(previous)) {
5902 /* expansion might have moved on while waiting for a
5903 * stripe, so we must do the range check again.
5904 * Expansion could still move past after this
5905 * test, but as we are holding a reference to
5906 * 'sh', we know that if that happens,
5907 * STRIPE_EXPANDING will get set and the expansion
5908 * won't proceed until we finish with the stripe.
5911 spin_lock_irq(&conf->device_lock);
5912 if (mddev->reshape_backwards
5913 ? logical_sector >= conf->reshape_progress
5914 : logical_sector < conf->reshape_progress)
5915 /* mismatch, need to try again */
5917 spin_unlock_irq(&conf->device_lock);
5919 raid5_release_stripe(sh);
5925 if (read_seqcount_retry(&conf->gen_lock, seq)) {
5926 /* Might have got the wrong stripe_head
5929 raid5_release_stripe(sh);
5933 if (test_bit(STRIPE_EXPANDING, &sh->state) ||
5934 !add_stripe_bio(sh, bi, dd_idx, rw, previous)) {
5935 /* Stripe is busy expanding or
5936 * add failed due to overlap. Flush everything
5939 md_wakeup_thread(mddev->thread);
5940 raid5_release_stripe(sh);
5946 set_bit(STRIPE_R5C_PREFLUSH, &sh->state);
5947 /* we only need flush for one stripe */
5951 set_bit(STRIPE_HANDLE, &sh->state);
5952 clear_bit(STRIPE_DELAYED, &sh->state);
5953 if ((!sh->batch_head || sh == sh->batch_head) &&
5954 (bi->bi_opf & REQ_SYNC) &&
5955 !test_and_set_bit(STRIPE_PREREAD_ACTIVE, &sh->state))
5956 atomic_inc(&conf->preread_active_stripes);
5957 release_stripe_plug(mddev, sh);
5959 /* cannot get stripe for read-ahead, just give up */
5960 bi->bi_status = BLK_STS_IOERR;
5964 finish_wait(&conf->wait_for_overlap, &w);
5967 md_write_end(mddev);
5972 static sector_t raid5_size(struct mddev *mddev, sector_t sectors, int raid_disks);
5974 static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr, int *skipped)
5976 /* reshaping is quite different to recovery/resync so it is
5977 * handled quite separately ... here.
5979 * On each call to sync_request, we gather one chunk worth of
5980 * destination stripes and flag them as expanding.
5981 * Then we find all the source stripes and request reads.
5982 * As the reads complete, handle_stripe will copy the data
5983 * into the destination stripe and release that stripe.
5985 struct r5conf *conf = mddev->private;
5986 struct stripe_head *sh;
5987 struct md_rdev *rdev;
5988 sector_t first_sector, last_sector;
5989 int raid_disks = conf->previous_raid_disks;
5990 int data_disks = raid_disks - conf->max_degraded;
5991 int new_data_disks = conf->raid_disks - conf->max_degraded;
5994 sector_t writepos, readpos, safepos;
5995 sector_t stripe_addr;
5996 int reshape_sectors;
5997 struct list_head stripes;
6000 if (sector_nr == 0) {
6001 /* If restarting in the middle, skip the initial sectors */
6002 if (mddev->reshape_backwards &&
6003 conf->reshape_progress < raid5_size(mddev, 0, 0)) {
6004 sector_nr = raid5_size(mddev, 0, 0)
6005 - conf->reshape_progress;
6006 } else if (mddev->reshape_backwards &&
6007 conf->reshape_progress == MaxSector) {
6008 /* shouldn't happen, but just in case, finish up. */
6009 sector_nr = MaxSector;
6010 } else if (!mddev->reshape_backwards &&
6011 conf->reshape_progress > 0)
6012 sector_nr = conf->reshape_progress;
6013 sector_div(sector_nr, new_data_disks);
6015 mddev->curr_resync_completed = sector_nr;
6016 sysfs_notify_dirent_safe(mddev->sysfs_completed);
6023 /* We need to process a full chunk at a time.
6024 * If old and new chunk sizes differ, we need to process the
6028 reshape_sectors = max(conf->chunk_sectors, conf->prev_chunk_sectors);
6030 /* We update the metadata at least every 10 seconds, or when
6031 * the data about to be copied would over-write the source of
6032 * the data at the front of the range, i.e. when the stripe one
6033 * new-chunk along from reshape_progress, mapped with the new layout,
6034 * would land beyond where reshape_safe maps with the old layout.
6036 writepos = conf->reshape_progress;
6037 sector_div(writepos, new_data_disks);
6038 readpos = conf->reshape_progress;
6039 sector_div(readpos, data_disks);
6040 safepos = conf->reshape_safe;
6041 sector_div(safepos, data_disks);
6042 if (mddev->reshape_backwards) {
6043 BUG_ON(writepos < reshape_sectors);
6044 writepos -= reshape_sectors;
6045 readpos += reshape_sectors;
6046 safepos += reshape_sectors;
6048 writepos += reshape_sectors;
6049 /* readpos and safepos are worst-case calculations.
6050 * A negative number is overly pessimistic, and causes
6051 * obvious problems for unsigned storage. So clip to 0.
6053 readpos -= min_t(sector_t, reshape_sectors, readpos);
6054 safepos -= min_t(sector_t, reshape_sectors, safepos);
6057 /* Having calculated the 'writepos' possibly use it
6058 * to set 'stripe_addr' which is where we will write to.
6060 if (mddev->reshape_backwards) {
6061 BUG_ON(conf->reshape_progress == 0);
6062 stripe_addr = writepos;
6063 BUG_ON((mddev->dev_sectors &
6064 ~((sector_t)reshape_sectors - 1))
6065 - reshape_sectors - stripe_addr
6068 BUG_ON(writepos != sector_nr + reshape_sectors);
6069 stripe_addr = sector_nr;
6072 /* 'writepos' is the most advanced device address we might write.
6073 * 'readpos' is the least advanced device address we might read.
6074 * 'safepos' is the least address recorded in the metadata as having
6076 * If there is a min_offset_diff, these are adjusted either by
6077 * increasing the safepos/readpos if diff is negative, or
6078 * increasing writepos if diff is positive.
6079 * If 'readpos' is then behind 'writepos', there is no way that we can
6080 * ensure safety in the face of a crash - that must be done by userspace
6081 * making a backup of the data. So in that case there is no particular
6082 * rush to update metadata.
6083 * Otherwise if 'safepos' is behind 'writepos', then we really need to
6084 * update the metadata to advance 'safepos' to match 'readpos' so that
6085 * we can be safe in the event of a crash.
6086 * So we insist on updating metadata if safepos is behind writepos and
6087 * readpos is beyond writepos.
6088 * In any case, update the metadata every 10 seconds.
6089 * Maybe that number should be configurable, but I'm not sure it is
6090 * worth it.... maybe it could be a multiple of safemode_delay???
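/*
 * Condensed, non-authoritative restatement of the checkpoint rule applied
 * just below, for the forward-reshape case:
 *
 *	checkpoint = (safepos < writepos && readpos > writepos) ||
 *		     time_after(jiffies, reshape_checkpoint + 10*HZ);
 *
 * i.e. write the superblock when 'safepos' has fallen behind 'writepos'
 * while the source data can still be re-read (readpos still ahead of
 * writepos); if readpos has already fallen behind, only a userspace
 * backup can help, so there is no rush; and in any case checkpoint at
 * least every ten seconds.
 */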
6092 if (conf->min_offset_diff < 0) {
6093 safepos += -conf->min_offset_diff;
6094 readpos += -conf->min_offset_diff;
6096 writepos += conf->min_offset_diff;
6098 if ((mddev->reshape_backwards
6099 ? (safepos > writepos && readpos < writepos)
6100 : (safepos < writepos && readpos > writepos)) ||
6101 time_after(jiffies, conf->reshape_checkpoint + 10*HZ)) {
6102 /* Cannot proceed until we've updated the superblock... */
6103 wait_event(conf->wait_for_overlap,
6104 atomic_read(&conf->reshape_stripes)==0
6105 || test_bit(MD_RECOVERY_INTR, &mddev->recovery));
6106 if (atomic_read(&conf->reshape_stripes) != 0)
6108 mddev->reshape_position = conf->reshape_progress;
6109 mddev->curr_resync_completed = sector_nr;
6110 if (!mddev->reshape_backwards)
6111 /* Can update recovery_offset */
6112 rdev_for_each(rdev, mddev)
6113 if (rdev->raid_disk >= 0 &&
6114 !test_bit(Journal, &rdev->flags) &&
6115 !test_bit(In_sync, &rdev->flags) &&
6116 rdev->recovery_offset < sector_nr)
6117 rdev->recovery_offset = sector_nr;
6119 conf->reshape_checkpoint = jiffies;
6120 set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags);
6121 md_wakeup_thread(mddev->thread);
6122 wait_event(mddev->sb_wait, mddev->sb_flags == 0 ||
6123 test_bit(MD_RECOVERY_INTR, &mddev->recovery));
6124 if (test_bit(MD_RECOVERY_INTR, &mddev->recovery))
6126 spin_lock_irq(&conf->device_lock);
6127 conf->reshape_safe = mddev->reshape_position;
6128 spin_unlock_irq(&conf->device_lock);
6129 wake_up(&conf->wait_for_overlap);
6130 sysfs_notify_dirent_safe(mddev->sysfs_completed);
6133 INIT_LIST_HEAD(&stripes);
6134 for (i = 0; i < reshape_sectors; i += RAID5_STRIPE_SECTORS(conf)) {
6136 int skipped_disk = 0;
6137 sh = raid5_get_active_stripe(conf, stripe_addr+i, 0, 0, 1);
6138 set_bit(STRIPE_EXPANDING, &sh->state);
6139 atomic_inc(&conf->reshape_stripes);
6140 /* If any of this stripe is beyond the end of the old
6141 * array, then we need to zero those blocks
6143 for (j=sh->disks; j--;) {
6145 if (j == sh->pd_idx)
6147 if (conf->level == 6 &&
6150 s = raid5_compute_blocknr(sh, j, 0);
6151 if (s < raid5_size(mddev, 0, 0)) {
6155 memset(page_address(sh->dev[j].page), 0, RAID5_STRIPE_SIZE(conf));
6156 set_bit(R5_Expanded, &sh->dev[j].flags);
6157 set_bit(R5_UPTODATE, &sh->dev[j].flags);
6159 if (!skipped_disk) {
6160 set_bit(STRIPE_EXPAND_READY, &sh->state);
6161 set_bit(STRIPE_HANDLE, &sh->state);
6163 list_add(&sh->lru, &stripes);
6165 spin_lock_irq(&conf->device_lock);
6166 if (mddev->reshape_backwards)
6167 conf->reshape_progress -= reshape_sectors * new_data_disks;
6169 conf->reshape_progress += reshape_sectors * new_data_disks;
6170 spin_unlock_irq(&conf->device_lock);
6171 /* Ok, those stripes are ready. We can start scheduling
6172 * reads on the source stripes.
6173 * The source stripes are determined by mapping the first and last
6174 * block on the destination stripes.
6177 raid5_compute_sector(conf, stripe_addr*(new_data_disks),
6180 raid5_compute_sector(conf, ((stripe_addr+reshape_sectors)
6181 * new_data_disks - 1),
6183 if (last_sector >= mddev->dev_sectors)
6184 last_sector = mddev->dev_sectors - 1;
6185 while (first_sector <= last_sector) {
6186 sh = raid5_get_active_stripe(conf, first_sector, 1, 0, 1);
6187 set_bit(STRIPE_EXPAND_SOURCE, &sh->state);
6188 set_bit(STRIPE_HANDLE, &sh->state);
6189 raid5_release_stripe(sh);
6190 first_sector += RAID5_STRIPE_SECTORS(conf);
6192 /* Now that the sources are clearly marked, we can release
6193 * the destination stripes
6195 while (!list_empty(&stripes)) {
6196 sh = list_entry(stripes.next, struct stripe_head, lru);
6197 list_del_init(&sh->lru);
6198 raid5_release_stripe(sh);
6200 /* If this takes us to the resync_max point where we have to pause,
6201 * then we need to write out the superblock.
6203 sector_nr += reshape_sectors;
6204 retn = reshape_sectors;
6206 if (mddev->curr_resync_completed > mddev->resync_max ||
6207 (sector_nr - mddev->curr_resync_completed) * 2
6208 >= mddev->resync_max - mddev->curr_resync_completed) {
6209 /* Cannot proceed until we've updated the superblock... */
6210 wait_event(conf->wait_for_overlap,
6211 atomic_read(&conf->reshape_stripes) == 0
6212 || test_bit(MD_RECOVERY_INTR, &mddev->recovery));
6213 if (atomic_read(&conf->reshape_stripes) != 0)
6215 mddev->reshape_position = conf->reshape_progress;
6216 mddev->curr_resync_completed = sector_nr;
6217 if (!mddev->reshape_backwards)
6218 /* Can update recovery_offset */
6219 rdev_for_each(rdev, mddev)
6220 if (rdev->raid_disk >= 0 &&
6221 !test_bit(Journal, &rdev->flags) &&
6222 !test_bit(In_sync, &rdev->flags) &&
6223 rdev->recovery_offset < sector_nr)
6224 rdev->recovery_offset = sector_nr;
6225 conf->reshape_checkpoint = jiffies;
6226 set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags);
6227 md_wakeup_thread(mddev->thread);
6228 wait_event(mddev->sb_wait,
6229 !test_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags)
6230 || test_bit(MD_RECOVERY_INTR, &mddev->recovery));
6231 if (test_bit(MD_RECOVERY_INTR, &mddev->recovery))
6233 spin_lock_irq(&conf->device_lock);
6234 conf->reshape_safe = mddev->reshape_position;
6235 spin_unlock_irq(&conf->device_lock);
6236 wake_up(&conf->wait_for_overlap);
6237 sysfs_notify_dirent_safe(mddev->sysfs_completed);
6243 static inline sector_t raid5_sync_request(struct mddev *mddev, sector_t sector_nr,
6246 struct r5conf *conf = mddev->private;
6247 struct stripe_head *sh;
6248 sector_t max_sector = mddev->dev_sectors;
6249 sector_t sync_blocks;
6250 int still_degraded = 0;
6253 if (sector_nr >= max_sector) {
6254 /* just being told to finish up .. nothing much to do */
6256 if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)) {
6261 if (mddev->curr_resync < max_sector) /* aborted */
6262 md_bitmap_end_sync(mddev->bitmap, mddev->curr_resync,
6264 else /* completed sync */
6266 md_bitmap_close_sync(mddev->bitmap);
6271 /* Allow raid5_quiesce to complete */
6272 wait_event(conf->wait_for_overlap, conf->quiesce != 2);
6274 if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery))
6275 return reshape_request(mddev, sector_nr, skipped);
6277 /* No need to check resync_max as we never do more than one
6278 * stripe, and as resync_max will always be on a chunk boundary,
6279 * if the check in md_do_sync didn't fire, there is no chance
6280 * of overstepping resync_max here
6283 /* if there are too many failed drives and we are trying
6284 * to resync, then assert that we are finished, because there is
6285 * nothing we can do.
6287 if (mddev->degraded >= conf->max_degraded &&
6288 test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) {
6289 sector_t rv = mddev->dev_sectors - sector_nr;
6293 if (!test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery) &&
6295 !md_bitmap_start_sync(mddev->bitmap, sector_nr, &sync_blocks, 1) &&
6296 sync_blocks >= RAID5_STRIPE_SECTORS(conf)) {
6297 /* we can skip this block, and probably more */
6298 do_div(sync_blocks, RAID5_STRIPE_SECTORS(conf));
6300 /* keep things rounded to whole stripes */
6301 return sync_blocks * RAID5_STRIPE_SECTORS(conf);
6304 md_bitmap_cond_end_sync(mddev->bitmap, sector_nr, false);
6306 sh = raid5_get_active_stripe(conf, sector_nr, 0, 1, 0);
6308 sh = raid5_get_active_stripe(conf, sector_nr, 0, 0, 0);
6309 /* make sure we don't swamp the stripe cache if someone else
6310 * is trying to get access
6312 schedule_timeout_uninterruptible(1);
6314 /* Need to check if array will still be degraded after recovery/resync
6315 * Note that with more than one drive failure we may be rebuilding
6316 * one drive while leaving another faulty drive in the array.
6319 for (i = 0; i < conf->raid_disks; i++) {
6320 struct md_rdev *rdev = rcu_dereference(conf->disks[i].rdev);
6322 if (rdev == NULL || test_bit(Faulty, &rdev->flags))
6327 md_bitmap_start_sync(mddev->bitmap, sector_nr, &sync_blocks, still_degraded);
6329 set_bit(STRIPE_SYNC_REQUESTED, &sh->state);
6330 set_bit(STRIPE_HANDLE, &sh->state);
6332 raid5_release_stripe(sh);
6334 return RAID5_STRIPE_SECTORS(conf);
6337 static int retry_aligned_read(struct r5conf *conf, struct bio *raid_bio,
6338 unsigned int offset)
6340 /* We may not be able to submit a whole bio at once as there
6341 * may not be enough stripe_heads available.
6342 * We cannot pre-allocate enough stripe_heads as we may need
6343 * more than exist in the cache (if we allow ever larger chunks).
6344 * So we do one stripe head at a time and record in
6345 * ->bi_hw_segments how many have been done.
6347 * We *know* that this entire raid_bio is in one chunk, so
6348 * it will use only one 'dd_idx' and need only one call to raid5_compute_sector.
6350 struct stripe_head *sh;
6352 sector_t sector, logical_sector, last_sector;
6356 logical_sector = raid_bio->bi_iter.bi_sector &
6357 ~((sector_t)RAID5_STRIPE_SECTORS(conf)-1);
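/*
 * Worked example (assuming the default 4 KiB stripe size): with
 * RAID5_STRIPE_SECTORS() == 8, a bi_sector of 1234 rounds down to
 * 1232, the first sector of the stripe that contains it.
 */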
6358 sector = raid5_compute_sector(conf, logical_sector,
6360 last_sector = bio_end_sector(raid_bio);
6362 for (; logical_sector < last_sector;
6363 logical_sector += RAID5_STRIPE_SECTORS(conf),
6364 sector += RAID5_STRIPE_SECTORS(conf),
6368 /* already done this stripe */
6371 sh = raid5_get_active_stripe(conf, sector, 0, 1, 1);
6374 /* failed to get a stripe - must wait */
6375 conf->retry_read_aligned = raid_bio;
6376 conf->retry_read_offset = scnt;
6380 if (!add_stripe_bio(sh, raid_bio, dd_idx, 0, 0)) {
6381 raid5_release_stripe(sh);
6382 conf->retry_read_aligned = raid_bio;
6383 conf->retry_read_offset = scnt;
6387 set_bit(R5_ReadNoMerge, &sh->dev[dd_idx].flags);
6389 raid5_release_stripe(sh);
6393 bio_endio(raid_bio);
6395 if (atomic_dec_and_test(&conf->active_aligned_reads))
6396 wake_up(&conf->wait_for_quiescent);
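/*
 * handle_active_stripes() - take up to MAX_STRIPE_BATCH stripes off
 * the priority lists and handle them.
 *
 * Called with ->device_lock held. The lock is dropped while inactive
 * stripes are returned to the per-hash lists and while handle_stripe()
 * runs on each batched stripe, then re-taken before the stripes are
 * released and the batch size is returned to the caller.
 */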
6400 static int handle_active_stripes(struct r5conf *conf, int group,
6401 struct r5worker *worker,
6402 struct list_head *temp_inactive_list)
6403 __must_hold(&conf->device_lock)
6405 struct stripe_head *batch[MAX_STRIPE_BATCH], *sh;
6406 int i, batch_size = 0, hash;
6407 bool release_inactive = false;
6409 while (batch_size < MAX_STRIPE_BATCH &&
6410 (sh = __get_priority_stripe(conf, group)) != NULL)
6411 batch[batch_size++] = sh;
6413 if (batch_size == 0) {
6414 for (i = 0; i < NR_STRIPE_HASH_LOCKS; i++)
6415 if (!list_empty(temp_inactive_list + i))
6417 if (i == NR_STRIPE_HASH_LOCKS) {
6418 spin_unlock_irq(&conf->device_lock);
6419 log_flush_stripe_to_raid(conf);
6420 spin_lock_irq(&conf->device_lock);
6423 release_inactive = true;
6425 spin_unlock_irq(&conf->device_lock);
6427 release_inactive_stripe_list(conf, temp_inactive_list,
6428 NR_STRIPE_HASH_LOCKS);
6430 r5l_flush_stripe_to_raid(conf->log);
6431 if (release_inactive) {
6432 spin_lock_irq(&conf->device_lock);
6436 for (i = 0; i < batch_size; i++)
6437 handle_stripe(batch[i]);
6438 log_write_stripe_run(conf);
6442 spin_lock_irq(&conf->device_lock);
6443 for (i = 0; i < batch_size; i++) {
6444 hash = batch[i]->hash_lock_index;
6445 __release_stripe(conf, batch[i], &temp_inactive_list[hash]);
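/*
 * raid5_do_work() - body of one auxiliary stripe-handling worker.
 *
 * Mirrors the main raid5d() loop below but is restricted to its own
 * worker group: under ->device_lock it alternates release_stripe_list()
 * and handle_active_stripes() until neither produces any work, then
 * flushes deferred bios and pending async_tx operations.
 */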
6450 static void raid5_do_work(struct work_struct *work)
6452 struct r5worker *worker = container_of(work, struct r5worker, work);
6453 struct r5worker_group *group = worker->group;
6454 struct r5conf *conf = group->conf;
6455 struct mddev *mddev = conf->mddev;
6456 int group_id = group - conf->worker_groups;
6458 struct blk_plug plug;
6460 pr_debug("+++ raid5worker active\n");
6462 blk_start_plug(&plug);
6464 spin_lock_irq(&conf->device_lock);
6466 int batch_size, released;
6468 released = release_stripe_list(conf, worker->temp_inactive_list);
6470 batch_size = handle_active_stripes(conf, group_id, worker,
6471 worker->temp_inactive_list);
6472 worker->working = false;
6473 if (!batch_size && !released)
6475 handled += batch_size;
6476 wait_event_lock_irq(mddev->sb_wait,
6477 !test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags),
6480 pr_debug("%d stripes handled\n", handled);
6482 spin_unlock_irq(&conf->device_lock);
6484 flush_deferred_bios(conf);
6486 r5l_flush_stripe_to_raid(conf->log);
6488 async_tx_issue_pending_all();
6489 blk_finish_plug(&plug);
6491 pr_debug("--- raid5worker inactive\n");
6495 * This is our raid5 kernel thread.
6497 * We scan the hash table for stripes which can be handled now.
6498 * During the scan, completed stripes are saved for us by the interrupt
6499 * handler, so that they will not have to wait for our next wakeup.
6501 static void raid5d(struct md_thread *thread)
6503 struct mddev *mddev = thread->mddev;
6504 struct r5conf *conf = mddev->private;
6506 struct blk_plug plug;
6508 pr_debug("+++ raid5d active\n");
6510 md_check_recovery(mddev);
6512 blk_start_plug(&plug);
6514 spin_lock_irq(&conf->device_lock);
6517 int batch_size, released;
6518 unsigned int offset;
6520 released = release_stripe_list(conf, conf->temp_inactive_list);
6522 clear_bit(R5_DID_ALLOC, &conf->cache_state);
6525 !list_empty(&conf->bitmap_list)) {
6526 /* Now is a good time to flush some bitmap updates */
6528 spin_unlock_irq(&conf->device_lock);
6529 md_bitmap_unplug(mddev->bitmap);
6530 spin_lock_irq(&conf->device_lock);
6531 conf->seq_write = conf->seq_flush;
6532 activate_bit_delay(conf, conf->temp_inactive_list);
6534 raid5_activate_delayed(conf);
6536 while ((bio = remove_bio_from_retry(conf, &offset))) {
6538 spin_unlock_irq(&conf->device_lock);
6539 ok = retry_aligned_read(conf, bio, offset);
6540 spin_lock_irq(&conf->device_lock);
6546 batch_size = handle_active_stripes(conf, ANY_GROUP, NULL,
6547 conf->temp_inactive_list);
6548 if (!batch_size && !released)
6550 handled += batch_size;
6552 if (mddev->sb_flags & ~(1 << MD_SB_CHANGE_PENDING)) {
6553 spin_unlock_irq(&conf->device_lock);
6554 md_check_recovery(mddev);
6555 spin_lock_irq(&conf->device_lock);
6558 pr_debug("%d stripes handled\n", handled);
6560 spin_unlock_irq(&conf->device_lock);
6561 if (test_and_clear_bit(R5_ALLOC_MORE, &conf->cache_state) &&
6562 mutex_trylock(&conf->cache_size_mutex)) {
6563 grow_one_stripe(conf, __GFP_NOWARN);
6564 /* Set flag even if allocation failed. This helps
6565 * slow down allocation requests when mem is short
6567 set_bit(R5_DID_ALLOC, &conf->cache_state);
6568 mutex_unlock(&conf->cache_size_mutex);
6571 flush_deferred_bios(conf);
6573 r5l_flush_stripe_to_raid(conf->log);
6575 async_tx_issue_pending_all();
6576 blk_finish_plug(&plug);
6578 pr_debug("--- raid5d inactive\n");
6582 raid5_show_stripe_cache_size(struct mddev *mddev, char *page)
6584 struct r5conf *conf;
6586 spin_lock(&mddev->lock);
6587 conf = mddev->private;
6589 ret = sprintf(page, "%d\n", conf->min_nr_stripes);
6590 spin_unlock(&mddev->lock);
6595 raid5_set_cache_size(struct mddev *mddev, int size)
6598 struct r5conf *conf = mddev->private;
6600 if (size <= 16 || size > 32768)
6603 conf->min_nr_stripes = size;
6604 mutex_lock(&conf->cache_size_mutex);
6605 while (size < conf->max_nr_stripes &&
6606 drop_one_stripe(conf))
6608 mutex_unlock(&conf->cache_size_mutex);
6610 md_allow_write(mddev);
6612 mutex_lock(&conf->cache_size_mutex);
6613 while (size > conf->max_nr_stripes)
6614 if (!grow_one_stripe(conf, GFP_KERNEL)) {
6615 conf->min_nr_stripes = conf->max_nr_stripes;
6619 mutex_unlock(&conf->cache_size_mutex);
6623 EXPORT_SYMBOL(raid5_set_cache_size);
6626 raid5_store_stripe_cache_size(struct mddev *mddev, const char *page, size_t len)
6628 struct r5conf *conf;
6632 if (len >= PAGE_SIZE)
6634 if (kstrtoul(page, 10, &new))
6636 err = mddev_lock(mddev);
6639 conf = mddev->private;
6643 err = raid5_set_cache_size(mddev, new);
6644 mddev_unlock(mddev);
6649 static struct md_sysfs_entry
6650 raid5_stripecache_size = __ATTR(stripe_cache_size, S_IRUGO | S_IWUSR,
6651 raid5_show_stripe_cache_size,
6652 raid5_store_stripe_cache_size);
6655 raid5_show_rmw_level(struct mddev *mddev, char *page)
6657 struct r5conf *conf = mddev->private;
6659 return sprintf(page, "%d\n", conf->rmw_level);
6665 raid5_store_rmw_level(struct mddev *mddev, const char *page, size_t len)
6667 struct r5conf *conf = mddev->private;
6673 if (len >= PAGE_SIZE)
6676 if (kstrtoul(page, 10, &new))
6679 if (new != PARITY_DISABLE_RMW && !raid6_call.xor_syndrome)
6682 if (new != PARITY_DISABLE_RMW &&
6683 new != PARITY_ENABLE_RMW &&
6684 new != PARITY_PREFER_RMW)
6687 conf->rmw_level = new;
6691 static struct md_sysfs_entry
6692 raid5_rmw_level = __ATTR(rmw_level, S_IRUGO | S_IWUSR,
6693 raid5_show_rmw_level,
6694 raid5_store_rmw_level);
6697 raid5_show_stripe_size(struct mddev *mddev, char *page)
6699 struct r5conf *conf;
6702 spin_lock(&mddev->lock);
6703 conf = mddev->private;
6705 ret = sprintf(page, "%lu\n", RAID5_STRIPE_SIZE(conf));
6706 spin_unlock(&mddev->lock);
6710 #if PAGE_SIZE != DEFAULT_STRIPE_SIZE
6712 raid5_store_stripe_size(struct mddev *mddev, const char *page, size_t len)
6714 struct r5conf *conf;
6719 if (len >= PAGE_SIZE)
6721 if (kstrtoul(page, 10, &new))
6725 * The value must not be bigger than PAGE_SIZE. It must be a
6726 * multiple of DEFAULT_STRIPE_SIZE and a power of two.
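 * For example, on an architecture with 64 KiB pages (and the usual
 * DEFAULT_STRIPE_SIZE of 4096) the acceptable values are 4096, 8192,
 * 16384, 32768 and 65536.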
6729 if (new % DEFAULT_STRIPE_SIZE != 0 ||
6730 new > PAGE_SIZE || new == 0 ||
6731 new != roundup_pow_of_two(new))
6734 err = mddev_lock(mddev);
6738 conf = mddev->private;
6744 if (new == conf->stripe_size)
6747 pr_debug("md/raid: change stripe_size from %lu to %lu\n",
6748 conf->stripe_size, new);
6750 if (mddev->sync_thread ||
6751 test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) ||
6752 mddev->reshape_position != MaxSector ||
6753 mddev->sysfs_active) {
6758 mddev_suspend(mddev);
6759 mutex_lock(&conf->cache_size_mutex);
6760 size = conf->max_nr_stripes;
6762 shrink_stripes(conf);
6764 conf->stripe_size = new;
6765 conf->stripe_shift = ilog2(new) - 9;
6766 conf->stripe_sectors = new >> 9;
6767 if (grow_stripes(conf, size)) {
6768 pr_warn("md/raid:%s: couldn't allocate buffers\n",
6772 mutex_unlock(&conf->cache_size_mutex);
6773 mddev_resume(mddev);
6776 mddev_unlock(mddev);
6780 static struct md_sysfs_entry
6781 raid5_stripe_size = __ATTR(stripe_size, 0644,
6782 raid5_show_stripe_size,
6783 raid5_store_stripe_size);
6785 static struct md_sysfs_entry
6786 raid5_stripe_size = __ATTR(stripe_size, 0444,
6787 raid5_show_stripe_size,
6792 raid5_show_preread_threshold(struct mddev *mddev, char *page)
6794 struct r5conf *conf;
6796 spin_lock(&mddev->lock);
6797 conf = mddev->private;
6799 ret = sprintf(page, "%d\n", conf->bypass_threshold);
6800 spin_unlock(&mddev->lock);
6805 raid5_store_preread_threshold(struct mddev *mddev, const char *page, size_t len)
6807 struct r5conf *conf;
6811 if (len >= PAGE_SIZE)
6813 if (kstrtoul(page, 10, &new))
6816 err = mddev_lock(mddev);
6819 conf = mddev->private;
6822 else if (new > conf->min_nr_stripes)
6825 conf->bypass_threshold = new;
6826 mddev_unlock(mddev);
6830 static struct md_sysfs_entry
6831 raid5_preread_bypass_threshold = __ATTR(preread_bypass_threshold,
6833 raid5_show_preread_threshold,
6834 raid5_store_preread_threshold);
6837 raid5_show_skip_copy(struct mddev *mddev, char *page)
6839 struct r5conf *conf;
6841 spin_lock(&mddev->lock);
6842 conf = mddev->private;
6844 ret = sprintf(page, "%d\n", conf->skip_copy);
6845 spin_unlock(&mddev->lock);
6850 raid5_store_skip_copy(struct mddev *mddev, const char *page, size_t len)
6852 struct r5conf *conf;
6856 if (len >= PAGE_SIZE)
6858 if (kstrtoul(page, 10, &new))
6862 err = mddev_lock(mddev);
6865 conf = mddev->private;
6868 else if (new != conf->skip_copy) {
6869 struct request_queue *q = mddev->queue;
6871 mddev_suspend(mddev);
6872 conf->skip_copy = new;
6874 blk_queue_flag_set(QUEUE_FLAG_STABLE_WRITES, q);
6876 blk_queue_flag_clear(QUEUE_FLAG_STABLE_WRITES, q);
6877 mddev_resume(mddev);
6879 mddev_unlock(mddev);
6883 static struct md_sysfs_entry
6884 raid5_skip_copy = __ATTR(skip_copy, S_IRUGO | S_IWUSR,
6885 raid5_show_skip_copy,
6886 raid5_store_skip_copy);
6889 stripe_cache_active_show(struct mddev *mddev, char *page)
6891 struct r5conf *conf = mddev->private;
6893 return sprintf(page, "%d\n", atomic_read(&conf->active_stripes));
6898 static struct md_sysfs_entry
6899 raid5_stripecache_active = __ATTR_RO(stripe_cache_active);
6902 raid5_show_group_thread_cnt(struct mddev *mddev, char *page)
6904 struct r5conf *conf;
6906 spin_lock(&mddev->lock);
6907 conf = mddev->private;
6909 ret = sprintf(page, "%d\n", conf->worker_cnt_per_group);
6910 spin_unlock(&mddev->lock);
6914 static int alloc_thread_groups(struct r5conf *conf, int cnt,
6916 struct r5worker_group **worker_groups);
6918 raid5_store_group_thread_cnt(struct mddev *mddev, const char *page, size_t len)
6920 struct r5conf *conf;
6923 struct r5worker_group *new_groups, *old_groups;
6926 if (len >= PAGE_SIZE)
6928 if (kstrtouint(page, 10, &new))
6930 /* 8192 should be big enough */
6934 err = mddev_lock(mddev);
6937 conf = mddev->private;
6940 else if (new != conf->worker_cnt_per_group) {
6941 mddev_suspend(mddev);
6943 old_groups = conf->worker_groups;
6945 flush_workqueue(raid5_wq);
6947 err = alloc_thread_groups(conf, new, &group_cnt, &new_groups);
6949 spin_lock_irq(&conf->device_lock);
6950 conf->group_cnt = group_cnt;
6951 conf->worker_cnt_per_group = new;
6952 conf->worker_groups = new_groups;
6953 spin_unlock_irq(&conf->device_lock);
6956 kfree(old_groups[0].workers);
6959 mddev_resume(mddev);
6961 mddev_unlock(mddev);
6966 static struct md_sysfs_entry
6967 raid5_group_thread_cnt = __ATTR(group_thread_cnt, S_IRUGO | S_IWUSR,
6968 raid5_show_group_thread_cnt,
6969 raid5_store_group_thread_cnt);
6971 static struct attribute *raid5_attrs[] = {
6972 &raid5_stripecache_size.attr,
6973 &raid5_stripecache_active.attr,
6974 &raid5_preread_bypass_threshold.attr,
6975 &raid5_group_thread_cnt.attr,
6976 &raid5_skip_copy.attr,
6977 &raid5_rmw_level.attr,
6978 &raid5_stripe_size.attr,
6979 &r5c_journal_mode.attr,
6980 &ppl_write_hint.attr,
6983 static const struct attribute_group raid5_attrs_group = {
6985 .attrs = raid5_attrs,
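/*
 * alloc_thread_groups() - allocate the worker threads used when
 * group_thread_cnt is non-zero.
 *
 * One r5worker_group is set up for every possible NUMA node, each with
 * @cnt workers and per-worker temp_inactive_list heads. On success
 * *group_cnt and *worker_groups are filled in; on allocation failure
 * everything is freed again and an error is returned.
 */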
6988 static int alloc_thread_groups(struct r5conf *conf, int cnt, int *group_cnt,
6989 struct r5worker_group **worker_groups)
6993 struct r5worker *workers;
6997 *worker_groups = NULL;
7000 *group_cnt = num_possible_nodes();
7001 size = sizeof(struct r5worker) * cnt;
7002 workers = kcalloc(size, *group_cnt, GFP_NOIO);
7003 *worker_groups = kcalloc(*group_cnt, sizeof(struct r5worker_group),
7005 if (!*worker_groups || !workers) {
7007 kfree(*worker_groups);
7011 for (i = 0; i < *group_cnt; i++) {
7012 struct r5worker_group *group;
7014 group = &(*worker_groups)[i];
7015 INIT_LIST_HEAD(&group->handle_list);
7016 INIT_LIST_HEAD(&group->loprio_list);
7018 group->workers = workers + i * cnt;
7020 for (j = 0; j < cnt; j++) {
7021 struct r5worker *worker = group->workers + j;
7022 worker->group = group;
7023 INIT_WORK(&worker->work, raid5_do_work);
7025 for (k = 0; k < NR_STRIPE_HASH_LOCKS; k++)
7026 INIT_LIST_HEAD(worker->temp_inactive_list + k);
7033 static void free_thread_groups(struct r5conf *conf)
7035 if (conf->worker_groups)
7036 kfree(conf->worker_groups[0].workers);
7037 kfree(conf->worker_groups);
7038 conf->worker_groups = NULL;
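/*
 * raid5_size() - usable capacity of the array in sectors.
 *
 * Passing 0 for @sectors or @raid_disks means "use the current
 * geometry". The per-device size is rounded down to a multiple of both
 * the old and the new chunk size, then multiplied by the number of
 * data disks. For example, five 1 TiB devices in RAID-6
 * (max_degraded == 2) yield roughly 3 TiB of usable space.
 */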
7042 raid5_size(struct mddev *mddev, sector_t sectors, int raid_disks)
7044 struct r5conf *conf = mddev->private;
7047 sectors = mddev->dev_sectors;
7049 /* size is defined by the smallest of previous and new size */
7050 raid_disks = min(conf->raid_disks, conf->previous_raid_disks);
7052 sectors &= ~((sector_t)conf->chunk_sectors - 1);
7053 sectors &= ~((sector_t)conf->prev_chunk_sectors - 1);
7054 return sectors * (raid_disks - conf->max_degraded);
7057 static void free_scratch_buffer(struct r5conf *conf, struct raid5_percpu *percpu)
7059 safe_put_page(percpu->spare_page);
7060 percpu->spare_page = NULL;
7061 kvfree(percpu->scribble);
7062 percpu->scribble = NULL;
7065 static int alloc_scratch_buffer(struct r5conf *conf, struct raid5_percpu *percpu)
7067 if (conf->level == 6 && !percpu->spare_page) {
7068 percpu->spare_page = alloc_page(GFP_KERNEL);
7069 if (!percpu->spare_page)
7073 if (scribble_alloc(percpu,
7074 max(conf->raid_disks,
7075 conf->previous_raid_disks),
7076 max(conf->chunk_sectors,
7077 conf->prev_chunk_sectors)
7078 / RAID5_STRIPE_SECTORS(conf))) {
7079 free_scratch_buffer(conf, percpu);
7083 local_lock_init(&percpu->lock);
7087 static int raid456_cpu_dead(unsigned int cpu, struct hlist_node *node)
7089 struct r5conf *conf = hlist_entry_safe(node, struct r5conf, node);
7091 free_scratch_buffer(conf, per_cpu_ptr(conf->percpu, cpu));
7095 static void raid5_free_percpu(struct r5conf *conf)
7100 cpuhp_state_remove_instance(CPUHP_MD_RAID5_PREPARE, &conf->node);
7101 free_percpu(conf->percpu);
7104 static void free_conf(struct r5conf *conf)
7110 unregister_shrinker(&conf->shrinker);
7111 free_thread_groups(conf);
7112 shrink_stripes(conf);
7113 raid5_free_percpu(conf);
7114 for (i = 0; i < conf->pool_size; i++)
7115 if (conf->disks[i].extra_page)
7116 put_page(conf->disks[i].extra_page);
7118 bioset_exit(&conf->bio_split);
7119 kfree(conf->stripe_hashtbl);
7120 kfree(conf->pending_data);
7124 static int raid456_cpu_up_prepare(unsigned int cpu, struct hlist_node *node)
7126 struct r5conf *conf = hlist_entry_safe(node, struct r5conf, node);
7127 struct raid5_percpu *percpu = per_cpu_ptr(conf->percpu, cpu);
7129 if (alloc_scratch_buffer(conf, percpu)) {
7130 pr_warn("%s: failed memory allocation for cpu%u\n",
7137 static int raid5_alloc_percpu(struct r5conf *conf)
7141 conf->percpu = alloc_percpu(struct raid5_percpu);
7145 err = cpuhp_state_add_instance(CPUHP_MD_RAID5_PREPARE, &conf->node);
7147 conf->scribble_disks = max(conf->raid_disks,
7148 conf->previous_raid_disks);
7149 conf->scribble_sectors = max(conf->chunk_sectors,
7150 conf->prev_chunk_sectors);
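/*
 * Shrinker callbacks for the stripe cache: raid5_cache_scan() drops
 * stripes (never below min_nr_stripes) when cache_size_mutex can be
 * taken without blocking, and raid5_cache_count() reports how many
 * stripes are currently reclaimable.
 */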
7155 static unsigned long raid5_cache_scan(struct shrinker *shrink,
7156 struct shrink_control *sc)
7158 struct r5conf *conf = container_of(shrink, struct r5conf, shrinker);
7159 unsigned long ret = SHRINK_STOP;
7161 if (mutex_trylock(&conf->cache_size_mutex)) {
7163 while (ret < sc->nr_to_scan &&
7164 conf->max_nr_stripes > conf->min_nr_stripes) {
7165 if (drop_one_stripe(conf) == 0) {
7171 mutex_unlock(&conf->cache_size_mutex);
7176 static unsigned long raid5_cache_count(struct shrinker *shrink,
7177 struct shrink_control *sc)
7179 struct r5conf *conf = container_of(shrink, struct r5conf, shrinker);
7181 if (conf->max_nr_stripes < conf->min_nr_stripes)
7182 /* unlikely, but not impossible */
7184 return conf->max_nr_stripes - conf->min_nr_stripes;
7187 static struct r5conf *setup_conf(struct mddev *mddev)
7189 struct r5conf *conf;
7190 int raid_disk, memory, max_disks;
7191 struct md_rdev *rdev;
7192 struct disk_info *disk;
7196 struct r5worker_group *new_group;
7199 if (mddev->new_level != 5
7200 && mddev->new_level != 4
7201 && mddev->new_level != 6) {
7202 pr_warn("md/raid:%s: raid level not set to 4/5/6 (%d)\n",
7203 mdname(mddev), mddev->new_level);
7204 return ERR_PTR(-EIO);
7206 if ((mddev->new_level == 5
7207 && !algorithm_valid_raid5(mddev->new_layout)) ||
7208 (mddev->new_level == 6
7209 && !algorithm_valid_raid6(mddev->new_layout))) {
7210 pr_warn("md/raid:%s: layout %d not supported\n",
7211 mdname(mddev), mddev->new_layout);
7212 return ERR_PTR(-EIO);
7214 if (mddev->new_level == 6 && mddev->raid_disks < 4) {
7215 pr_warn("md/raid:%s: not enough configured devices (%d, minimum 4)\n",
7216 mdname(mddev), mddev->raid_disks);
7217 return ERR_PTR(-EINVAL);
7220 if (!mddev->new_chunk_sectors ||
7221 (mddev->new_chunk_sectors << 9) % PAGE_SIZE ||
7222 !is_power_of_2(mddev->new_chunk_sectors)) {
7223 pr_warn("md/raid:%s: invalid chunk size %d\n",
7224 mdname(mddev), mddev->new_chunk_sectors << 9);
7225 return ERR_PTR(-EINVAL);
7228 conf = kzalloc(sizeof(struct r5conf), GFP_KERNEL);
7232 #if PAGE_SIZE != DEFAULT_STRIPE_SIZE
7233 conf->stripe_size = DEFAULT_STRIPE_SIZE;
7234 conf->stripe_shift = ilog2(DEFAULT_STRIPE_SIZE) - 9;
7235 conf->stripe_sectors = DEFAULT_STRIPE_SIZE >> 9;
7237 INIT_LIST_HEAD(&conf->free_list);
7238 INIT_LIST_HEAD(&conf->pending_list);
7239 conf->pending_data = kcalloc(PENDING_IO_MAX,
7240 sizeof(struct r5pending_data),
7242 if (!conf->pending_data)
7244 for (i = 0; i < PENDING_IO_MAX; i++)
7245 list_add(&conf->pending_data[i].sibling, &conf->free_list);
7246 /* Don't enable multi-threading by default */
7247 if (!alloc_thread_groups(conf, 0, &group_cnt, &new_group)) {
7248 conf->group_cnt = group_cnt;
7249 conf->worker_cnt_per_group = 0;
7250 conf->worker_groups = new_group;
7253 spin_lock_init(&conf->device_lock);
7254 seqcount_spinlock_init(&conf->gen_lock, &conf->device_lock);
7255 mutex_init(&conf->cache_size_mutex);
7257 init_waitqueue_head(&conf->wait_for_quiescent);
7258 init_waitqueue_head(&conf->wait_for_stripe);
7259 init_waitqueue_head(&conf->wait_for_overlap);
7260 INIT_LIST_HEAD(&conf->handle_list);
7261 INIT_LIST_HEAD(&conf->loprio_list);
7262 INIT_LIST_HEAD(&conf->hold_list);
7263 INIT_LIST_HEAD(&conf->delayed_list);
7264 INIT_LIST_HEAD(&conf->bitmap_list);
7265 init_llist_head(&conf->released_stripes);
7266 atomic_set(&conf->active_stripes, 0);
7267 atomic_set(&conf->preread_active_stripes, 0);
7268 atomic_set(&conf->active_aligned_reads, 0);
7269 spin_lock_init(&conf->pending_bios_lock);
7270 conf->batch_bio_dispatch = true;
7271 rdev_for_each(rdev, mddev) {
7272 if (test_bit(Journal, &rdev->flags))
7274 if (bdev_nonrot(rdev->bdev)) {
7275 conf->batch_bio_dispatch = false;
7280 conf->bypass_threshold = BYPASS_THRESHOLD;
7281 conf->recovery_disabled = mddev->recovery_disabled - 1;
7283 conf->raid_disks = mddev->raid_disks;
7284 if (mddev->reshape_position == MaxSector)
7285 conf->previous_raid_disks = mddev->raid_disks;
7287 conf->previous_raid_disks = mddev->raid_disks - mddev->delta_disks;
7288 max_disks = max(conf->raid_disks, conf->previous_raid_disks);
7290 conf->disks = kcalloc(max_disks, sizeof(struct disk_info),
7296 for (i = 0; i < max_disks; i++) {
7297 conf->disks[i].extra_page = alloc_page(GFP_KERNEL);
7298 if (!conf->disks[i].extra_page)
7302 ret = bioset_init(&conf->bio_split, BIO_POOL_SIZE, 0, 0);
7305 conf->mddev = mddev;
7307 if ((conf->stripe_hashtbl = kzalloc(PAGE_SIZE, GFP_KERNEL)) == NULL)
7310 /* We init hash_locks[0] separately so that it can be used
7311 * as the reference lock in the spin_lock_nest_lock() call
7312 * in lock_all_device_hash_locks_irq in order to convince
7313 * lockdep that we know what we are doing.
7315 spin_lock_init(conf->hash_locks);
7316 for (i = 1; i < NR_STRIPE_HASH_LOCKS; i++)
7317 spin_lock_init(conf->hash_locks + i);
7319 for (i = 0; i < NR_STRIPE_HASH_LOCKS; i++)
7320 INIT_LIST_HEAD(conf->inactive_list + i);
7322 for (i = 0; i < NR_STRIPE_HASH_LOCKS; i++)
7323 INIT_LIST_HEAD(conf->temp_inactive_list + i);
7325 atomic_set(&conf->r5c_cached_full_stripes, 0);
7326 INIT_LIST_HEAD(&conf->r5c_full_stripe_list);
7327 atomic_set(&conf->r5c_cached_partial_stripes, 0);
7328 INIT_LIST_HEAD(&conf->r5c_partial_stripe_list);
7329 atomic_set(&conf->r5c_flushing_full_stripes, 0);
7330 atomic_set(&conf->r5c_flushing_partial_stripes, 0);
7332 conf->level = mddev->new_level;
7333 conf->chunk_sectors = mddev->new_chunk_sectors;
7334 ret = raid5_alloc_percpu(conf);
7338 pr_debug("raid456: run(%s) called.\n", mdname(mddev));
7341 rdev_for_each(rdev, mddev) {
7342 raid_disk = rdev->raid_disk;
7343 if (raid_disk >= max_disks
7344 || raid_disk < 0 || test_bit(Journal, &rdev->flags))
7346 disk = conf->disks + raid_disk;
7348 if (test_bit(Replacement, &rdev->flags)) {
7349 if (disk->replacement)
7351 RCU_INIT_POINTER(disk->replacement, rdev);
7355 RCU_INIT_POINTER(disk->rdev, rdev);
7358 if (test_bit(In_sync, &rdev->flags)) {
7359 pr_info("md/raid:%s: device %pg operational as raid disk %d\n",
7360 mdname(mddev), rdev->bdev, raid_disk);
7361 } else if (rdev->saved_raid_disk != raid_disk)
7362 /* Cannot rely on bitmap to complete recovery */
7366 conf->level = mddev->new_level;
7367 if (conf->level == 6) {
7368 conf->max_degraded = 2;
7369 if (raid6_call.xor_syndrome)
7370 conf->rmw_level = PARITY_ENABLE_RMW;
7372 conf->rmw_level = PARITY_DISABLE_RMW;
7374 conf->max_degraded = 1;
7375 conf->rmw_level = PARITY_ENABLE_RMW;
7377 conf->algorithm = mddev->new_layout;
7378 conf->reshape_progress = mddev->reshape_position;
7379 if (conf->reshape_progress != MaxSector) {
7380 conf->prev_chunk_sectors = mddev->chunk_sectors;
7381 conf->prev_algo = mddev->layout;
7383 conf->prev_chunk_sectors = conf->chunk_sectors;
7384 conf->prev_algo = conf->algorithm;
7387 conf->min_nr_stripes = NR_STRIPES;
7388 if (mddev->reshape_position != MaxSector) {
7389 int stripes = max_t(int,
7390 ((mddev->chunk_sectors << 9) / RAID5_STRIPE_SIZE(conf)) * 4,
7391 ((mddev->new_chunk_sectors << 9) / RAID5_STRIPE_SIZE(conf)) * 4);
7392 conf->min_nr_stripes = max(NR_STRIPES, stripes);
7393 if (conf->min_nr_stripes != NR_STRIPES)
7394 pr_info("md/raid:%s: force stripe size %d for reshape\n",
7395 mdname(mddev), conf->min_nr_stripes);
7397 memory = conf->min_nr_stripes * (sizeof(struct stripe_head) +
7398 max_disks * ((sizeof(struct bio) + PAGE_SIZE))) / 1024;
7399 atomic_set(&conf->empty_inactive_list_nr, NR_STRIPE_HASH_LOCKS);
7400 if (grow_stripes(conf, conf->min_nr_stripes)) {
7401 pr_warn("md/raid:%s: couldn't allocate %dkB for buffers\n",
7402 mdname(mddev), memory);
7406 pr_debug("md/raid:%s: allocated %dkB\n", mdname(mddev), memory);
7408 * Losing a stripe head costs more than the time to refill it;
7409 * it also reduces the queue depth and so can hurt throughput.
7410 * So set it rather large, scaled by number of devices.
7412 conf->shrinker.seeks = DEFAULT_SEEKS * conf->raid_disks * 4;
7413 conf->shrinker.scan_objects = raid5_cache_scan;
7414 conf->shrinker.count_objects = raid5_cache_count;
7415 conf->shrinker.batch = 128;
7416 conf->shrinker.flags = 0;
7417 ret = register_shrinker(&conf->shrinker);
7419 pr_warn("md/raid:%s: couldn't register shrinker.\n",
7424 sprintf(pers_name, "raid%d", mddev->new_level);
7425 conf->thread = md_register_thread(raid5d, mddev, pers_name);
7426 if (!conf->thread) {
7427 pr_warn("md/raid:%s: couldn't allocate thread.\n",
7438 return ERR_PTR(ret);
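/*
 * only_parity() - return 1 if position @raid_disk holds only parity
 * blocks under layout @algo (ALGORITHM_PARITY_0/N and the *_6 layouts
 * with a dedicated Q drive). raid5_run() uses this to decide whether a
 * partially recovered device ever stored anything but parity and hence
 * whether the array must be treated as dirty.
 */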
7441 static int only_parity(int raid_disk, int algo, int raid_disks, int max_degraded)
7444 case ALGORITHM_PARITY_0:
7445 if (raid_disk < max_degraded)
7448 case ALGORITHM_PARITY_N:
7449 if (raid_disk >= raid_disks - max_degraded)
7452 case ALGORITHM_PARITY_0_6:
7453 if (raid_disk == 0 ||
7454 raid_disk == raid_disks - 1)
7457 case ALGORITHM_LEFT_ASYMMETRIC_6:
7458 case ALGORITHM_RIGHT_ASYMMETRIC_6:
7459 case ALGORITHM_LEFT_SYMMETRIC_6:
7460 case ALGORITHM_RIGHT_SYMMETRIC_6:
7461 if (raid_disk == raid_disks - 1)
7467 static void raid5_set_io_opt(struct r5conf *conf)
7469 blk_queue_io_opt(conf->mddev->queue, (conf->chunk_sectors << 9) *
7470 (conf->raid_disks - conf->max_degraded));
7473 static int raid5_run(struct mddev *mddev)
7475 struct r5conf *conf;
7476 int working_disks = 0;
7477 int dirty_parity_disks = 0;
7478 struct md_rdev *rdev;
7479 struct md_rdev *journal_dev = NULL;
7480 sector_t reshape_offset = 0;
7482 long long min_offset_diff = 0;
7485 if (acct_bioset_init(mddev)) {
7486 pr_err("md/raid456:%s: alloc acct bioset failed.\n", mdname(mddev));
7490 if (mddev_init_writes_pending(mddev) < 0) {
7495 if (mddev->recovery_cp != MaxSector)
7496 pr_notice("md/raid:%s: not clean -- starting background reconstruction\n",
7499 rdev_for_each(rdev, mddev) {
7502 if (test_bit(Journal, &rdev->flags)) {
7506 if (rdev->raid_disk < 0)
7508 diff = (rdev->new_data_offset - rdev->data_offset);
7510 min_offset_diff = diff;
7512 } else if (mddev->reshape_backwards &&
7513 diff < min_offset_diff)
7514 min_offset_diff = diff;
7515 else if (!mddev->reshape_backwards &&
7516 diff > min_offset_diff)
7517 min_offset_diff = diff;
7520 if ((test_bit(MD_HAS_JOURNAL, &mddev->flags) || journal_dev) &&
7521 (mddev->bitmap_info.offset || mddev->bitmap_info.file)) {
7522 pr_notice("md/raid:%s: array cannot have both journal and bitmap\n",
7528 if (mddev->reshape_position != MaxSector) {
7529 /* Check that we can continue the reshape.
7530 * Difficulties arise if the stripe we would write to
7531 * next is at or after the stripe we would read from next.
7532 * For a reshape that changes the number of devices, this
7533 * is only possible for a very short time, and mdadm makes
7534 * sure that time appears to have past before assembling
7535 * the array. So we fail if that time hasn't passed.
7536 * For a reshape that keeps the number of devices the same
7537 * mdadm must be monitoring the reshape and keeping the
7538 * critical areas read-only and backed up. It will start
7539 * the array in read-only mode, so we check for that.
7541 sector_t here_new, here_old;
7543 int max_degraded = (mddev->level == 6 ? 2 : 1);
7548 pr_warn("md/raid:%s: don't support reshape with journal - aborting.\n",
7554 if (mddev->new_level != mddev->level) {
7555 pr_warn("md/raid:%s: unsupported reshape required - aborting.\n",
7560 old_disks = mddev->raid_disks - mddev->delta_disks;
7561 /* reshape_position must be on a new-stripe boundary, and one
7562 * further up in new geometry must map after here in old
7564 * If the chunk sizes are different, then as we perform reshape
7565 * in units of the largest of the two, reshape_position needs to
7566 * be a multiple of the largest chunk size times new data disks.
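 * For example, a reshape to a 6-device RAID-5 (5 data disks) using
 * 512 KiB (1024-sector) chunks requires reshape_position to be a
 * multiple of 5 * 1024 = 5120 sectors.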
7568 here_new = mddev->reshape_position;
7569 chunk_sectors = max(mddev->chunk_sectors, mddev->new_chunk_sectors);
7570 new_data_disks = mddev->raid_disks - max_degraded;
7571 if (sector_div(here_new, chunk_sectors * new_data_disks)) {
7572 pr_warn("md/raid:%s: reshape_position not on a stripe boundary\n",
7577 reshape_offset = here_new * chunk_sectors;
7578 /* here_new is the stripe we will write to */
7579 here_old = mddev->reshape_position;
7580 sector_div(here_old, chunk_sectors * (old_disks-max_degraded));
7581 /* here_old is the first stripe that we might need to read
7583 if (mddev->delta_disks == 0) {
7584 /* We cannot be sure it is safe to start an in-place
7585 * reshape. It is only safe if user-space is monitoring
7586 * and taking constant backups.
7587 * mdadm always starts a situation like this in
7588 * readonly mode so it can take control before
7589 * allowing any writes. So just check for that.
7591 if (abs(min_offset_diff) >= mddev->chunk_sectors &&
7592 abs(min_offset_diff) >= mddev->new_chunk_sectors)
7593 /* not really in-place - so OK */;
7594 else if (mddev->ro == 0) {
7595 pr_warn("md/raid:%s: in-place reshape must be started in read-only mode - aborting\n",
7600 } else if (mddev->reshape_backwards
7601 ? (here_new * chunk_sectors + min_offset_diff <=
7602 here_old * chunk_sectors)
7603 : (here_new * chunk_sectors >=
7604 here_old * chunk_sectors + (-min_offset_diff))) {
7605 /* Reading from the same stripe as writing to - bad */
7606 pr_warn("md/raid:%s: reshape_position too early for auto-recovery - aborting.\n",
7611 pr_debug("md/raid:%s: reshape will continue\n", mdname(mddev));
7612 /* OK, we should be able to continue; */
7614 BUG_ON(mddev->level != mddev->new_level);
7615 BUG_ON(mddev->layout != mddev->new_layout);
7616 BUG_ON(mddev->chunk_sectors != mddev->new_chunk_sectors);
7617 BUG_ON(mddev->delta_disks != 0);
7620 if (test_bit(MD_HAS_JOURNAL, &mddev->flags) &&
7621 test_bit(MD_HAS_PPL, &mddev->flags)) {
7622 pr_warn("md/raid:%s: using journal device and PPL not allowed - disabling PPL\n",
7624 clear_bit(MD_HAS_PPL, &mddev->flags);
7625 clear_bit(MD_HAS_MULTIPLE_PPLS, &mddev->flags);
7628 if (mddev->private == NULL)
7629 conf = setup_conf(mddev);
7631 conf = mddev->private;
7634 ret = PTR_ERR(conf);
7638 if (test_bit(MD_HAS_JOURNAL, &mddev->flags)) {
7640 pr_warn("md/raid:%s: journal disk is missing, force array readonly\n",
7643 set_disk_ro(mddev->gendisk, 1);
7644 } else if (mddev->recovery_cp == MaxSector)
7645 set_bit(MD_JOURNAL_CLEAN, &mddev->flags);
7648 conf->min_offset_diff = min_offset_diff;
7649 mddev->thread = conf->thread;
7650 conf->thread = NULL;
7651 mddev->private = conf;
7653 for (i = 0; i < conf->raid_disks && conf->previous_raid_disks;
7655 rdev = rdev_mdlock_deref(mddev, conf->disks[i].rdev);
7656 if (!rdev && conf->disks[i].replacement) {
7657 /* The replacement is all we have yet */
7658 rdev = rdev_mdlock_deref(mddev,
7659 conf->disks[i].replacement);
7660 conf->disks[i].replacement = NULL;
7661 clear_bit(Replacement, &rdev->flags);
7662 rcu_assign_pointer(conf->disks[i].rdev, rdev);
7666 if (rcu_access_pointer(conf->disks[i].replacement) &&
7667 conf->reshape_progress != MaxSector) {
7668 /* replacements and reshape simply do not mix. */
7669 pr_warn("md: cannot handle concurrent replacement and reshape.\n");
7672 if (test_bit(In_sync, &rdev->flags)) {
7676 /* This disk is not fully in-sync. However if it
7677 * just stored parity (beyond the recovery_offset),
7678 * then we don't need to be concerned about the
7679 * array being dirty.
7680 * When reshape goes 'backwards', we never have
7681 * partially completed devices, so we only need
7682 * to worry about reshape going forwards.
7684 /* Hack because v0.91 doesn't store recovery_offset properly. */
7685 if (mddev->major_version == 0 &&
7686 mddev->minor_version > 90)
7687 rdev->recovery_offset = reshape_offset;
7689 if (rdev->recovery_offset < reshape_offset) {
7690 /* We need to check old and new layout */
7691 if (!only_parity(rdev->raid_disk,
7694 conf->max_degraded))
7697 if (!only_parity(rdev->raid_disk,
7699 conf->previous_raid_disks,
7700 conf->max_degraded))
7702 dirty_parity_disks++;
7706 * 0 for a fully functional array, 1 or 2 for a degraded array.
7708 mddev->degraded = raid5_calc_degraded(conf);
7710 if (has_failed(conf)) {
7711 pr_crit("md/raid:%s: not enough operational devices (%d/%d failed)\n",
7712 mdname(mddev), mddev->degraded, conf->raid_disks);
7716 /* device size must be a multiple of chunk size */
7717 mddev->dev_sectors &= ~((sector_t)mddev->chunk_sectors - 1);
7718 mddev->resync_max_sectors = mddev->dev_sectors;
7720 if (mddev->degraded > dirty_parity_disks &&
7721 mddev->recovery_cp != MaxSector) {
7722 if (test_bit(MD_HAS_PPL, &mddev->flags))
7723 pr_crit("md/raid:%s: starting dirty degraded array with PPL.\n",
7725 else if (mddev->ok_start_degraded)
7726 pr_crit("md/raid:%s: starting dirty degraded array - data corruption possible.\n",
7729 pr_crit("md/raid:%s: cannot start dirty degraded array.\n",
7735 pr_info("md/raid:%s: raid level %d active with %d out of %d devices, algorithm %d\n",
7736 mdname(mddev), conf->level,
7737 mddev->raid_disks-mddev->degraded, mddev->raid_disks,
7740 print_raid5_conf(conf);
7742 if (conf->reshape_progress != MaxSector) {
7743 conf->reshape_safe = conf->reshape_progress;
7744 atomic_set(&conf->reshape_stripes, 0);
7745 clear_bit(MD_RECOVERY_SYNC, &mddev->recovery);
7746 clear_bit(MD_RECOVERY_CHECK, &mddev->recovery);
7747 set_bit(MD_RECOVERY_RESHAPE, &mddev->recovery);
7748 set_bit(MD_RECOVERY_RUNNING, &mddev->recovery);
7749 mddev->sync_thread = md_register_thread(md_do_sync, mddev,
7751 if (!mddev->sync_thread)
7755 /* Ok, everything is just fine now */
7756 if (mddev->to_remove == &raid5_attrs_group)
7757 mddev->to_remove = NULL;
7758 else if (mddev->kobj.sd &&
7759 sysfs_create_group(&mddev->kobj, &raid5_attrs_group))
7760 pr_warn("raid5: failed to create sysfs attributes for %s\n",
7762 md_set_array_sectors(mddev, raid5_size(mddev, 0, 0));
7766 /* read-ahead size must cover two whole stripes, which
7767 * is 2 * (data disks) * chunksize, where the data disks
7768 * are the raid devices minus the parity devices
7770 int data_disks = conf->previous_raid_disks - conf->max_degraded;
7771 int stripe = data_disks *
7772 ((mddev->chunk_sectors << 9) / PAGE_SIZE);
7774 chunk_size = mddev->chunk_sectors << 9;
7775 blk_queue_io_min(mddev->queue, chunk_size);
7776 raid5_set_io_opt(conf);
7777 mddev->queue->limits.raid_partial_stripes_expensive = 1;
7779 * We can only discard a whole stripe. It doesn't make sense to
7780 * discard the data disks but write the parity disk
7782 stripe = stripe * PAGE_SIZE;
7783 stripe = roundup_pow_of_two(stripe);
7784 mddev->queue->limits.discard_granularity = stripe;
7786 blk_queue_max_write_zeroes_sectors(mddev->queue, 0);
7788 rdev_for_each(rdev, mddev) {
7789 disk_stack_limits(mddev->gendisk, rdev->bdev,
7790 rdev->data_offset << 9);
7791 disk_stack_limits(mddev->gendisk, rdev->bdev,
7792 rdev->new_data_offset << 9);
7796 * zeroing is required, otherwise data
7797 * could be lost. Consider a scenario: discard a stripe
7798 * (the stripe could be inconsistent if
7799 * discard_zeroes_data is 0); write one disk of the
7800 * stripe (the stripe could be inconsistent again
7801 * depending on which disks are used to calculate
7802 * parity); the disk is broken; the stripe data of this
7805 * We only allow DISCARD if the sysadmin has confirmed that
7806 * only safe devices are in use by setting a module parameter.
7807 * A better idea might be to turn DISCARD into WRITE_ZEROES
7808 * requests, as that is required to be safe.
7810 if (!devices_handle_discard_safely ||
7811 mddev->queue->limits.max_discard_sectors < (stripe >> 9) ||
7812 mddev->queue->limits.discard_granularity < stripe)
7813 blk_queue_max_discard_sectors(mddev->queue, 0);
7815 blk_queue_max_hw_sectors(mddev->queue, UINT_MAX);
7818 if (log_init(conf, journal_dev, raid5_has_ppl(conf)))
7823 md_unregister_thread(&mddev->thread);
7824 print_raid5_conf(conf);
7826 mddev->private = NULL;
7827 pr_warn("md/raid:%s: failed to run raid set.\n", mdname(mddev));
7830 acct_bioset_exit(mddev);
7834 static void raid5_free(struct mddev *mddev, void *priv)
7836 struct r5conf *conf = priv;
7839 acct_bioset_exit(mddev);
7840 mddev->to_remove = &raid5_attrs_group;
7843 static void raid5_status(struct seq_file *seq, struct mddev *mddev)
7845 struct r5conf *conf = mddev->private;
7848 seq_printf(seq, " level %d, %dk chunk, algorithm %d", mddev->level,
7849 conf->chunk_sectors / 2, mddev->layout);
7850 seq_printf (seq, " [%d/%d] [", conf->raid_disks, conf->raid_disks - mddev->degraded);
7852 for (i = 0; i < conf->raid_disks; i++) {
7853 struct md_rdev *rdev = rcu_dereference(conf->disks[i].rdev);
7854 seq_printf (seq, "%s", rdev && test_bit(In_sync, &rdev->flags) ? "U" : "_");
7857 seq_printf (seq, "]");
7860 static void print_raid5_conf (struct r5conf *conf)
7862 struct md_rdev *rdev;
7865 pr_debug("RAID conf printout:\n");
7867 pr_debug("(conf==NULL)\n");
7870 pr_debug(" --- level:%d rd:%d wd:%d\n", conf->level,
7872 conf->raid_disks - conf->mddev->degraded);
7875 for (i = 0; i < conf->raid_disks; i++) {
7876 rdev = rcu_dereference(conf->disks[i].rdev);
7878 pr_debug(" disk %d, o:%d, dev:%pg\n",
7879 i, !test_bit(Faulty, &rdev->flags),
7885 static int raid5_spare_active(struct mddev *mddev)
7888 struct r5conf *conf = mddev->private;
7889 struct md_rdev *rdev, *replacement;
7891 unsigned long flags;
7893 for (i = 0; i < conf->raid_disks; i++) {
7894 rdev = rdev_mdlock_deref(mddev, conf->disks[i].rdev);
7895 replacement = rdev_mdlock_deref(mddev,
7896 conf->disks[i].replacement);
7898 && replacement->recovery_offset == MaxSector
7899 && !test_bit(Faulty, &replacement->flags)
7900 && !test_and_set_bit(In_sync, &replacement->flags)) {
7901 /* Replacement has just become active. */
7903 || !test_and_clear_bit(In_sync, &rdev->flags))
7906 /* Replaced device not technically faulty,
7907 * but we need to be sure it gets removed
7908 * and never re-added.
7910 set_bit(Faulty, &rdev->flags);
7911 sysfs_notify_dirent_safe(
7914 sysfs_notify_dirent_safe(replacement->sysfs_state);
7916 && rdev->recovery_offset == MaxSector
7917 && !test_bit(Faulty, &rdev->flags)
7918 && !test_and_set_bit(In_sync, &rdev->flags)) {
7920 sysfs_notify_dirent_safe(rdev->sysfs_state);
7923 spin_lock_irqsave(&conf->device_lock, flags);
7924 mddev->degraded = raid5_calc_degraded(conf);
7925 spin_unlock_irqrestore(&conf->device_lock, flags);
7926 print_raid5_conf(conf);
7930 static int raid5_remove_disk(struct mddev *mddev, struct md_rdev *rdev)
7932 struct r5conf *conf = mddev->private;
7934 int number = rdev->raid_disk;
7935 struct md_rdev __rcu **rdevp;
7936 struct disk_info *p;
7937 struct md_rdev *tmp;
7939 print_raid5_conf(conf);
7940 if (test_bit(Journal, &rdev->flags) && conf->log) {
7942 * we can't wait for pending writes here, as this is called
7943 * from raid5d; waiting would deadlock.
7944 * neilb: there is no locking about new writes here,
7945 * so this cannot be safe.
7947 if (atomic_read(&conf->active_stripes) ||
7948 atomic_read(&conf->r5c_cached_full_stripes) ||
7949 atomic_read(&conf->r5c_cached_partial_stripes)) {
7955 if (unlikely(number >= conf->pool_size))
7957 p = conf->disks + number;
7958 if (rdev == rcu_access_pointer(p->rdev))
7960 else if (rdev == rcu_access_pointer(p->replacement))
7961 rdevp = &p->replacement;
7965 if (number >= conf->raid_disks &&
7966 conf->reshape_progress == MaxSector)
7967 clear_bit(In_sync, &rdev->flags);
7969 if (test_bit(In_sync, &rdev->flags) ||
7970 atomic_read(&rdev->nr_pending)) {
7974 /* Only remove non-faulty devices if recovery
7977 if (!test_bit(Faulty, &rdev->flags) &&
7978 mddev->recovery_disabled != conf->recovery_disabled &&
7979 !has_failed(conf) &&
7980 (!rcu_access_pointer(p->replacement) ||
7981 rcu_access_pointer(p->replacement) == rdev) &&
7982 number < conf->raid_disks) {
7987 if (!test_bit(RemoveSynchronized, &rdev->flags)) {
7988 lockdep_assert_held(&mddev->reconfig_mutex);
7990 if (atomic_read(&rdev->nr_pending)) {
7991 /* lost the race, try later */
7993 rcu_assign_pointer(*rdevp, rdev);
7997 err = log_modify(conf, rdev, false);
8002 tmp = rcu_access_pointer(p->replacement);
8004 /* We must have just cleared 'rdev' */
8005 rcu_assign_pointer(p->rdev, tmp);
8006 clear_bit(Replacement, &tmp->flags);
8007 smp_mb(); /* Make sure other CPUs may see both as identical
8008 * but will never see neither - if they are careful
8010 rcu_assign_pointer(p->replacement, NULL);
8013 err = log_modify(conf, tmp, true);
8016 clear_bit(WantReplacement, &rdev->flags);
8019 print_raid5_conf(conf);
8023 static int raid5_add_disk(struct mddev *mddev, struct md_rdev *rdev)
8025 struct r5conf *conf = mddev->private;
8026 int ret, err = -EEXIST;
8028 struct disk_info *p;
8029 struct md_rdev *tmp;
8031 int last = conf->raid_disks - 1;
8033 if (test_bit(Journal, &rdev->flags)) {
8037 rdev->raid_disk = 0;
8039 * The array is in readonly mode if journal is missing, so no
8040 * write requests are running. We should be safe
8042 ret = log_init(conf, rdev, false);
8046 ret = r5l_start(conf->log);
8052 if (mddev->recovery_disabled == conf->recovery_disabled)
8055 if (rdev->saved_raid_disk < 0 && has_failed(conf))
8056 /* no point adding a device */
8059 if (rdev->raid_disk >= 0)
8060 first = last = rdev->raid_disk;
8063 * find the disk ... but prefer rdev->saved_raid_disk
8066 if (rdev->saved_raid_disk >= 0 &&
8067 rdev->saved_raid_disk >= first &&
8068 rdev->saved_raid_disk <= last &&
8069 conf->disks[rdev->saved_raid_disk].rdev == NULL)
8070 first = rdev->saved_raid_disk;
8072 for (disk = first; disk <= last; disk++) {
8073 p = conf->disks + disk;
8074 if (p->rdev == NULL) {
8075 clear_bit(In_sync, &rdev->flags);
8076 rdev->raid_disk = disk;
8077 if (rdev->saved_raid_disk != disk)
8079 rcu_assign_pointer(p->rdev, rdev);
8081 err = log_modify(conf, rdev, true);
8086 for (disk = first; disk <= last; disk++) {
8087 p = conf->disks + disk;
8088 tmp = rdev_mdlock_deref(mddev, p->rdev);
8089 if (test_bit(WantReplacement, &tmp->flags) &&
8090 p->replacement == NULL) {
8091 clear_bit(In_sync, &rdev->flags);
8092 set_bit(Replacement, &rdev->flags);
8093 rdev->raid_disk = disk;
8096 rcu_assign_pointer(p->replacement, rdev);
8101 print_raid5_conf(conf);
8105 static int raid5_resize(struct mddev *mddev, sector_t sectors)
8107 /* no resync is happening, and there is enough space
8108 * on all devices, so we can resize.
8109 * We need to make sure resync covers any new space.
8110 * If the array is shrinking we should possibly wait until
8111 * any io in the removed space completes, but it hardly seems
8115 struct r5conf *conf = mddev->private;
8117 if (raid5_has_log(conf) || raid5_has_ppl(conf))
8119 sectors &= ~((sector_t)conf->chunk_sectors - 1);
8120 newsize = raid5_size(mddev, sectors, mddev->raid_disks);
8121 if (mddev->external_size &&
8122 mddev->array_sectors > newsize)
8124 if (mddev->bitmap) {
8125 int ret = md_bitmap_resize(mddev->bitmap, sectors, 0, 0);
8129 md_set_array_sectors(mddev, newsize);
8130 if (sectors > mddev->dev_sectors &&
8131 mddev->recovery_cp > mddev->dev_sectors) {
8132 mddev->recovery_cp = mddev->dev_sectors;
8133 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
8135 mddev->dev_sectors = sectors;
8136 mddev->resync_max_sectors = sectors;
8140 static int check_stripe_cache(struct mddev *mddev)
8142 /* Can only proceed if there are plenty of stripe_heads.
8143 * We need a minimum of one full stripe, and for sensible progress
8144 * it is best to have about 4 times that.
8145 * If we require 4 times, then the default 256 4K stripe_heads will
8146 * allow for chunk sizes up to 256K, which is probably OK.
8147 * If the chunk size is greater, user-space should request more
8148 * stripe_heads first.
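 * (The arithmetic: a 256 KiB chunk over 4 KiB stripe_heads needs
 * (262144 / 4096) * 4 = 256 stripe_heads, exactly the default.)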
8150 struct r5conf *conf = mddev->private;
8151 if (((mddev->chunk_sectors << 9) / RAID5_STRIPE_SIZE(conf)) * 4
8152 > conf->min_nr_stripes ||
8153 ((mddev->new_chunk_sectors << 9) / RAID5_STRIPE_SIZE(conf)) * 4
8154 > conf->min_nr_stripes) {
8155 pr_warn("md/raid:%s: reshape: not enough stripes. Needed %lu\n",
8157 ((max(mddev->chunk_sectors, mddev->new_chunk_sectors) << 9)
8158 / RAID5_STRIPE_SIZE(conf))*4);
8164 static int check_reshape(struct mddev *mddev)
8166 struct r5conf *conf = mddev->private;
8168 if (raid5_has_log(conf) || raid5_has_ppl(conf))
8170 if (mddev->delta_disks == 0 &&
8171 mddev->new_layout == mddev->layout &&
8172 mddev->new_chunk_sectors == mddev->chunk_sectors)
8173 return 0; /* nothing to do */
8174 if (has_failed(conf))
8176 if (mddev->delta_disks < 0 && mddev->reshape_position == MaxSector) {
8177 /* We might be able to shrink, but the devices must
8178 * be made bigger first.
8179 * For raid6, 4 is the minimum size.
8180 * Otherwise 2 is the minimum
8183 if (mddev->level == 6)
8185 if (mddev->raid_disks + mddev->delta_disks < min)
8189 if (!check_stripe_cache(mddev))
8192 if (mddev->new_chunk_sectors > mddev->chunk_sectors ||
8193 mddev->delta_disks > 0)
8194 if (resize_chunks(conf,
8195 conf->previous_raid_disks
8196 + max(0, mddev->delta_disks),
8197 max(mddev->new_chunk_sectors,
8198 mddev->chunk_sectors)
8202 if (conf->previous_raid_disks + mddev->delta_disks <= conf->pool_size)
8203 return 0; /* never bother to shrink */
8204 return resize_stripes(conf, (conf->previous_raid_disks
8205 + mddev->delta_disks));
8208 static int raid5_start_reshape(struct mddev *mddev)
8210 struct r5conf *conf = mddev->private;
8211 struct md_rdev *rdev;
8213 unsigned long flags;
8215 if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery))
8218 if (!check_stripe_cache(mddev))
8221 if (has_failed(conf))
8224 rdev_for_each(rdev, mddev) {
8225 if (!test_bit(In_sync, &rdev->flags)
8226 && !test_bit(Faulty, &rdev->flags))
8230 if (spares - mddev->degraded < mddev->delta_disks - conf->max_degraded)
8231 /* Not enough devices even to make a degraded array
8236 /* Refuse to reduce size of the array. Any reductions in
8237 * array size must be through explicit setting of array_size
8240 if (raid5_size(mddev, 0, conf->raid_disks + mddev->delta_disks)
8241 < mddev->array_sectors) {
8242 pr_warn("md/raid:%s: array size must be reduced before number of disks\n",
8247 atomic_set(&conf->reshape_stripes, 0);
8248 spin_lock_irq(&conf->device_lock);
8249 write_seqcount_begin(&conf->gen_lock);
8250 conf->previous_raid_disks = conf->raid_disks;
8251 conf->raid_disks += mddev->delta_disks;
8252 conf->prev_chunk_sectors = conf->chunk_sectors;
8253 conf->chunk_sectors = mddev->new_chunk_sectors;
8254 conf->prev_algo = conf->algorithm;
8255 conf->algorithm = mddev->new_layout;
8257 /* Code that selects data_offset needs to see the generation update
8258 * if reshape_progress has been set - so a memory barrier is needed.
8261 if (mddev->reshape_backwards)
8262 conf->reshape_progress = raid5_size(mddev, 0, 0);
8264 conf->reshape_progress = 0;
8265 conf->reshape_safe = conf->reshape_progress;
8266 write_seqcount_end(&conf->gen_lock);
8267 spin_unlock_irq(&conf->device_lock);
8269 /* Now make sure any requests that proceeded on the assumption
8270 * the reshape wasn't running - like Discard or Read - have
8273 mddev_suspend(mddev);
8274 mddev_resume(mddev);
8276 /* Add some new drives, as many as will fit.
8277 * We know there are enough to make the newly sized array work.
8278 * Don't add devices if we are reducing the number of
8279 * devices in the array. This is because it is not possible
8280 * to correctly record the "partially reconstructed" state of
8281 * such devices during the reshape and confusion could result.
8283 if (mddev->delta_disks >= 0) {
8284 rdev_for_each(rdev, mddev)
8285 if (rdev->raid_disk < 0 &&
8286 !test_bit(Faulty, &rdev->flags)) {
8287 if (raid5_add_disk(mddev, rdev) == 0) {
8289 >= conf->previous_raid_disks)
8290 set_bit(In_sync, &rdev->flags);
8292 rdev->recovery_offset = 0;
8294 /* Failure here is OK */
8295 sysfs_link_rdev(mddev, rdev);
8297 } else if (rdev->raid_disk >= conf->previous_raid_disks
8298 && !test_bit(Faulty, &rdev->flags)) {
8299 /* This is a spare that was manually added */
8300 set_bit(In_sync, &rdev->flags);
8303 /* When a reshape changes the number of devices,
8304 * ->degraded is measured against the larger of the
8305 * pre and post number of devices.
8307 spin_lock_irqsave(&conf->device_lock, flags);
8308 mddev->degraded = raid5_calc_degraded(conf);
8309 spin_unlock_irqrestore(&conf->device_lock, flags);
8311 mddev->raid_disks = conf->raid_disks;
8312 mddev->reshape_position = conf->reshape_progress;
8313 set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags);
8315 clear_bit(MD_RECOVERY_SYNC, &mddev->recovery);
8316 clear_bit(MD_RECOVERY_CHECK, &mddev->recovery);
8317 clear_bit(MD_RECOVERY_DONE, &mddev->recovery);
8318 set_bit(MD_RECOVERY_RESHAPE, &mddev->recovery);
8319 set_bit(MD_RECOVERY_RUNNING, &mddev->recovery);
8320 mddev->sync_thread = md_register_thread(md_do_sync, mddev,
8322 if (!mddev->sync_thread) {
8323 mddev->recovery = 0;
8324 spin_lock_irq(&conf->device_lock);
8325 write_seqcount_begin(&conf->gen_lock);
8326 mddev->raid_disks = conf->raid_disks = conf->previous_raid_disks;
8327 mddev->new_chunk_sectors =
8328 conf->chunk_sectors = conf->prev_chunk_sectors;
8329 mddev->new_layout = conf->algorithm = conf->prev_algo;
8330 rdev_for_each(rdev, mddev)
8331 rdev->new_data_offset = rdev->data_offset;
8333 conf->generation --;
8334 conf->reshape_progress = MaxSector;
8335 mddev->reshape_position = MaxSector;
8336 write_seqcount_end(&conf->gen_lock);
8337 spin_unlock_irq(&conf->device_lock);
8340 conf->reshape_checkpoint = jiffies;
8341 md_wakeup_thread(mddev->sync_thread);
8346 /* This is called from the reshape thread and should make any
8347 * changes needed in 'conf'
8349 static void end_reshape(struct r5conf *conf)
8352 if (!test_bit(MD_RECOVERY_INTR, &conf->mddev->recovery)) {
8353 struct md_rdev *rdev;
8355 spin_lock_irq(&conf->device_lock);
8356 conf->previous_raid_disks = conf->raid_disks;
8357 md_finish_reshape(conf->mddev);
8359 conf->reshape_progress = MaxSector;
8360 conf->mddev->reshape_position = MaxSector;
8361 rdev_for_each(rdev, conf->mddev)
8362 if (rdev->raid_disk >= 0 &&
8363 !test_bit(Journal, &rdev->flags) &&
8364 !test_bit(In_sync, &rdev->flags))
8365 rdev->recovery_offset = MaxSector;
8366 spin_unlock_irq(&conf->device_lock);
8367 wake_up(&conf->wait_for_overlap);
8369 if (conf->mddev->queue)
8370 raid5_set_io_opt(conf);
8374 /* This is called from the raid5d thread with mddev_lock held.
8375 * It makes config changes to the device.
8377 static void raid5_finish_reshape(struct mddev *mddev)
8379 struct r5conf *conf = mddev->private;
8380 struct md_rdev *rdev;
8382 if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery)) {
8384 if (mddev->delta_disks <= 0) {
8386 spin_lock_irq(&conf->device_lock);
8387 mddev->degraded = raid5_calc_degraded(conf);
8388 spin_unlock_irq(&conf->device_lock);
8389 for (d = conf->raid_disks ;
8390 d < conf->raid_disks - mddev->delta_disks;
8392 rdev = rdev_mdlock_deref(mddev,
8393 conf->disks[d].rdev);
8395 clear_bit(In_sync, &rdev->flags);
8396 rdev = rdev_mdlock_deref(mddev,
8397 conf->disks[d].replacement);
8399 clear_bit(In_sync, &rdev->flags);
8402 mddev->layout = conf->algorithm;
8403 mddev->chunk_sectors = conf->chunk_sectors;
8404 mddev->reshape_position = MaxSector;
8405 mddev->delta_disks = 0;
8406 mddev->reshape_backwards = 0;
8410 static void raid5_quiesce(struct mddev *mddev, int quiesce)
8412 struct r5conf *conf = mddev->private;
8415 /* stop all writes */
8416 lock_all_device_hash_locks_irq(conf);
8417 /* '2' tells resync/reshape to pause so that all
8418 * active stripes can drain
8420 r5c_flush_cache(conf, INT_MAX);
8421 /* need a memory barrier to make sure read_one_chunk() sees
8422 * that quiesce has started and reverts to the slow (locked) path.
8424 smp_store_release(&conf->quiesce, 2);
8425 wait_event_cmd(conf->wait_for_quiescent,
8426 atomic_read(&conf->active_stripes) == 0 &&
8427 atomic_read(&conf->active_aligned_reads) == 0,
8428 unlock_all_device_hash_locks_irq(conf),
8429 lock_all_device_hash_locks_irq(conf));
8431 unlock_all_device_hash_locks_irq(conf);
8432 /* allow reshape to continue */
8433 wake_up(&conf->wait_for_overlap);
8435 /* re-enable writes */
8436 lock_all_device_hash_locks_irq(conf);
8438 wake_up(&conf->wait_for_quiescent);
8439 wake_up(&conf->wait_for_overlap);
8440 unlock_all_device_hash_locks_irq(conf);
8442 log_quiesce(conf, quiesce);
8445 static void *raid45_takeover_raid0(struct mddev *mddev, int level)
8447 struct r0conf *raid0_conf = mddev->private;
8450 /* for raid0 takeover only one zone is supported */
8451 if (raid0_conf->nr_strip_zones > 1) {
8452 pr_warn("md/raid:%s: cannot takeover raid0 with more than one zone.\n",
8454 return ERR_PTR(-EINVAL);
8457 sectors = raid0_conf->strip_zone[0].zone_end;
8458 sector_div(sectors, raid0_conf->strip_zone[0].nb_dev);
8459 mddev->dev_sectors = sectors;
8460 mddev->new_level = level;
8461 mddev->new_layout = ALGORITHM_PARITY_N;
8462 mddev->new_chunk_sectors = mddev->chunk_sectors;
8463 mddev->raid_disks += 1;
8464 mddev->delta_disks = 1;
8465 /* make sure it will not be marked as dirty */
8466 mddev->recovery_cp = MaxSector;
8468 return setup_conf(mddev);
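
/*
 * Worked example (editorial, not driver code) for the sizing above.  For a
 * single-zone raid0, strip_zone[0].zone_end is the zone's total size in
 * sectors summed over all members, so dividing by nb_dev recovers the
 * per-device size.  With hypothetical numbers for four 1 TiB members:
 *
 *	zone_end    = 4 * 2147483648 = 8589934592 sectors
 *	dev_sectors = 8589934592 / 4 = 2147483648 sectors (1 TiB each)
 *
 * raid_disks is then grown by one for the new parity slot, so the converted
 * raid4/5 array initially runs degraded until a device is added to it.
 */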

static void *raid5_takeover_raid1(struct mddev *mddev)
{
	int chunksect;
	void *ret;

	if (mddev->raid_disks != 2 ||
	    mddev->degraded > 1)
		return ERR_PTR(-EINVAL);

	/* Should check if there are write-behind devices? */

	chunksect = 64 * 2; /* 64K by default */

	/* The array must be an exact multiple of chunksize */
	while (chunksect && (mddev->array_sectors & (chunksect-1)))
		chunksect >>= 1;

	if ((chunksect<<9) < RAID5_STRIPE_SIZE((struct r5conf *)mddev->private))
		/* array size does not allow a suitable chunk size */
		return ERR_PTR(-EINVAL);

	mddev->new_level = 5;
	mddev->new_layout = ALGORITHM_LEFT_SYMMETRIC;
	mddev->new_chunk_sectors = chunksect;

	ret = setup_conf(mddev);
	if (!IS_ERR(ret))
		mddev_clear_unsupported_flags(mddev,
			UNSUPPORTED_MDDEV_FLAGS);
	return ret;
}
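
/*
 * Illustrative sketch (not used by the driver): the chunk-size search in
 * raid5_takeover_raid1() above, restated as a self-contained helper.  The
 * helper name is hypothetical; it starts from 128 sectors (64K) and halves
 * the candidate until it evenly divides the array size, after which the
 * caller still rejects anything smaller than the stripe size.
 */
static inline int takeover_raid1_chunksect_example(unsigned long long array_sectors)
{
	int chunksect = 64 * 2;	/* 64K expressed in 512-byte sectors */

	/* keep halving while the candidate does not divide the array size */
	while (chunksect && (array_sectors & (chunksect - 1)))
		chunksect >>= 1;

	return chunksect;
}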

static void *raid5_takeover_raid6(struct mddev *mddev)
{
	int new_layout;

	switch (mddev->layout) {
	case ALGORITHM_LEFT_ASYMMETRIC_6:
		new_layout = ALGORITHM_LEFT_ASYMMETRIC;
		break;
	case ALGORITHM_RIGHT_ASYMMETRIC_6:
		new_layout = ALGORITHM_RIGHT_ASYMMETRIC;
		break;
	case ALGORITHM_LEFT_SYMMETRIC_6:
		new_layout = ALGORITHM_LEFT_SYMMETRIC;
		break;
	case ALGORITHM_RIGHT_SYMMETRIC_6:
		new_layout = ALGORITHM_RIGHT_SYMMETRIC;
		break;
	case ALGORITHM_PARITY_0_6:
		new_layout = ALGORITHM_PARITY_0;
		break;
	case ALGORITHM_PARITY_N:
		new_layout = ALGORITHM_PARITY_N;
		break;
	default:
		return ERR_PTR(-EINVAL);
	}
	mddev->new_level = 5;
	mddev->new_layout = new_layout;
	mddev->delta_disks = -1;
	mddev->raid_disks -= 1;
	return setup_conf(mddev);
}

static int raid5_check_reshape(struct mddev *mddev)
{
	/* For a 2-drive array, the layout and chunk size can be changed
	 * immediately as no restriping is needed.
	 * For larger arrays we record the new value - after validation
	 * to be used by a reshape pass.
	 */
	struct r5conf *conf = mddev->private;
	int new_chunk = mddev->new_chunk_sectors;

	if (mddev->new_layout >= 0 && !algorithm_valid_raid5(mddev->new_layout))
		return -EINVAL;
	if (new_chunk > 0) {
		if (!is_power_of_2(new_chunk))
			return -EINVAL;
		if (new_chunk < (PAGE_SIZE>>9))
			return -EINVAL;
		if (mddev->array_sectors & (new_chunk-1))
			/* not factor of array size */
			return -EINVAL;
	}

	/* They look valid */

	if (mddev->raid_disks == 2) {
		/* can make the change immediately */
		if (mddev->new_layout >= 0) {
			conf->algorithm = mddev->new_layout;
			mddev->layout = mddev->new_layout;
		}
		if (new_chunk > 0) {
			conf->chunk_sectors = new_chunk;
			mddev->chunk_sectors = new_chunk;
		}
		set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags);
		md_wakeup_thread(mddev->thread);
	}
	return check_reshape(mddev);
}

static int raid6_check_reshape(struct mddev *mddev)
{
	int new_chunk = mddev->new_chunk_sectors;

	if (mddev->new_layout >= 0 && !algorithm_valid_raid6(mddev->new_layout))
		return -EINVAL;
	if (new_chunk > 0) {
		if (!is_power_of_2(new_chunk))
			return -EINVAL;
		if (new_chunk < (PAGE_SIZE >> 9))
			return -EINVAL;
		if (mddev->array_sectors & (new_chunk-1))
			/* not factor of array size */
			return -EINVAL;
	}

	/* They look valid */
	return check_reshape(mddev);
}
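
/*
 * Illustrative sketch (not used by the driver): the chunk-size rules shared
 * by raid5_check_reshape() and raid6_check_reshape() above, written as one
 * plain-C predicate.  The helper name and the explicit page_sectors argument
 * (PAGE_SIZE >> 9 in the driver) are hypothetical.
 */
static inline bool new_chunk_valid_example(unsigned long long array_sectors,
					   int new_chunk, int page_sectors)
{
	if (new_chunk <= 0)
		return true;		/* no chunk-size change requested */
	if (new_chunk & (new_chunk - 1))
		return false;		/* must be a power of two */
	if (new_chunk < page_sectors)
		return false;		/* must be at least one page */
	if (array_sectors & (new_chunk - 1))
		return false;		/* must be a factor of the array size */
	return true;
}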

static void *raid5_takeover(struct mddev *mddev)
{
	/* raid5 can take over:
	 *  raid0 - if there is only one strip zone - make it a raid4 layout
	 *  raid1 - if there are two drives.  We need to know the chunk size
	 *  raid4 - trivial - just use a raid4 layout.
	 *  raid6 - Providing it is a *_6 layout
	 */
	if (mddev->level == 0)
		return raid45_takeover_raid0(mddev, 5);
	if (mddev->level == 1)
		return raid5_takeover_raid1(mddev);
	if (mddev->level == 4) {
		mddev->new_layout = ALGORITHM_PARITY_N;
		mddev->new_level = 5;
		return setup_conf(mddev);
	}
	if (mddev->level == 6)
		return raid5_takeover_raid6(mddev);

	return ERR_PTR(-EINVAL);
}

static void *raid4_takeover(struct mddev *mddev)
{
	/* raid4 can take over:
	 *  raid0 - if there is only one strip zone
	 *  raid5 - if layout is right
	 */
	if (mddev->level == 0)
		return raid45_takeover_raid0(mddev, 4);
	if (mddev->level == 5 &&
	    mddev->layout == ALGORITHM_PARITY_N) {
		mddev->new_layout = 0;
		mddev->new_level = 4;
		return setup_conf(mddev);
	}
	return ERR_PTR(-EINVAL);
}

static struct md_personality raid5_personality;

static void *raid6_takeover(struct mddev *mddev)
{
	/* Currently can only take over a raid5.  We map the
	 * personality to an equivalent raid6 personality
	 * with the Q block at the end.
	 */
	int new_layout;

	if (mddev->pers != &raid5_personality)
		return ERR_PTR(-EINVAL);
	if (mddev->degraded > 1)
		return ERR_PTR(-EINVAL);
	if (mddev->raid_disks > 253)
		return ERR_PTR(-EINVAL);
	if (mddev->raid_disks < 3)
		return ERR_PTR(-EINVAL);

	switch (mddev->layout) {
	case ALGORITHM_LEFT_ASYMMETRIC:
		new_layout = ALGORITHM_LEFT_ASYMMETRIC_6;
		break;
	case ALGORITHM_RIGHT_ASYMMETRIC:
		new_layout = ALGORITHM_RIGHT_ASYMMETRIC_6;
		break;
	case ALGORITHM_LEFT_SYMMETRIC:
		new_layout = ALGORITHM_LEFT_SYMMETRIC_6;
		break;
	case ALGORITHM_RIGHT_SYMMETRIC:
		new_layout = ALGORITHM_RIGHT_SYMMETRIC_6;
		break;
	case ALGORITHM_PARITY_0:
		new_layout = ALGORITHM_PARITY_0_6;
		break;
	case ALGORITHM_PARITY_N:
		new_layout = ALGORITHM_PARITY_N;
		break;
	default:
		return ERR_PTR(-EINVAL);
	}
	mddev->new_level = 6;
	mddev->new_layout = new_layout;
	mddev->delta_disks = 1;
	mddev->raid_disks += 1;
	return setup_conf(mddev);
}
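
/*
 * Editorial note: each ALGORITHM_*_6 value chosen above is the raid5 layout
 * of the same name with the Q syndrome kept on a dedicated last device, so
 * existing data and P blocks stay where they are and the takeover only has
 * to add one disk (delta_disks = 1) rather than restripe existing data.
 */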

static int raid5_change_consistency_policy(struct mddev *mddev, const char *buf)
{
	struct r5conf *conf;
	int err;
	err = mddev_lock(mddev);
	if (err)
		return err;
	conf = mddev->private;
	if (!conf) {
		mddev_unlock(mddev);
		return -ENODEV;
	}

	if (strncmp(buf, "ppl", 3) == 0) {
		/* ppl only works with RAID 5 */
		if (!raid5_has_ppl(conf) && conf->level == 5) {
			err = log_init(conf, NULL, true);
			if (!err) {
				err = resize_stripes(conf, conf->pool_size);
				if (err)
					log_exit(conf);
			}
		} else
			err = -EINVAL;
	} else if (strncmp(buf, "resync", 6) == 0) {
		if (raid5_has_ppl(conf)) {
			mddev_suspend(mddev);
			log_exit(conf);
			mddev_resume(mddev);
			err = resize_stripes(conf, conf->pool_size);
		} else if (test_bit(MD_HAS_JOURNAL, &conf->mddev->flags) &&
			   r5l_log_disk_error(conf)) {
			bool journal_dev_exists = false;
			struct md_rdev *rdev;

			rdev_for_each(rdev, mddev)
				if (test_bit(Journal, &rdev->flags)) {
					journal_dev_exists = true;
					break;
				}
			if (!journal_dev_exists) {
				mddev_suspend(mddev);
				clear_bit(MD_HAS_JOURNAL, &mddev->flags);
				mddev_resume(mddev);
			} else /* need to remove the journal device first */
				err = -EBUSY;
		} else
			err = -EINVAL;
	} else {
		err = -EINVAL;
	}

	if (!err)
		md_update_sb(mddev, 1);
	mddev_unlock(mddev);
	return err;
}
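
/*
 * Usage note (editorial): this is the personality's
 * .change_consistency_policy hook.  From user space it is normally reached
 * by writing "ppl" or "resync" to the array's consistency_policy sysfs
 * attribute, e.g. (illustrative):
 *
 *	echo ppl > /sys/block/md0/md/consistency_policy
 */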

static int raid5_start(struct mddev *mddev)
{
	struct r5conf *conf = mddev->private;
	return r5l_start(conf->log);
}

static struct md_personality raid6_personality =
{
	.name		= "raid6",
	.level		= 6,
	.owner		= THIS_MODULE,
	.make_request	= raid5_make_request,
	.run		= raid5_run,
	.start		= raid5_start,
	.free		= raid5_free,
	.status		= raid5_status,
	.error_handler	= raid5_error,
	.hot_add_disk	= raid5_add_disk,
	.hot_remove_disk= raid5_remove_disk,
	.spare_active	= raid5_spare_active,
	.sync_request	= raid5_sync_request,
	.resize		= raid5_resize,
	.size		= raid5_size,
	.check_reshape	= raid6_check_reshape,
	.start_reshape  = raid5_start_reshape,
	.finish_reshape = raid5_finish_reshape,
	.quiesce	= raid5_quiesce,
	.takeover	= raid6_takeover,
	.change_consistency_policy = raid5_change_consistency_policy,
};

static struct md_personality raid5_personality =
{
	.name		= "raid5",
	.level		= 5,
	.owner		= THIS_MODULE,
	.make_request	= raid5_make_request,
	.run		= raid5_run,
	.start		= raid5_start,
	.free		= raid5_free,
	.status		= raid5_status,
	.error_handler	= raid5_error,
	.hot_add_disk	= raid5_add_disk,
	.hot_remove_disk= raid5_remove_disk,
	.spare_active	= raid5_spare_active,
	.sync_request	= raid5_sync_request,
	.resize		= raid5_resize,
	.size		= raid5_size,
	.check_reshape	= raid5_check_reshape,
	.start_reshape  = raid5_start_reshape,
	.finish_reshape = raid5_finish_reshape,
	.quiesce	= raid5_quiesce,
	.takeover	= raid5_takeover,
	.change_consistency_policy = raid5_change_consistency_policy,
};

static struct md_personality raid4_personality =
{
	.name		= "raid4",
	.level		= 4,
	.owner		= THIS_MODULE,
	.make_request	= raid5_make_request,
	.run		= raid5_run,
	.start		= raid5_start,
	.free		= raid5_free,
	.status		= raid5_status,
	.error_handler	= raid5_error,
	.hot_add_disk	= raid5_add_disk,
	.hot_remove_disk= raid5_remove_disk,
	.spare_active	= raid5_spare_active,
	.sync_request	= raid5_sync_request,
	.resize		= raid5_resize,
	.size		= raid5_size,
	.check_reshape	= raid5_check_reshape,
	.start_reshape  = raid5_start_reshape,
	.finish_reshape = raid5_finish_reshape,
	.quiesce	= raid5_quiesce,
	.takeover	= raid4_takeover,
	.change_consistency_policy = raid5_change_consistency_policy,
};

static int __init raid5_init(void)
{
	int ret;
	raid5_wq = alloc_workqueue("raid5wq",
		WQ_UNBOUND|WQ_MEM_RECLAIM|WQ_CPU_INTENSIVE|WQ_SYSFS, 0);
	if (!raid5_wq)
		return -ENOMEM;
	ret = cpuhp_setup_state_multi(CPUHP_MD_RAID5_PREPARE, "md/raid5:prepare",
				      raid456_cpu_up_prepare, raid456_cpu_dead);
	if (ret) {
		destroy_workqueue(raid5_wq);
		return ret;
	}
	register_md_personality(&raid6_personality);
	register_md_personality(&raid5_personality);
	register_md_personality(&raid4_personality);
	return 0;
}

static void raid5_exit(void)
{
	unregister_md_personality(&raid6_personality);
	unregister_md_personality(&raid5_personality);
	unregister_md_personality(&raid4_personality);
	cpuhp_remove_multi_state(CPUHP_MD_RAID5_PREPARE);
	destroy_workqueue(raid5_wq);
}

module_init(raid5_init);
module_exit(raid5_exit);
MODULE_LICENSE("GPL");
MODULE_DESCRIPTION("RAID4/5/6 (striping with parity) personality for MD");
MODULE_ALIAS("md-personality-4"); /* RAID5 */
MODULE_ALIAS("md-raid5");
MODULE_ALIAS("md-raid4");
MODULE_ALIAS("md-level-5");
MODULE_ALIAS("md-level-4");
MODULE_ALIAS("md-personality-8"); /* RAID6 */
MODULE_ALIAS("md-raid6");
MODULE_ALIAS("md-level-6");

/* This used to be two separate modules, they were: */
MODULE_ALIAS("raid5");
MODULE_ALIAS("raid6");