drivers/md/md-cluster.c

   1 /*
   2  * Copyright (C) 2015, SUSE
   3  *
   4  * This program is free software; you can redistribute it and/or modify
   5  * it under the terms of the GNU General Public License as published by
   6  * the Free Software Foundation; either version 2, or (at your option)
   7  * any later version.
   8  *
   9  */
  10
  11
  12 #include <linux/module.h>
  13 #include <linux/kthread.h>
  14 #include <linux/dlm.h>
  15 #include <linux/sched.h>
  16 #include <linux/raid/md_p.h>
  17 #include "md.h"
  18 #include "bitmap.h"
  19 #include "md-cluster.h"
  20
/* Size in bytes of the lock value block allocated per lock resource and
 * announced to dlm_new_lockspace(). */
#define LVB_SIZE        64
/* Timeout handed unchanged to wait_for_completion_timeout() while waiting
 * for a new disk to be confirmed.
 * NOTE(review): that API takes jiffies; confirm 5000 is not meant as ms. */
#define NEW_DEV_TIMEOUT 5000
  23
/*
 * One DLM lock resource as used by md-cluster, together with the state
 * needed to issue requests on it synchronously (see dlm_lock_sync()).
 */
struct dlm_lock_resource {
	dlm_lockspace_t *ls; /* lockspace the resource lives in */
	struct dlm_lksb lksb; /* status block; sb_lvbptr is set when an LVB is used */
	char *name; /* lock name. */
	uint32_t flags; /* flags to pass to dlm_lock() */
	wait_queue_head_t sync_locking; /* wait queue for synchronized locking */
	bool sync_locking_done; /* set by sync_ast(), reset by the waiter */
	void (*bast)(void *arg, int mode); /* blocking AST function pointer*/
	struct mddev *mddev; /* pointing back to mddev. */
	int mode; /* last mode granted; DLM_LOCK_IV until the first grant */
};
  35
/*
 * A sector range [lo, hi] reported as being resynced by another node.
 * Entries live on md_cluster_info.suspend_list, protected by suspend_lock.
 */
struct suspend_info {
	int slot; /* slot of the node that reported the range */
	sector_t lo; /* first sector of the range */
	sector_t hi; /* end of the range; 0 in a message means "remove" */
	struct list_head list; /* link in md_cluster_info.suspend_list */
};
  42
/*
 * Little-endian resync range as stored in a bitmap lock's LVB
 * (written by add_resync_info(), decoded by read_resync_info()).
 */
struct resync_info {
	__le64 lo;
	__le64 hi;
};
  47
/* md_cluster_info flags: bit numbers used in md_cluster_info.state */
#define         MD_CLUSTER_WAITING_FOR_NEWDISK          1
#define         MD_CLUSTER_SUSPEND_READ_BALANCING       2
#define         MD_CLUSTER_BEGIN_JOIN_CLUSTER           3

/* Lock the send communication. This is done through
 * bit manipulation as opposed to a mutex in order to
 * accommodate lock and hold. See next comment.
 */
#define         MD_CLUSTER_SEND_LOCK                    4
/* Some cluster operations (such as adding a disk) must lock the
 * communication channel so that they can perform extra operations
 * (update metadata) while no other operation is allowed on the
 * MD. The token needs to be locked and held until the operation
 * completes with a md_update_sb(), which would eventually release
 * the lock.
 */
#define         MD_CLUSTER_SEND_LOCKED_ALREADY          5
/* We should receive messages only after the node has joined the cluster
 * and set up all the related infos such as bitmap and personality */
#define         MD_CLUSTER_ALREADY_IN_CLUSTER           6
/* Set by ack_bast() when a message arrives before
 * MD_CLUSTER_ALREADY_IN_CLUSTER is set, instead of waking recv_thread. */
#define         MD_CLUSTER_PENDING_RECV_EVENT           7
  69 #define         MD_CLUSTER_PENDING_RECV_EVENT           7
  70
  71
/* Per-array cluster state, reachable through mddev->cluster_info. */
struct md_cluster_info {
	/* dlm lock space and resources for clustered raid. */
	dlm_lockspace_t *lockspace;
	int slot_number; /* our slot as reported to recover_done() */
	struct completion completion; /* completed by recover_done() while joining */
	struct mutex recv_mutex; /* held by recv_daemon() and by senders via lock_token() */
	struct dlm_lock_resource *bitmap_lockres;
	struct dlm_lock_resource **other_bitmap_lockres;
	struct dlm_lock_resource *resync_lockres;
	struct list_head suspend_list; /* list of struct suspend_info */
	spinlock_t suspend_lock; /* protects suspend_list */
	struct md_thread *recovery_thread; /* runs recover_bitmaps() */
	unsigned long recovery_map; /* one bit per slot awaiting bitmap recovery */
	/* communication lock resources */
	struct dlm_lock_resource *ack_lockres;
	struct dlm_lock_resource *message_lockres;
	struct dlm_lock_resource *token_lockres;
	struct dlm_lock_resource *no_new_dev_lockres;
	struct md_thread *recv_thread; /* woken by ack_bast() */
	struct completion newdisk_completion; /* waited on in process_add_new_disk() */
	wait_queue_head_t wait; /* waiters for MD_CLUSTER_SEND_LOCK */
	unsigned long state; /* MD_CLUSTER_* bits */
	/* record the region in RESYNCING message */
	sector_t sync_low;
	sector_t sync_hi;
};
  98
/* Message kinds carried in cluster_msg.type; dispatched by process_recvd_msg(). */
enum msg_type {
	METADATA_UPDATED = 0,	/* handled by process_metadata_update() */
	RESYNCING,		/* carries low/high; handled by process_suspend_info() */
	NEWDISK,		/* carries uuid/raid_slot; handled by process_add_new_disk() */
	REMOVE,			/* carries raid_slot; handled by process_remove_disk() */
	RE_ADD,			/* carries raid_slot; handled by process_readd_disk() */
	BITMAP_NEEDS_SYNC,	/* receiver recovers the sender's slot via __recover_slot() */
};
 107
/*
 * Message exchanged between nodes. __sendmsg() copies the whole structure
 * into the MESSAGE lock's LVB and recv_daemon() copies it back out, so all
 * multi-byte fields are little-endian.
 */
struct cluster_msg {
	__le32 type;	/* enum msg_type */
	__le32 slot;	/* sender's slot (slot_number - 1), filled in by __sendmsg() */
	/* TODO: Unionize this for smaller footprint */
	__le64 low;	/* RESYNCING: start of range */
	__le64 high;	/* RESYNCING: end of range, 0 to clear */
	char uuid[16];	/* NEWDISK: device uuid */
	__le32 raid_slot; /* disk number / good_device_nr, depending on type */
};
 117
 118 static void sync_ast(void *arg)
 119 {
 120         struct dlm_lock_resource *res;
 121
 122         res = arg;
 123         res->sync_locking_done = true;
 124         wake_up(&res->sync_locking);
 125 }
 126
 127 static int dlm_lock_sync(struct dlm_lock_resource *res, int mode)
 128 {
 129         int ret = 0;
 130
 131         ret = dlm_lock(res->ls, mode, &res->lksb,
 132                         res->flags, res->name, strlen(res->name),
 133                         0, sync_ast, res, res->bast);
 134         if (ret)
 135                 return ret;
 136         wait_event(res->sync_locking, res->sync_locking_done);
 137         res->sync_locking_done = false;
 138         if (res->lksb.sb_status == 0)
 139                 res->mode = mode;
 140         return res->lksb.sb_status;
 141 }
 142
 143 static int dlm_unlock_sync(struct dlm_lock_resource *res)
 144 {
 145         return dlm_lock_sync(res, DLM_LOCK_NL);
 146 }
 147
 148 /*
 149  * An variation of dlm_lock_sync, which make lock request could
 150  * be interrupted
 151  */
 152 static int dlm_lock_sync_interruptible(struct dlm_lock_resource *res, int mode,
 153                                        struct mddev *mddev)
 154 {
 155         int ret = 0;
 156
 157         ret = dlm_lock(res->ls, mode, &res->lksb,
 158                         res->flags, res->name, strlen(res->name),
 159                         0, sync_ast, res, res->bast);
 160         if (ret)
 161                 return ret;
 162
 163         wait_event(res->sync_locking, res->sync_locking_done
 164                                       || kthread_should_stop()
 165                                       || test_bit(MD_CLOSING, &mddev->flags));
 166         if (!res->sync_locking_done) {
 167                 /*
 168                  * the convert queue contains the lock request when request is
 169                  * interrupted, and sync_ast could still be run, so need to
 170                  * cancel the request and reset completion
 171                  */
 172                 ret = dlm_unlock(res->ls, res->lksb.sb_lkid, DLM_LKF_CANCEL,
 173                         &res->lksb, res);
 174                 res->sync_locking_done = false;
 175                 if (unlikely(ret != 0))
 176                         pr_info("failed to cancel previous lock request "
 177                                  "%s return %d\n", res->name, ret);
 178                 return -EPERM;
 179         } else
 180                 res->sync_locking_done = false;
 181         if (res->lksb.sb_status == 0)
 182                 res->mode = mode;
 183         return res->lksb.sb_status;
 184 }
 185
 186 static struct dlm_lock_resource *lockres_init(struct mddev *mddev,
 187                 char *name, void (*bastfn)(void *arg, int mode), int with_lvb)
 188 {
 189         struct dlm_lock_resource *res = NULL;
 190         int ret, namelen;
 191         struct md_cluster_info *cinfo = mddev->cluster_info;
 192
 193         res = kzalloc(sizeof(struct dlm_lock_resource), GFP_KERNEL);
 194         if (!res)
 195                 return NULL;
 196         init_waitqueue_head(&res->sync_locking);
 197         res->sync_locking_done = false;
 198         res->ls = cinfo->lockspace;
 199         res->mddev = mddev;
 200         res->mode = DLM_LOCK_IV;
 201         namelen = strlen(name);
 202         res->name = kzalloc(namelen + 1, GFP_KERNEL);
 203         if (!res->name) {
 204                 pr_err("md-cluster: Unable to allocate resource name for resource %s\n", name);
 205                 goto out_err;
 206         }
 207         strlcpy(res->name, name, namelen + 1);
 208         if (with_lvb) {
 209                 res->lksb.sb_lvbptr = kzalloc(LVB_SIZE, GFP_KERNEL);
 210                 if (!res->lksb.sb_lvbptr) {
 211                         pr_err("md-cluster: Unable to allocate LVB for resource %s\n", name);
 212                         goto out_err;
 213                 }
 214                 res->flags = DLM_LKF_VALBLK;
 215         }
 216
 217         if (bastfn)
 218                 res->bast = bastfn;
 219
 220         res->flags |= DLM_LKF_EXPEDITE;
 221
 222         ret = dlm_lock_sync(res, DLM_LOCK_NL);
 223         if (ret) {
 224                 pr_err("md-cluster: Unable to lock NL on new lock resource %s\n", name);
 225                 goto out_err;
 226         }
 227         res->flags &= ~DLM_LKF_EXPEDITE;
 228         res->flags |= DLM_LKF_CONVERT;
 229
 230         return res;
 231 out_err:
 232         kfree(res->lksb.sb_lvbptr);
 233         kfree(res->name);
 234         kfree(res);
 235         return NULL;
 236 }
 237
 238 static void lockres_free(struct dlm_lock_resource *res)
 239 {
 240         int ret = 0;
 241
 242         if (!res)
 243                 return;
 244
 245         /*
 246          * use FORCEUNLOCK flag, so we can unlock even the lock is on the
 247          * waiting or convert queue
 248          */
 249         ret = dlm_unlock(res->ls, res->lksb.sb_lkid, DLM_LKF_FORCEUNLOCK,
 250                 &res->lksb, res);
 251         if (unlikely(ret != 0))
 252                 pr_err("failed to unlock %s return %d\n", res->name, ret);
 253         else
 254                 wait_event(res->sync_locking, res->sync_locking_done);
 255
 256         kfree(res->name);
 257         kfree(res->lksb.sb_lvbptr);
 258         kfree(res);
 259 }
 260
 261 static void add_resync_info(struct dlm_lock_resource *lockres,
 262                             sector_t lo, sector_t hi)
 263 {
 264         struct resync_info *ri;
 265
 266         ri = (struct resync_info *)lockres->lksb.sb_lvbptr;
 267         ri->lo = cpu_to_le64(lo);
 268         ri->hi = cpu_to_le64(hi);
 269 }
 270
 271 static struct suspend_info *read_resync_info(struct mddev *mddev, struct dlm_lock_resource *lockres)
 272 {
 273         struct resync_info ri;
 274         struct suspend_info *s = NULL;
 275         sector_t hi = 0;
 276
 277         dlm_lock_sync(lockres, DLM_LOCK_CR);
 278         memcpy(&ri, lockres->lksb.sb_lvbptr, sizeof(struct resync_info));
 279         hi = le64_to_cpu(ri.hi);
 280         if (hi > 0) {
 281                 s = kzalloc(sizeof(struct suspend_info), GFP_KERNEL);
 282                 if (!s)
 283                         goto out;
 284                 s->hi = hi;
 285                 s->lo = le64_to_cpu(ri.lo);
 286         }
 287         dlm_unlock_sync(lockres);
 288 out:
 289         return s;
 290 }
 291
 292 static void recover_bitmaps(struct md_thread *thread)
 293 {
 294         struct mddev *mddev = thread->mddev;
 295         struct md_cluster_info *cinfo = mddev->cluster_info;
 296         struct dlm_lock_resource *bm_lockres;
 297         char str[64];
 298         int slot, ret;
 299         struct suspend_info *s, *tmp;
 300         sector_t lo, hi;
 301
 302         while (cinfo->recovery_map) {
 303                 slot = fls64((u64)cinfo->recovery_map) - 1;
 304
 305                 snprintf(str, 64, "bitmap%04d", slot);
 306                 bm_lockres = lockres_init(mddev, str, NULL, 1);
 307                 if (!bm_lockres) {
 308                         pr_err("md-cluster: Cannot initialize bitmaps\n");
 309                         goto clear_bit;
 310                 }
 311
 312                 ret = dlm_lock_sync_interruptible(bm_lockres, DLM_LOCK_PW, mddev);
 313                 if (ret) {
 314                         pr_err("md-cluster: Could not DLM lock %s: %d\n",
 315                                         str, ret);
 316                         goto clear_bit;
 317                 }
 318                 ret = bitmap_copy_from_slot(mddev, slot, &lo, &hi, true);
 319                 if (ret) {
 320                         pr_err("md-cluster: Could not copy data from bitmap %d\n", slot);
 321                         goto clear_bit;
 322                 }
 323
 324                 /* Clear suspend_area associated with the bitmap */
 325                 spin_lock_irq(&cinfo->suspend_lock);
 326                 list_for_each_entry_safe(s, tmp, &cinfo->suspend_list, list)
 327                         if (slot == s->slot) {
 328                                 list_del(&s->list);
 329                                 kfree(s);
 330                         }
 331                 spin_unlock_irq(&cinfo->suspend_lock);
 332
 333                 if (hi > 0) {
 334                         if (lo < mddev->recovery_cp)
 335                                 mddev->recovery_cp = lo;
 336                         /* wake up thread to continue resync in case resync
 337                          * is not finished */
 338                         if (mddev->recovery_cp != MaxSector) {
 339                             set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
 340                             md_wakeup_thread(mddev->thread);
 341                         }
 342                 }
 343 clear_bit:
 344                 lockres_free(bm_lockres);
 345                 clear_bit(slot, &cinfo->recovery_map);
 346         }
 347 }
 348
 349 static void recover_prep(void *arg)
 350 {
 351         struct mddev *mddev = arg;
 352         struct md_cluster_info *cinfo = mddev->cluster_info;
 353         set_bit(MD_CLUSTER_SUSPEND_READ_BALANCING, &cinfo->state);
 354 }
 355
 356 static void __recover_slot(struct mddev *mddev, int slot)
 357 {
 358         struct md_cluster_info *cinfo = mddev->cluster_info;
 359
 360         set_bit(slot, &cinfo->recovery_map);
 361         if (!cinfo->recovery_thread) {
 362                 cinfo->recovery_thread = md_register_thread(recover_bitmaps,
 363                                 mddev, "recover");
 364                 if (!cinfo->recovery_thread) {
 365                         pr_warn("md-cluster: Could not create recovery thread\n");
 366                         return;
 367                 }
 368         }
 369         md_wakeup_thread(cinfo->recovery_thread);
 370 }
 371
 372 static void recover_slot(void *arg, struct dlm_slot *slot)
 373 {
 374         struct mddev *mddev = arg;
 375         struct md_cluster_info *cinfo = mddev->cluster_info;
 376
 377         pr_info("md-cluster: %s Node %d/%d down. My slot: %d. Initiating recovery.\n",
 378                         mddev->bitmap_info.cluster_name,
 379                         slot->nodeid, slot->slot,
 380                         cinfo->slot_number);
 381         /* deduct one since dlm slot starts from one while the num of
 382          * cluster-md begins with 0 */
 383         __recover_slot(mddev, slot->slot - 1);
 384 }
 385
 386 static void recover_done(void *arg, struct dlm_slot *slots,
 387                 int num_slots, int our_slot,
 388                 uint32_t generation)
 389 {
 390         struct mddev *mddev = arg;
 391         struct md_cluster_info *cinfo = mddev->cluster_info;
 392
 393         cinfo->slot_number = our_slot;
 394         /* completion is only need to be complete when node join cluster,
 395          * it doesn't need to run during another node's failure */
 396         if (test_bit(MD_CLUSTER_BEGIN_JOIN_CLUSTER, &cinfo->state)) {
 397                 complete(&cinfo->completion);
 398                 clear_bit(MD_CLUSTER_BEGIN_JOIN_CLUSTER, &cinfo->state);
 399         }
 400         clear_bit(MD_CLUSTER_SUSPEND_READ_BALANCING, &cinfo->state);
 401 }
 402
/* Lockspace callbacks passed to dlm_new_lockspace(). They are invoked when
 * this node joins the cluster and whenever lock recovery runs because a
 * node failed: recover_prep() first, recover_slot() for a failed slot,
 * recover_done() last. */
static const struct dlm_lockspace_ops md_ls_ops = {
	.recover_prep = recover_prep,
	.recover_slot = recover_slot,
	.recover_done = recover_done,
};
 410
 411 /*
 412  * The BAST function for the ack lock resource
 413  * This function wakes up the receive thread in
 414  * order to receive and process the message.
 415  */
 416 static void ack_bast(void *arg, int mode)
 417 {
 418         struct dlm_lock_resource *res = arg;
 419         struct md_cluster_info *cinfo = res->mddev->cluster_info;
 420
 421         if (mode == DLM_LOCK_EX) {
 422                 if (test_bit(MD_CLUSTER_ALREADY_IN_CLUSTER, &cinfo->state))
 423                         md_wakeup_thread(cinfo->recv_thread);
 424                 else
 425                         set_bit(MD_CLUSTER_PENDING_RECV_EVENT, &cinfo->state);
 426         }
 427 }
 428
 429 static void __remove_suspend_info(struct md_cluster_info *cinfo, int slot)
 430 {
 431         struct suspend_info *s, *tmp;
 432
 433         list_for_each_entry_safe(s, tmp, &cinfo->suspend_list, list)
 434                 if (slot == s->slot) {
 435                         list_del(&s->list);
 436                         kfree(s);
 437                         break;
 438                 }
 439 }
 440
 441 static void remove_suspend_info(struct mddev *mddev, int slot)
 442 {
 443         struct md_cluster_info *cinfo = mddev->cluster_info;
 444         spin_lock_irq(&cinfo->suspend_lock);
 445         __remove_suspend_info(cinfo, slot);
 446         spin_unlock_irq(&cinfo->suspend_lock);
 447         mddev->pers->quiesce(mddev, 2);
 448 }
 449
 450
 451 static void process_suspend_info(struct mddev *mddev,
 452                 int slot, sector_t lo, sector_t hi)
 453 {
 454         struct md_cluster_info *cinfo = mddev->cluster_info;
 455         struct suspend_info *s;
 456
 457         if (!hi) {
 458                 remove_suspend_info(mddev, slot);
 459                 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
 460                 md_wakeup_thread(mddev->thread);
 461                 return;
 462         }
 463
 464         /*
 465          * The bitmaps are not same for different nodes
 466          * if RESYNCING is happening in one node, then
 467          * the node which received the RESYNCING message
 468          * probably will perform resync with the region
 469          * [lo, hi] again, so we could reduce resync time
 470          * a lot if we can ensure that the bitmaps among
 471          * different nodes are match up well.
 472          *
 473          * sync_low/hi is used to record the region which
 474          * arrived in the previous RESYNCING message,
 475          *
 476          * Call bitmap_sync_with_cluster to clear
 477          * NEEDED_MASK and set RESYNC_MASK since
 478          * resync thread is running in another node,
 479          * so we don't need to do the resync again
 480          * with the same section */
 481         bitmap_sync_with_cluster(mddev, cinfo->sync_low,
 482                                         cinfo->sync_hi,
 483                                         lo, hi);
 484         cinfo->sync_low = lo;
 485         cinfo->sync_hi = hi;
 486
 487         s = kzalloc(sizeof(struct suspend_info), GFP_KERNEL);
 488         if (!s)
 489                 return;
 490         s->slot = slot;
 491         s->lo = lo;
 492         s->hi = hi;
 493         mddev->pers->quiesce(mddev, 1);
 494         mddev->pers->quiesce(mddev, 0);
 495         spin_lock_irq(&cinfo->suspend_lock);
 496         /* Remove existing entry (if exists) before adding */
 497         __remove_suspend_info(cinfo, slot);
 498         list_add(&s->list, &cinfo->suspend_list);
 499         spin_unlock_irq(&cinfo->suspend_lock);
 500         mddev->pers->quiesce(mddev, 2);
 501 }
 502
 503 static void process_add_new_disk(struct mddev *mddev, struct cluster_msg *cmsg)
 504 {
 505         char disk_uuid[64];
 506         struct md_cluster_info *cinfo = mddev->cluster_info;
 507         char event_name[] = "EVENT=ADD_DEVICE";
 508         char raid_slot[16];
 509         char *envp[] = {event_name, disk_uuid, raid_slot, NULL};
 510         int len;
 511
 512         len = snprintf(disk_uuid, 64, "DEVICE_UUID=");
 513         sprintf(disk_uuid + len, "%pU", cmsg->uuid);
 514         snprintf(raid_slot, 16, "RAID_DISK=%d", le32_to_cpu(cmsg->raid_slot));
 515         pr_info("%s:%d Sending kobject change with %s and %s\n", __func__, __LINE__, disk_uuid, raid_slot);
 516         init_completion(&cinfo->newdisk_completion);
 517         set_bit(MD_CLUSTER_WAITING_FOR_NEWDISK, &cinfo->state);
 518         kobject_uevent_env(&disk_to_dev(mddev->gendisk)->kobj, KOBJ_CHANGE, envp);
 519         wait_for_completion_timeout(&cinfo->newdisk_completion,
 520                         NEW_DEV_TIMEOUT);
 521         clear_bit(MD_CLUSTER_WAITING_FOR_NEWDISK, &cinfo->state);
 522 }
 523
 524
 525 static void process_metadata_update(struct mddev *mddev, struct cluster_msg *msg)
 526 {
 527         struct md_cluster_info *cinfo = mddev->cluster_info;
 528         mddev->good_device_nr = le32_to_cpu(msg->raid_slot);
 529         set_bit(MD_RELOAD_SB, &mddev->flags);
 530         dlm_lock_sync(cinfo->no_new_dev_lockres, DLM_LOCK_CR);
 531         md_wakeup_thread(mddev->thread);
 532 }
 533
 534 static void process_remove_disk(struct mddev *mddev, struct cluster_msg *msg)
 535 {
 536         struct md_rdev *rdev;
 537
 538         rcu_read_lock();
 539         rdev = md_find_rdev_nr_rcu(mddev, le32_to_cpu(msg->raid_slot));
 540         if (rdev) {
 541                 set_bit(ClusterRemove, &rdev->flags);
 542                 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
 543                 md_wakeup_thread(mddev->thread);
 544         }
 545         else
 546                 pr_warn("%s: %d Could not find disk(%d) to REMOVE\n",
 547                         __func__, __LINE__, le32_to_cpu(msg->raid_slot));
 548         rcu_read_unlock();
 549 }
 550
 551 static void process_readd_disk(struct mddev *mddev, struct cluster_msg *msg)
 552 {
 553         struct md_rdev *rdev;
 554
 555         rcu_read_lock();
 556         rdev = md_find_rdev_nr_rcu(mddev, le32_to_cpu(msg->raid_slot));
 557         if (rdev && test_bit(Faulty, &rdev->flags))
 558                 clear_bit(Faulty, &rdev->flags);
 559         else
 560                 pr_warn("%s: %d Could not find disk(%d) which is faulty",
 561                         __func__, __LINE__, le32_to_cpu(msg->raid_slot));
 562         rcu_read_unlock();
 563 }
 564
 565 static int process_recvd_msg(struct mddev *mddev, struct cluster_msg *msg)
 566 {
 567         int ret = 0;
 568
 569         if (WARN(mddev->cluster_info->slot_number - 1 == le32_to_cpu(msg->slot),
 570                 "node %d received it's own msg\n", le32_to_cpu(msg->slot)))
 571                 return -1;
 572         switch (le32_to_cpu(msg->type)) {
 573         case METADATA_UPDATED:
 574                 process_metadata_update(mddev, msg);
 575                 break;
 576         case RESYNCING:
 577                 process_suspend_info(mddev, le32_to_cpu(msg->slot),
 578                                      le64_to_cpu(msg->low),
 579                                      le64_to_cpu(msg->high));
 580                 break;
 581         case NEWDISK:
 582                 process_add_new_disk(mddev, msg);
 583                 break;
 584         case REMOVE:
 585                 process_remove_disk(mddev, msg);
 586                 break;
 587         case RE_ADD:
 588                 process_readd_disk(mddev, msg);
 589                 break;
 590         case BITMAP_NEEDS_SYNC:
 591                 __recover_slot(mddev, le32_to_cpu(msg->slot));
 592                 break;
 593         default:
 594                 ret = -1;
 595                 pr_warn("%s:%d Received unknown message from %d\n",
 596                         __func__, __LINE__, msg->slot);
 597         }
 598         return ret;
 599 }
 600
 601 /*
 602  * thread for receiving message
 603  */
 604 static void recv_daemon(struct md_thread *thread)
 605 {
 606         struct md_cluster_info *cinfo = thread->mddev->cluster_info;
 607         struct dlm_lock_resource *ack_lockres = cinfo->ack_lockres;
 608         struct dlm_lock_resource *message_lockres = cinfo->message_lockres;
 609         struct cluster_msg msg;
 610         int ret;
 611
 612         mutex_lock(&cinfo->recv_mutex);
 613         /*get CR on Message*/
 614         if (dlm_lock_sync(message_lockres, DLM_LOCK_CR)) {
 615                 pr_err("md/raid1:failed to get CR on MESSAGE\n");
 616                 mutex_unlock(&cinfo->recv_mutex);
 617                 return;
 618         }
 619
 620         /* read lvb and wake up thread to process this message_lockres */
 621         memcpy(&msg, message_lockres->lksb.sb_lvbptr, sizeof(struct cluster_msg));
 622         ret = process_recvd_msg(thread->mddev, &msg);
 623         if (ret)
 624                 goto out;
 625
 626         /*release CR on ack_lockres*/
 627         ret = dlm_unlock_sync(ack_lockres);
 628         if (unlikely(ret != 0))
 629                 pr_info("unlock ack failed return %d\n", ret);
 630         /*up-convert to PR on message_lockres*/
 631         ret = dlm_lock_sync(message_lockres, DLM_LOCK_PR);
 632         if (unlikely(ret != 0))
 633                 pr_info("lock PR on msg failed return %d\n", ret);
 634         /*get CR on ack_lockres again*/
 635         ret = dlm_lock_sync(ack_lockres, DLM_LOCK_CR);
 636         if (unlikely(ret != 0))
 637                 pr_info("lock CR on ack failed return %d\n", ret);
 638 out:
 639         /*release CR on message_lockres*/
 640         ret = dlm_unlock_sync(message_lockres);
 641         if (unlikely(ret != 0))
 642                 pr_info("unlock msg failed return %d\n", ret);
 643         mutex_unlock(&cinfo->recv_mutex);
 644 }
 645
 646 /* lock_token()
 647  * Takes the lock on the TOKEN lock resource so no other
 648  * node can communicate while the operation is underway.
 649  */
 650 static int lock_token(struct md_cluster_info *cinfo)
 651 {
 652         int error;
 653
 654         error = dlm_lock_sync(cinfo->token_lockres, DLM_LOCK_EX);
 655         if (error)
 656                 pr_err("md-cluster(%s:%d): failed to get EX on TOKEN (%d)\n",
 657                                 __func__, __LINE__, error);
 658
 659         /* Lock the receive sequence */
 660         mutex_lock(&cinfo->recv_mutex);
 661         return error;
 662 }
 663
 664 /* lock_comm()
 665  * Sets the MD_CLUSTER_SEND_LOCK bit to lock the send channel.
 666  */
 667 static int lock_comm(struct md_cluster_info *cinfo)
 668 {
 669         wait_event(cinfo->wait,
 670                    !test_and_set_bit(MD_CLUSTER_SEND_LOCK, &cinfo->state));
 671
 672         return lock_token(cinfo);
 673 }
 674
 675 static void unlock_comm(struct md_cluster_info *cinfo)
 676 {
 677         WARN_ON(cinfo->token_lockres->mode != DLM_LOCK_EX);
 678         mutex_unlock(&cinfo->recv_mutex);
 679         dlm_unlock_sync(cinfo->token_lockres);
 680         clear_bit(MD_CLUSTER_SEND_LOCK, &cinfo->state);
 681         wake_up(&cinfo->wait);
 682 }
 683
 684 /* __sendmsg()
 685  * This function performs the actual sending of the message. This function is
 686  * usually called after performing the encompassing operation
 687  * The function:
 688  * 1. Grabs the message lockresource in EX mode
 689  * 2. Copies the message to the message LVB
 690  * 3. Downconverts message lockresource to CW
 691  * 4. Upconverts ack lock resource from CR to EX. This forces the BAST on other nodes
 692  *    and the other nodes read the message. The thread will wait here until all other
 693  *    nodes have released ack lock resource.
 694  * 5. Downconvert ack lockresource to CR
 695  */
 696 static int __sendmsg(struct md_cluster_info *cinfo, struct cluster_msg *cmsg)
 697 {
 698         int error;
 699         int slot = cinfo->slot_number - 1;
 700
 701         cmsg->slot = cpu_to_le32(slot);
 702         /*get EX on Message*/
 703         error = dlm_lock_sync(cinfo->message_lockres, DLM_LOCK_EX);
 704         if (error) {
 705                 pr_err("md-cluster: failed to get EX on MESSAGE (%d)\n", error);
 706                 goto failed_message;
 707         }
 708
 709         memcpy(cinfo->message_lockres->lksb.sb_lvbptr, (void *)cmsg,
 710                         sizeof(struct cluster_msg));
 711         /*down-convert EX to CW on Message*/
 712         error = dlm_lock_sync(cinfo->message_lockres, DLM_LOCK_CW);
 713         if (error) {
 714                 pr_err("md-cluster: failed to convert EX to CW on MESSAGE(%d)\n",
 715                                 error);
 716                 goto failed_ack;
 717         }
 718
 719         /*up-convert CR to EX on Ack*/
 720         error = dlm_lock_sync(cinfo->ack_lockres, DLM_LOCK_EX);
 721         if (error) {
 722                 pr_err("md-cluster: failed to convert CR to EX on ACK(%d)\n",
 723                                 error);
 724                 goto failed_ack;
 725         }
 726
 727         /*down-convert EX to CR on Ack*/
 728         error = dlm_lock_sync(cinfo->ack_lockres, DLM_LOCK_CR);
 729         if (error) {
 730                 pr_err("md-cluster: failed to convert EX to CR on ACK(%d)\n",
 731                                 error);
 732                 goto failed_ack;
 733         }
 734
 735 failed_ack:
 736         error = dlm_unlock_sync(cinfo->message_lockres);
 737         if (unlikely(error != 0)) {
 738                 pr_err("md-cluster: failed convert to NL on MESSAGE(%d)\n",
 739                         error);
 740                 /* in case the message can't be released due to some reason */
 741                 goto failed_ack;
 742         }
 743 failed_message:
 744         return error;
 745 }
 746
 747 static int sendmsg(struct md_cluster_info *cinfo, struct cluster_msg *cmsg)
 748 {
 749         int ret;
 750
 751         lock_comm(cinfo);
 752         ret = __sendmsg(cinfo, cmsg);
 753         unlock_comm(cinfo);
 754         return ret;
 755 }
 756
 757 static int gather_all_resync_info(struct mddev *mddev, int total_slots)
 758 {
 759         struct md_cluster_info *cinfo = mddev->cluster_info;
 760         int i, ret = 0;
 761         struct dlm_lock_resource *bm_lockres;
 762         struct suspend_info *s;
 763         char str[64];
 764         sector_t lo, hi;
 765
 766
 767         for (i = 0; i < total_slots; i++) {
 768                 memset(str, '\0', 64);
 769                 snprintf(str, 64, "bitmap%04d", i);
 770                 bm_lockres = lockres_init(mddev, str, NULL, 1);
 771                 if (!bm_lockres)
 772                         return -ENOMEM;
 773                 if (i == (cinfo->slot_number - 1)) {
 774                         lockres_free(bm_lockres);
 775                         continue;
 776                 }
 777
 778                 bm_lockres->flags |= DLM_LKF_NOQUEUE;
 779                 ret = dlm_lock_sync(bm_lockres, DLM_LOCK_PW);
 780                 if (ret == -EAGAIN) {
 781                         memset(bm_lockres->lksb.sb_lvbptr, '\0', LVB_SIZE);
 782                         s = read_resync_info(mddev, bm_lockres);
 783                         if (s) {
 784                                 pr_info("%s:%d Resync[%llu..%llu] in progress on %d\n",
 785                                                 __func__, __LINE__,
 786                                                 (unsigned long long) s->lo,
 787                                                 (unsigned long long) s->hi, i);
 788                                 spin_lock_irq(&cinfo->suspend_lock);
 789                                 s->slot = i;
 790                                 list_add(&s->list, &cinfo->suspend_list);
 791                                 spin_unlock_irq(&cinfo->suspend_lock);
 792                         }
 793                         ret = 0;
 794                         lockres_free(bm_lockres);
 795                         continue;
 796                 }
 797                 if (ret) {
 798                         lockres_free(bm_lockres);
 799                         goto out;
 800                 }
 801
 802                 /* Read the disk bitmap sb and check if it needs recovery */
 803                 ret = bitmap_copy_from_slot(mddev, i, &lo, &hi, false);
 804                 if (ret) {
 805                         pr_warn("md-cluster: Could not gather bitmaps from slot %d", i);
 806                         lockres_free(bm_lockres);
 807                         continue;
 808                 }
 809                 if ((hi > 0) && (lo < mddev->recovery_cp)) {
 810                         set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
 811                         mddev->recovery_cp = lo;
 812                         md_check_recovery(mddev);
 813                 }
 814
 815                 lockres_free(bm_lockres);
 816         }
 817 out:
 818         return ret;
 819 }
 820
 821 static int join(struct mddev *mddev, int nodes)
 822 {
 823         struct md_cluster_info *cinfo;
 824         int ret, ops_rv;
 825         char str[64];
 826
 827         cinfo = kzalloc(sizeof(struct md_cluster_info), GFP_KERNEL);
 828         if (!cinfo)
 829                 return -ENOMEM;
 830
 831         INIT_LIST_HEAD(&cinfo->suspend_list);
 832         spin_lock_init(&cinfo->suspend_lock);
 833         init_completion(&cinfo->completion);
 834         set_bit(MD_CLUSTER_BEGIN_JOIN_CLUSTER, &cinfo->state);
 835         init_waitqueue_head(&cinfo->wait);
 836         mutex_init(&cinfo->recv_mutex);
 837
 838         mddev->cluster_info = cinfo;
 839
 840         memset(str, 0, 64);
 841         sprintf(str, "%pU", mddev->uuid);
 842         ret = dlm_new_lockspace(str, mddev->bitmap_info.cluster_name,
 843                                 DLM_LSFL_FS, LVB_SIZE,
 844                                 &md_ls_ops, mddev, &ops_rv, &cinfo->lockspace);
 845         if (ret)
 846                 goto err;
 847         wait_for_completion(&cinfo->completion);
 848         if (nodes < cinfo->slot_number) {
 849                 pr_err("md-cluster: Slot allotted(%d) is greater than available slots(%d).",
 850                         cinfo->slot_number, nodes);
 851                 ret = -ERANGE;
 852                 goto err;
 853         }
 854         /* Initiate the communication resources */
 855         ret = -ENOMEM;
 856         cinfo->recv_thread = md_register_thread(recv_daemon, mddev, "cluster_recv");
 857         if (!cinfo->recv_thread) {
 858                 pr_err("md-cluster: cannot allocate memory for recv_thread!\n");
 859                 goto err;
 860         }
 861         cinfo->message_lockres = lockres_init(mddev, "message", NULL, 1);
 862         if (!cinfo->message_lockres)
 863                 goto err;
 864         cinfo->token_lockres = lockres_init(mddev, "token", NULL, 0);
 865         if (!cinfo->token_lockres)
 866                 goto err;
 867         cinfo->no_new_dev_lockres = lockres_init(mddev, "no-new-dev", NULL, 0);
 868         if (!cinfo->no_new_dev_lockres)
 869                 goto err;
 870
 871         ret = dlm_lock_sync(cinfo->token_lockres, DLM_LOCK_EX);
 872         if (ret) {
 873                 ret = -EAGAIN;
 874                 pr_err("md-cluster: can't join cluster to avoid lock issue\n");
 875                 goto err;
 876         }
 877         cinfo->ack_lockres = lockres_init(mddev, "ack", ack_bast, 0);
 878         if (!cinfo->ack_lockres) {
 879                 ret = -ENOMEM;
 880                 goto err;
 881         }
 882         /* get sync CR lock on ACK. */
 883         if (dlm_lock_sync(cinfo->ack_lockres, DLM_LOCK_CR))
 884                 pr_err("md-cluster: failed to get a sync CR lock on ACK!(%d)\n",
 885                                 ret);
 886         dlm_unlock_sync(cinfo->token_lockres);
 887         /* get sync CR lock on no-new-dev. */
 888         if (dlm_lock_sync(cinfo->no_new_dev_lockres, DLM_LOCK_CR))
 889                 pr_err("md-cluster: failed to get a sync CR lock on no-new-dev!(%d)\n", ret);
 890
 891
 892         pr_info("md-cluster: Joined cluster %s slot %d\n", str, cinfo->slot_number);
 893         snprintf(str, 64, "bitmap%04d", cinfo->slot_number - 1);
 894         cinfo->bitmap_lockres = lockres_init(mddev, str, NULL, 1);
 895         if (!cinfo->bitmap_lockres) {
 896                 ret = -ENOMEM;
 897                 goto err;
 898         }
 899         if (dlm_lock_sync(cinfo->bitmap_lockres, DLM_LOCK_PW)) {
 900                 pr_err("Failed to get bitmap lock\n");
 901                 ret = -EINVAL;
 902                 goto err;
 903         }
 904
 905         cinfo->resync_lockres = lockres_init(mddev, "resync", NULL, 0);
 906         if (!cinfo->resync_lockres) {
 907                 ret = -ENOMEM;
 908                 goto err;
 909         }
 910
 911         return 0;
 912 err:
 913         md_unregister_thread(&cinfo->recovery_thread);
 914         md_unregister_thread(&cinfo->recv_thread);
 915         lockres_free(cinfo->message_lockres);
 916         lockres_free(cinfo->token_lockres);
 917         lockres_free(cinfo->ack_lockres);
 918         lockres_free(cinfo->no_new_dev_lockres);
 919         lockres_free(cinfo->resync_lockres);
 920         lockres_free(cinfo->bitmap_lockres);
 921         if (cinfo->lockspace)
 922                 dlm_release_lockspace(cinfo->lockspace, 2);
 923         mddev->cluster_info = NULL;
 924         kfree(cinfo);
 925         return ret;
 926 }
 927
 928 static void load_bitmaps(struct mddev *mddev, int total_slots)
 929 {
 930         struct md_cluster_info *cinfo = mddev->cluster_info;
 931
 932         /* load all the node's bitmap info for resync */
 933         if (gather_all_resync_info(mddev, total_slots))
 934                 pr_err("md-cluster: failed to gather all resyn infos\n");
 935         set_bit(MD_CLUSTER_ALREADY_IN_CLUSTER, &cinfo->state);
 936         /* wake up recv thread in case something need to be handled */
 937         if (test_and_clear_bit(MD_CLUSTER_PENDING_RECV_EVENT, &cinfo->state))
 938                 md_wakeup_thread(cinfo->recv_thread);
 939 }
 940
 941 static void resync_bitmap(struct mddev *mddev)
 942 {
 943         struct md_cluster_info *cinfo = mddev->cluster_info;
 944         struct cluster_msg cmsg = {0};
 945         int err;
 946
 947         cmsg.type = cpu_to_le32(BITMAP_NEEDS_SYNC);
 948         err = sendmsg(cinfo, &cmsg);
 949         if (err)
 950                 pr_err("%s:%d: failed to send BITMAP_NEEDS_SYNC message (%d)\n",
 951                         __func__, __LINE__, err);
 952 }
 953
 954 static void unlock_all_bitmaps(struct mddev *mddev);
 955 static int leave(struct mddev *mddev)
 956 {
 957         struct md_cluster_info *cinfo = mddev->cluster_info;
 958
 959         if (!cinfo)
 960                 return 0;
 961
 962         /* BITMAP_NEEDS_SYNC message should be sent when node
 963          * is leaving the cluster with dirty bitmap, also we
 964          * can only deliver it when dlm connection is available */
 965         if (cinfo->slot_number > 0 && mddev->recovery_cp != MaxSector)
 966                 resync_bitmap(mddev);
 967
 968         md_unregister_thread(&cinfo->recovery_thread);
 969         md_unregister_thread(&cinfo->recv_thread);
 970         lockres_free(cinfo->message_lockres);
 971         lockres_free(cinfo->token_lockres);
 972         lockres_free(cinfo->ack_lockres);
 973         lockres_free(cinfo->no_new_dev_lockres);
 974         lockres_free(cinfo->resync_lockres);
 975         lockres_free(cinfo->bitmap_lockres);
 976         unlock_all_bitmaps(mddev);
 977         dlm_release_lockspace(cinfo->lockspace, 2);
 978         kfree(cinfo);
 979         return 0;
 980 }
 981
 982 /* slot_number(): Returns the MD slot number to use
 983  * DLM starts the slot numbers from 1, wheras cluster-md
 984  * wants the number to be from zero, so we deduct one
 985  */
 986 static int slot_number(struct mddev *mddev)
 987 {
 988         struct md_cluster_info *cinfo = mddev->cluster_info;
 989
 990         return cinfo->slot_number - 1;
 991 }
 992
 993 /*
 994  * Check if the communication is already locked, else lock the communication
 995  * channel.
 996  * If it is already locked, token is in EX mode, and hence lock_token()
 997  * should not be called.
 998  */
 999 static int metadata_update_start(struct mddev *mddev)
1000 {
1001         struct md_cluster_info *cinfo = mddev->cluster_info;
1002
1003         wait_event(cinfo->wait,
1004                    !test_and_set_bit(MD_CLUSTER_SEND_LOCK, &cinfo->state) ||
1005                    test_and_clear_bit(MD_CLUSTER_SEND_LOCKED_ALREADY, &cinfo->state));
1006
1007         /* If token is already locked, return 0 */
1008         if (cinfo->token_lockres->mode == DLM_LOCK_EX)
1009                 return 0;
1010
1011         return lock_token(cinfo);
1012 }
1013
1014 static int metadata_update_finish(struct mddev *mddev)
1015 {
1016         struct md_cluster_info *cinfo = mddev->cluster_info;
1017         struct cluster_msg cmsg;
1018         struct md_rdev *rdev;
1019         int ret = 0;
1020         int raid_slot = -1;
1021
1022         memset(&cmsg, 0, sizeof(cmsg));
1023         cmsg.type = cpu_to_le32(METADATA_UPDATED);
1024         /* Pick up a good active device number to send.
1025          */
1026         rdev_for_each(rdev, mddev)
1027                 if (rdev->raid_disk > -1 && !test_bit(Faulty, &rdev->flags)) {
1028                         raid_slot = rdev->desc_nr;
1029                         break;
1030                 }
1031         if (raid_slot >= 0) {
1032                 cmsg.raid_slot = cpu_to_le32(raid_slot);
1033                 ret = __sendmsg(cinfo, &cmsg);
1034         } else
1035                 pr_warn("md-cluster: No good device id found to send\n");
1036         clear_bit(MD_CLUSTER_SEND_LOCKED_ALREADY, &cinfo->state);
1037         unlock_comm(cinfo);
1038         return ret;
1039 }
1040
1041 static void metadata_update_cancel(struct mddev *mddev)
1042 {
1043         struct md_cluster_info *cinfo = mddev->cluster_info;
1044         clear_bit(MD_CLUSTER_SEND_LOCKED_ALREADY, &cinfo->state);
1045         unlock_comm(cinfo);
1046 }
1047
1048 static int resync_start(struct mddev *mddev)
1049 {
1050         struct md_cluster_info *cinfo = mddev->cluster_info;
1051         return dlm_lock_sync_interruptible(cinfo->resync_lockres, DLM_LOCK_EX, mddev);
1052 }
1053
1054 static int resync_info_update(struct mddev *mddev, sector_t lo, sector_t hi)
1055 {
1056         struct md_cluster_info *cinfo = mddev->cluster_info;
1057         struct resync_info ri;
1058         struct cluster_msg cmsg = {0};
1059
1060         /* do not send zero again, if we have sent before */
1061         if (hi == 0) {
1062                 memcpy(&ri, cinfo->bitmap_lockres->lksb.sb_lvbptr, sizeof(struct resync_info));
1063                 if (le64_to_cpu(ri.hi) == 0)
1064                         return 0;
1065         }
1066
1067         add_resync_info(cinfo->bitmap_lockres, lo, hi);
1068         /* Re-acquire the lock to refresh LVB */
1069         dlm_lock_sync(cinfo->bitmap_lockres, DLM_LOCK_PW);
1070         cmsg.type = cpu_to_le32(RESYNCING);
1071         cmsg.low = cpu_to_le64(lo);
1072         cmsg.high = cpu_to_le64(hi);
1073
1074         return sendmsg(cinfo, &cmsg);
1075 }
1076
1077 static int resync_finish(struct mddev *mddev)
1078 {
1079         struct md_cluster_info *cinfo = mddev->cluster_info;
1080         dlm_unlock_sync(cinfo->resync_lockres);
1081         return resync_info_update(mddev, 0, 0);
1082 }
1083
1084 static int area_resyncing(struct mddev *mddev, int direction,
1085                 sector_t lo, sector_t hi)
1086 {
1087         struct md_cluster_info *cinfo = mddev->cluster_info;
1088         int ret = 0;
1089         struct suspend_info *s;
1090
1091         if ((direction == READ) &&
1092                 test_bit(MD_CLUSTER_SUSPEND_READ_BALANCING, &cinfo->state))
1093                 return 1;
1094
1095         spin_lock_irq(&cinfo->suspend_lock);
1096         if (list_empty(&cinfo->suspend_list))
1097                 goto out;
1098         list_for_each_entry(s, &cinfo->suspend_list, list)
1099                 if (hi > s->lo && lo < s->hi) {
1100                         ret = 1;
1101                         break;
1102                 }
1103 out:
1104         spin_unlock_irq(&cinfo->suspend_lock);
1105         return ret;
1106 }
1107
1108 /* add_new_disk() - initiates a disk add
1109  * However, if this fails before writing md_update_sb(),
1110  * add_new_disk_cancel() must be called to release token lock
1111  */
1112 static int add_new_disk(struct mddev *mddev, struct md_rdev *rdev)
1113 {
1114         struct md_cluster_info *cinfo = mddev->cluster_info;
1115         struct cluster_msg cmsg;
1116         int ret = 0;
1117         struct mdp_superblock_1 *sb = page_address(rdev->sb_page);
1118         char *uuid = sb->device_uuid;
1119
1120         memset(&cmsg, 0, sizeof(cmsg));
1121         cmsg.type = cpu_to_le32(NEWDISK);
1122         memcpy(cmsg.uuid, uuid, 16);
1123         cmsg.raid_slot = cpu_to_le32(rdev->desc_nr);
1124         lock_comm(cinfo);
1125         ret = __sendmsg(cinfo, &cmsg);
1126         if (ret) {
1127                 unlock_comm(cinfo);
1128                 return ret;
1129         }
1130         cinfo->no_new_dev_lockres->flags |= DLM_LKF_NOQUEUE;
1131         ret = dlm_lock_sync(cinfo->no_new_dev_lockres, DLM_LOCK_EX);
1132         cinfo->no_new_dev_lockres->flags &= ~DLM_LKF_NOQUEUE;
1133         /* Some node does not "see" the device */
1134         if (ret == -EAGAIN)
1135                 ret = -ENOENT;
1136         if (ret)
1137                 unlock_comm(cinfo);
1138         else {
1139                 dlm_lock_sync(cinfo->no_new_dev_lockres, DLM_LOCK_CR);
1140                 /* Since MD_CHANGE_DEVS will be set in add_bound_rdev which
1141                  * will run soon after add_new_disk, the below path will be
1142                  * invoked:
1143                  *   md_wakeup_thread(mddev->thread)
1144                  *      -> conf->thread (raid1d)
1145                  *      -> md_check_recovery -> md_update_sb
1146                  *      -> metadata_update_start/finish
1147                  * MD_CLUSTER_SEND_LOCKED_ALREADY will be cleared eventually.
1148                  *
1149                  * For other failure cases, metadata_update_cancel and
1150                  * add_new_disk_cancel also clear below bit as well.
1151                  * */
1152                 set_bit(MD_CLUSTER_SEND_LOCKED_ALREADY, &cinfo->state);
1153                 wake_up(&cinfo->wait);
1154         }
1155         return ret;
1156 }
1157
1158 static void add_new_disk_cancel(struct mddev *mddev)
1159 {
1160         struct md_cluster_info *cinfo = mddev->cluster_info;
1161         clear_bit(MD_CLUSTER_SEND_LOCKED_ALREADY, &cinfo->state);
1162         unlock_comm(cinfo);
1163 }
1164
1165 static int new_disk_ack(struct mddev *mddev, bool ack)
1166 {
1167         struct md_cluster_info *cinfo = mddev->cluster_info;
1168
1169         if (!test_bit(MD_CLUSTER_WAITING_FOR_NEWDISK, &cinfo->state)) {
1170                 pr_warn("md-cluster(%s): Spurious cluster confirmation\n", mdname(mddev));
1171                 return -EINVAL;
1172         }
1173
1174         if (ack)
1175                 dlm_unlock_sync(cinfo->no_new_dev_lockres);
1176         complete(&cinfo->newdisk_completion);
1177         return 0;
1178 }
1179
1180 static int remove_disk(struct mddev *mddev, struct md_rdev *rdev)
1181 {
1182         struct cluster_msg cmsg = {0};
1183         struct md_cluster_info *cinfo = mddev->cluster_info;
1184         cmsg.type = cpu_to_le32(REMOVE);
1185         cmsg.raid_slot = cpu_to_le32(rdev->desc_nr);
1186         return sendmsg(cinfo, &cmsg);
1187 }
1188
1189 static int lock_all_bitmaps(struct mddev *mddev)
1190 {
1191         int slot, my_slot, ret, held = 1, i = 0;
1192         char str[64];
1193         struct md_cluster_info *cinfo = mddev->cluster_info;
1194
1195         cinfo->other_bitmap_lockres = kzalloc((mddev->bitmap_info.nodes - 1) *
1196                                              sizeof(struct dlm_lock_resource *),
1197                                              GFP_KERNEL);
1198         if (!cinfo->other_bitmap_lockres) {
1199                 pr_err("md: can't alloc mem for other bitmap locks\n");
1200                 return 0;
1201         }
1202
1203         my_slot = slot_number(mddev);
1204         for (slot = 0; slot < mddev->bitmap_info.nodes; slot++) {
1205                 if (slot == my_slot)
1206                         continue;
1207
1208                 memset(str, '\0', 64);
1209                 snprintf(str, 64, "bitmap%04d", slot);
1210                 cinfo->other_bitmap_lockres[i] = lockres_init(mddev, str, NULL, 1);
1211                 if (!cinfo->other_bitmap_lockres[i])
1212                         return -ENOMEM;
1213
1214                 cinfo->other_bitmap_lockres[i]->flags |= DLM_LKF_NOQUEUE;
1215                 ret = dlm_lock_sync(cinfo->other_bitmap_lockres[i], DLM_LOCK_PW);
1216                 if (ret)
1217                         held = -1;
1218                 i++;
1219         }
1220
1221         return held;
1222 }
1223
1224 static void unlock_all_bitmaps(struct mddev *mddev)
1225 {
1226         struct md_cluster_info *cinfo = mddev->cluster_info;
1227         int i;
1228
1229         /* release other node's bitmap lock if they are existed */
1230         if (cinfo->other_bitmap_lockres) {
1231                 for (i = 0; i < mddev->bitmap_info.nodes - 1; i++) {
1232                         if (cinfo->other_bitmap_lockres[i]) {
1233                                 lockres_free(cinfo->other_bitmap_lockres[i]);
1234                         }
1235                 }
1236                 kfree(cinfo->other_bitmap_lockres);
1237                 cinfo->other_bitmap_lockres = NULL;
1238         }
1239 }
1240
1241 static int gather_bitmaps(struct md_rdev *rdev)
1242 {
1243         int sn, err;
1244         sector_t lo, hi;
1245         struct cluster_msg cmsg = {0};
1246         struct mddev *mddev = rdev->mddev;
1247         struct md_cluster_info *cinfo = mddev->cluster_info;
1248
1249         cmsg.type = cpu_to_le32(RE_ADD);
1250         cmsg.raid_slot = cpu_to_le32(rdev->desc_nr);
1251         err = sendmsg(cinfo, &cmsg);
1252         if (err)
1253                 goto out;
1254
1255         for (sn = 0; sn < mddev->bitmap_info.nodes; sn++) {
1256                 if (sn == (cinfo->slot_number - 1))
1257                         continue;
1258                 err = bitmap_copy_from_slot(mddev, sn, &lo, &hi, false);
1259                 if (err) {
1260                         pr_warn("md-cluster: Could not gather bitmaps from slot %d", sn);
1261                         goto out;
1262                 }
1263                 if ((hi > 0) && (lo < mddev->recovery_cp))
1264                         mddev->recovery_cp = lo;
1265         }
1266 out:
1267         return err;
1268 }
1269
1270 static struct md_cluster_operations cluster_ops = {
1271         .join   = join,
1272         .leave  = leave,
1273         .slot_number = slot_number,
1274         .resync_start = resync_start,
1275         .resync_finish = resync_finish,
1276         .resync_info_update = resync_info_update,
1277         .metadata_update_start = metadata_update_start,
1278         .metadata_update_finish = metadata_update_finish,
1279         .metadata_update_cancel = metadata_update_cancel,
1280         .area_resyncing = area_resyncing,
1281         .add_new_disk = add_new_disk,
1282         .add_new_disk_cancel = add_new_disk_cancel,
1283         .new_disk_ack = new_disk_ack,
1284         .remove_disk = remove_disk,
1285         .load_bitmaps = load_bitmaps,
1286         .gather_bitmaps = gather_bitmaps,
1287         .lock_all_bitmaps = lock_all_bitmaps,
1288         .unlock_all_bitmaps = unlock_all_bitmaps,
1289 };
1290
1291 static int __init cluster_init(void)
1292 {
1293         pr_warn("md-cluster: EXPERIMENTAL. Use with caution\n");
1294         pr_info("Registering Cluster MD functions\n");
1295         register_md_cluster_operations(&cluster_ops, THIS_MODULE);
1296         return 0;
1297 }
1298
/* cluster_exit() - module exit point: unregister the cluster operations. */
static void cluster_exit(void)
{
        unregister_md_cluster_operations();
}
1303
1304 module_init(cluster_init);
1305 module_exit(cluster_exit);
1306 MODULE_AUTHOR("SUSE");
1307 MODULE_LICENSE("GPL");
1308 MODULE_DESCRIPTION("Clustering support for MD");