// SPDX-License-Identifier: GPL-2.0

#include "messages.h"
#include "tree-mod-log.h"
#include "disk-io.h"
#include "fs.h"
#include "accessors.h"
#include "tree-checker.h"

struct tree_mod_root {
        u64 logical;
        u8 level;
};

struct tree_mod_elem {
        struct rb_node node;
        u64 logical;
        u64 seq;
        enum btrfs_mod_log_op op;

        /*
         * This is used for BTRFS_MOD_LOG_KEY_* and BTRFS_MOD_LOG_MOVE_KEYS
         * operations.
         */
        int slot;

        /* This is used for BTRFS_MOD_LOG_KEY_* and BTRFS_MOD_LOG_ROOT_REPLACE. */
        u64 generation;

        /* Those are used for op == BTRFS_MOD_LOG_KEY_{REPLACE,REMOVE}. */
        struct btrfs_disk_key key;
        u64 blockptr;

        /* This is used for op == BTRFS_MOD_LOG_MOVE_KEYS. */
        struct {
                int dst_slot;
                int nr_items;
        } move;

        /* This is used for op == BTRFS_MOD_LOG_ROOT_REPLACE. */
        struct tree_mod_root old_root;
};

/*
 * Pull a new tree mod seq number for our operation.
 */
static u64 btrfs_inc_tree_mod_seq(struct btrfs_fs_info *fs_info)
{
        return atomic64_inc_return(&fs_info->tree_mod_seq);
}

/*
 * This adds a new blocker to the tree mod log's blocker list if the @elem
 * passed does not already have a sequence number set. So when a caller expects
 * to record tree modifications, it should ensure that elem->seq is zero
 * before calling btrfs_get_tree_mod_seq().
 * Returns a fresh, unused tree mod log sequence number, even if no new
 * blocker was added.
 */
u64 btrfs_get_tree_mod_seq(struct btrfs_fs_info *fs_info,
                           struct btrfs_seq_list *elem)
{
        write_lock(&fs_info->tree_mod_log_lock);
        if (!elem->seq) {
                elem->seq = btrfs_inc_tree_mod_seq(fs_info);
                list_add_tail(&elem->list, &fs_info->tree_mod_seq_list);
                set_bit(BTRFS_FS_TREE_MOD_LOG_USERS, &fs_info->flags);
        }
        write_unlock(&fs_info->tree_mod_log_lock);

        return elem->seq;
}
73
74 void btrfs_put_tree_mod_seq(struct btrfs_fs_info *fs_info,
75                             struct btrfs_seq_list *elem)
76 {
77         struct rb_root *tm_root;
78         struct rb_node *node;
79         struct rb_node *next;
80         struct tree_mod_elem *tm;
81         u64 min_seq = BTRFS_SEQ_LAST;
82         u64 seq_putting = elem->seq;
83
84         if (!seq_putting)
85                 return;
86
87         write_lock(&fs_info->tree_mod_log_lock);
88         list_del(&elem->list);
89         elem->seq = 0;
90
91         if (list_empty(&fs_info->tree_mod_seq_list)) {
92                 clear_bit(BTRFS_FS_TREE_MOD_LOG_USERS, &fs_info->flags);
93         } else {
94                 struct btrfs_seq_list *first;
95
96                 first = list_first_entry(&fs_info->tree_mod_seq_list,
97                                          struct btrfs_seq_list, list);
98                 if (seq_putting > first->seq) {
99                         /*
100                          * Blocker with lower sequence number exists, we cannot
101                          * remove anything from the log.
102                          */
103                         write_unlock(&fs_info->tree_mod_log_lock);
104                         return;
105                 }
106                 min_seq = first->seq;
107         }
108
109         /*
110          * Anything that's lower than the lowest existing (read: blocked)
111          * sequence number can be removed from the tree.
112          */
113         tm_root = &fs_info->tree_mod_log;
114         for (node = rb_first(tm_root); node; node = next) {
115                 next = rb_next(node);
116                 tm = rb_entry(node, struct tree_mod_elem, node);
117                 if (tm->seq >= min_seq)
118                         continue;
119                 rb_erase(node, tm_root);
120                 kfree(tm);
121         }
122         write_unlock(&fs_info->tree_mod_log_lock);
123 }
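
/*
 * A minimal usage sketch for the get/put pair above (illustrative only, not
 * code from this file): a tree mod log user pins a sequence number for the
 * duration of its operation and drops it when done, which lets
 * btrfs_put_tree_mod_seq() prune log entries that no remaining user can
 * still need:
 *
 *	struct btrfs_seq_list elem = BTRFS_SEQ_LIST_INIT(elem);
 *	u64 time_seq;
 *
 *	time_seq = btrfs_get_tree_mod_seq(fs_info, &elem);
 *	// ... walk old tree versions using time_seq ...
 *	btrfs_put_tree_mod_seq(fs_info, &elem);
 */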

/*
 * Key order of the log:
 *       node/leaf start address -> sequence
 *
 * The 'start address' is the logical address of the *new* root node for root
 * replace operations, or the logical address of the affected block for all
 * other operations.
 */
static noinline int tree_mod_log_insert(struct btrfs_fs_info *fs_info,
                                        struct tree_mod_elem *tm)
{
        struct rb_root *tm_root;
        struct rb_node **new;
        struct rb_node *parent = NULL;
        struct tree_mod_elem *cur;

        lockdep_assert_held_write(&fs_info->tree_mod_log_lock);

        tm->seq = btrfs_inc_tree_mod_seq(fs_info);

        tm_root = &fs_info->tree_mod_log;
        new = &tm_root->rb_node;
        while (*new) {
                cur = rb_entry(*new, struct tree_mod_elem, node);
                parent = *new;
                if (cur->logical < tm->logical)
                        new = &((*new)->rb_left);
                else if (cur->logical > tm->logical)
                        new = &((*new)->rb_right);
                else if (cur->seq < tm->seq)
                        new = &((*new)->rb_left);
                else if (cur->seq > tm->seq)
                        new = &((*new)->rb_right);
                else
                        return -EEXIST;
        }

        rb_link_node(&tm->node, parent, new);
        rb_insert_color(&tm->node, tm_root);
        return 0;
}
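
/*
 * Ordering example for the insertion above (illustrative, derived from the
 * comparisons in tree_mod_log_insert()): entries are keyed by (logical, seq),
 * so all modifications of one block sit next to each other and are ordered by
 * their sequence number within that group, e.g. for a block at logical 4096
 * the entries (4096, 10), (4096, 12) and (4096, 17) are adjacent. Note the
 * comparisons send smaller keys to the right, so an in-order walk with
 * rb_next() (as done in tree_mod_log_rewind()) visits entries from newest to
 * oldest.
 */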

/*
 * Determines if logging can be omitted. Returns true if it can. Otherwise, it
 * returns false with the tree_mod_log_lock acquired. The caller must hold the
 * lock until all tree mod log insertions are recorded in the rb tree and then
 * write unlock fs_info::tree_mod_log_lock.
 */
static bool tree_mod_dont_log(struct btrfs_fs_info *fs_info, struct extent_buffer *eb)
{
        if (!test_bit(BTRFS_FS_TREE_MOD_LOG_USERS, &fs_info->flags))
                return true;
        if (eb && btrfs_header_level(eb) == 0)
                return true;

        write_lock(&fs_info->tree_mod_log_lock);
        if (list_empty(&fs_info->tree_mod_seq_list)) {
                write_unlock(&fs_info->tree_mod_log_lock);
                return true;
        }

        return false;
}

/* Similar to tree_mod_dont_log(), but doesn't acquire any locks. */
static bool tree_mod_need_log(const struct btrfs_fs_info *fs_info,
                              struct extent_buffer *eb)
{
        if (!test_bit(BTRFS_FS_TREE_MOD_LOG_USERS, &fs_info->flags))
                return false;
        if (eb && btrfs_header_level(eb) == 0)
                return false;

        return true;
}

static struct tree_mod_elem *alloc_tree_mod_elem(struct extent_buffer *eb,
                                                 int slot,
                                                 enum btrfs_mod_log_op op)
{
        struct tree_mod_elem *tm;

        tm = kzalloc(sizeof(*tm), GFP_NOFS);
        if (!tm)
                return NULL;

        tm->logical = eb->start;
        if (op != BTRFS_MOD_LOG_KEY_ADD) {
                btrfs_node_key(eb, &tm->key, slot);
                tm->blockptr = btrfs_node_blockptr(eb, slot);
        }
        tm->op = op;
        tm->slot = slot;
        tm->generation = btrfs_node_ptr_generation(eb, slot);
        RB_CLEAR_NODE(&tm->node);

        return tm;
}
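
/*
 * Note: for BTRFS_MOD_LOG_KEY_ADD the helper above deliberately skips
 * recording the key and block pointer, since rewinding an addition only
 * requires dropping the item again (see the KEY_ADD case in
 * tree_mod_log_rewind()), while the remove/replace operations must be able
 * to restore the previous contents of the slot.
 */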

int btrfs_tree_mod_log_insert_key(struct extent_buffer *eb, int slot,
                                  enum btrfs_mod_log_op op)
{
        struct tree_mod_elem *tm;
        int ret = 0;

        if (!tree_mod_need_log(eb->fs_info, eb))
                return 0;

        tm = alloc_tree_mod_elem(eb, slot, op);
        if (!tm)
                ret = -ENOMEM;

        if (tree_mod_dont_log(eb->fs_info, eb)) {
                kfree(tm);
                /*
                 * Don't error if we failed to allocate memory because we don't
                 * need to log.
                 */
                return 0;
        } else if (ret != 0) {
                /*
                 * We previously failed to allocate memory and we need to log,
                 * so we have to fail.
                 */
                goto out_unlock;
        }

        ret = tree_mod_log_insert(eb->fs_info, tm);
out_unlock:
        write_unlock(&eb->fs_info->tree_mod_log_lock);
        if (ret)
                kfree(tm);

        return ret;
}
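
/*
 * Hypothetical call site sketch, mirroring how ctree.c typically uses this
 * helper (names like 'parent', 'slot' and 'new_key' are placeholders): the
 * old key pointer is logged before it is overwritten, so that
 * tree_mod_log_rewind() can later restore it:
 *
 *	ret = btrfs_tree_mod_log_insert_key(parent, slot,
 *					    BTRFS_MOD_LOG_KEY_REPLACE);
 *	if (ret < 0) {
 *		btrfs_abort_transaction(trans, ret);
 *		return ret;
 *	}
 *	btrfs_set_node_key(parent, &new_key, slot);
 */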

static struct tree_mod_elem *tree_mod_log_alloc_move(struct extent_buffer *eb,
                                                     int dst_slot, int src_slot,
                                                     int nr_items)
{
        struct tree_mod_elem *tm;

        tm = kzalloc(sizeof(*tm), GFP_NOFS);
        if (!tm)
                return ERR_PTR(-ENOMEM);

        tm->logical = eb->start;
        tm->slot = src_slot;
        tm->move.dst_slot = dst_slot;
        tm->move.nr_items = nr_items;
        tm->op = BTRFS_MOD_LOG_MOVE_KEYS;
        RB_CLEAR_NODE(&tm->node);

        return tm;
}
280
281 int btrfs_tree_mod_log_insert_move(struct extent_buffer *eb,
282                                    int dst_slot, int src_slot,
283                                    int nr_items)
284 {
285         struct tree_mod_elem *tm = NULL;
286         struct tree_mod_elem **tm_list = NULL;
287         int ret = 0;
288         int i;
289         bool locked = false;
290
291         if (!tree_mod_need_log(eb->fs_info, eb))
292                 return 0;
293
294         tm_list = kcalloc(nr_items, sizeof(struct tree_mod_elem *), GFP_NOFS);
295         if (!tm_list) {
296                 ret = -ENOMEM;
297                 goto lock;
298         }
299
300         tm = tree_mod_log_alloc_move(eb, dst_slot, src_slot, nr_items);
301         if (IS_ERR(tm)) {
302                 ret = PTR_ERR(tm);
303                 tm = NULL;
304                 goto lock;
305         }
306
307         for (i = 0; i + dst_slot < src_slot && i < nr_items; i++) {
308                 tm_list[i] = alloc_tree_mod_elem(eb, i + dst_slot,
309                                 BTRFS_MOD_LOG_KEY_REMOVE_WHILE_MOVING);
310                 if (!tm_list[i]) {
311                         ret = -ENOMEM;
312                         goto lock;
313                 }
314         }
315
316 lock:
317         if (tree_mod_dont_log(eb->fs_info, eb)) {
318                 /*
319                  * Don't error if we failed to allocate memory because we don't
320                  * need to log.
321                  */
322                 ret = 0;
323                 goto free_tms;
324         }
325         locked = true;
326
327         /*
328          * We previously failed to allocate memory and we need to log, so we
329          * have to fail.
330          */
331         if (ret != 0)
332                 goto free_tms;
333
334         /*
335          * When we override something during the move, we log these removals.
336          * This can only happen when we move towards the beginning of the
337          * buffer, i.e. dst_slot < src_slot.
338          */
339         for (i = 0; i + dst_slot < src_slot && i < nr_items; i++) {
340                 ret = tree_mod_log_insert(eb->fs_info, tm_list[i]);
341                 if (ret)
342                         goto free_tms;
343         }
344
345         ret = tree_mod_log_insert(eb->fs_info, tm);
346         if (ret)
347                 goto free_tms;
348         write_unlock(&eb->fs_info->tree_mod_log_lock);
349         kfree(tm_list);
350
351         return 0;
352
353 free_tms:
354         if (tm_list) {
355                 for (i = 0; i < nr_items; i++) {
356                         if (tm_list[i] && !RB_EMPTY_NODE(&tm_list[i]->node))
357                                 rb_erase(&tm_list[i]->node, &eb->fs_info->tree_mod_log);
358                         kfree(tm_list[i]);
359                 }
360         }
361         if (locked)
362                 write_unlock(&eb->fs_info->tree_mod_log_lock);
363         kfree(tm_list);
364         kfree(tm);
365
366         return ret;
367 }

static int tree_mod_log_free_eb(struct btrfs_fs_info *fs_info,
                                struct tree_mod_elem **tm_list,
                                int nritems)
{
        int i, j;
        int ret;

        for (i = nritems - 1; i >= 0; i--) {
                ret = tree_mod_log_insert(fs_info, tm_list[i]);
                if (ret) {
                        for (j = nritems - 1; j > i; j--)
                                rb_erase(&tm_list[j]->node,
                                         &fs_info->tree_mod_log);
                        return ret;
                }
        }

        return 0;
}

int btrfs_tree_mod_log_insert_root(struct extent_buffer *old_root,
                                   struct extent_buffer *new_root,
                                   bool log_removal)
{
        struct btrfs_fs_info *fs_info = old_root->fs_info;
        struct tree_mod_elem *tm = NULL;
        struct tree_mod_elem **tm_list = NULL;
        int nritems = 0;
        int ret = 0;
        int i;

        if (!tree_mod_need_log(fs_info, NULL))
                return 0;

        if (log_removal && btrfs_header_level(old_root) > 0) {
                nritems = btrfs_header_nritems(old_root);
                tm_list = kcalloc(nritems, sizeof(struct tree_mod_elem *),
                                  GFP_NOFS);
                if (!tm_list) {
                        ret = -ENOMEM;
                        goto lock;
                }
                for (i = 0; i < nritems; i++) {
                        tm_list[i] = alloc_tree_mod_elem(old_root, i,
                            BTRFS_MOD_LOG_KEY_REMOVE_WHILE_FREEING);
                        if (!tm_list[i]) {
                                ret = -ENOMEM;
                                goto lock;
                        }
                }
        }

        tm = kzalloc(sizeof(*tm), GFP_NOFS);
        if (!tm) {
                ret = -ENOMEM;
                goto lock;
        }

        tm->logical = new_root->start;
        tm->old_root.logical = old_root->start;
        tm->old_root.level = btrfs_header_level(old_root);
        tm->generation = btrfs_header_generation(old_root);
        tm->op = BTRFS_MOD_LOG_ROOT_REPLACE;

lock:
        if (tree_mod_dont_log(fs_info, NULL)) {
                /*
                 * Don't error if we failed to allocate memory because we don't
                 * need to log.
                 */
                ret = 0;
                goto free_tms;
        } else if (ret != 0) {
                /*
                 * We previously failed to allocate memory and we need to log,
                 * so we have to fail.
                 */
                goto out_unlock;
        }

        if (tm_list)
                ret = tree_mod_log_free_eb(fs_info, tm_list, nritems);
        if (!ret)
                ret = tree_mod_log_insert(fs_info, tm);

out_unlock:
        write_unlock(&fs_info->tree_mod_log_lock);
        if (ret)
                goto free_tms;
        kfree(tm_list);

        return ret;

free_tms:
        if (tm_list) {
                for (i = 0; i < nritems; i++)
                        kfree(tm_list[i]);
                kfree(tm_list);
        }
        kfree(tm);

        return ret;
}

static struct tree_mod_elem *__tree_mod_log_search(struct btrfs_fs_info *fs_info,
                                                   u64 start, u64 min_seq,
                                                   bool smallest)
{
        struct rb_root *tm_root;
        struct rb_node *node;
        struct tree_mod_elem *cur = NULL;
        struct tree_mod_elem *found = NULL;

        read_lock(&fs_info->tree_mod_log_lock);
        tm_root = &fs_info->tree_mod_log;
        node = tm_root->rb_node;
        while (node) {
                cur = rb_entry(node, struct tree_mod_elem, node);
                if (cur->logical < start) {
                        node = node->rb_left;
                } else if (cur->logical > start) {
                        node = node->rb_right;
                } else if (cur->seq < min_seq) {
                        node = node->rb_left;
                } else if (!smallest) {
                        /* We want the node with the highest seq */
                        if (found)
                                BUG_ON(found->seq > cur->seq);
                        found = cur;
                        node = node->rb_left;
                } else if (cur->seq > min_seq) {
                        /* We want the node with the smallest seq */
                        if (found)
                                BUG_ON(found->seq < cur->seq);
                        found = cur;
                        node = node->rb_right;
                } else {
                        found = cur;
                        break;
                }
        }
        read_unlock(&fs_info->tree_mod_log_lock);

        return found;
}

/*
 * This returns the element from the log with the smallest time sequence
 * value that's in the log (the oldest log item). Any element with a time
 * sequence lower than min_seq will be ignored.
 */
static struct tree_mod_elem *tree_mod_log_search_oldest(struct btrfs_fs_info *fs_info,
                                                        u64 start, u64 min_seq)
{
        return __tree_mod_log_search(fs_info, start, min_seq, true);
}

/*
 * This returns the element from the log with the largest time sequence
 * value that's in the log (the most recent log item). Any element with
 * a time sequence lower than min_seq will be ignored.
 */
static struct tree_mod_elem *tree_mod_log_search(struct btrfs_fs_info *fs_info,
                                                 u64 start, u64 min_seq)
{
        return __tree_mod_log_search(fs_info, start, min_seq, false);
}
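
/*
 * Worked example for the two search helpers above: if the log holds entries
 * with sequence numbers 5, 8 and 12 for the same block, then with
 * min_seq == 6 tree_mod_log_search_oldest() returns the entry with seq 8
 * (the smallest seq not below min_seq), while tree_mod_log_search() returns
 * the one with seq 12 (the largest seq not below min_seq). The entry with
 * seq 5 is too old and is ignored by both.
 */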

int btrfs_tree_mod_log_eb_copy(struct extent_buffer *dst,
                               struct extent_buffer *src,
                               unsigned long dst_offset,
                               unsigned long src_offset,
                               int nr_items)
{
        struct btrfs_fs_info *fs_info = dst->fs_info;
        int ret = 0;
        struct tree_mod_elem **tm_list = NULL;
        struct tree_mod_elem **tm_list_add = NULL;
        struct tree_mod_elem **tm_list_rem = NULL;
        int i;
        bool locked = false;
        struct tree_mod_elem *dst_move_tm = NULL;
        struct tree_mod_elem *src_move_tm = NULL;
        u32 dst_move_nr_items = btrfs_header_nritems(dst) - dst_offset;
        u32 src_move_nr_items = btrfs_header_nritems(src) - (src_offset + nr_items);

        if (!tree_mod_need_log(fs_info, NULL))
                return 0;

        if (btrfs_header_level(dst) == 0 && btrfs_header_level(src) == 0)
                return 0;

        tm_list = kcalloc(nr_items * 2, sizeof(struct tree_mod_elem *),
                          GFP_NOFS);
        if (!tm_list) {
                ret = -ENOMEM;
                goto lock;
        }

        if (dst_move_nr_items) {
                dst_move_tm = tree_mod_log_alloc_move(dst, dst_offset + nr_items,
                                                      dst_offset, dst_move_nr_items);
                if (IS_ERR(dst_move_tm)) {
                        ret = PTR_ERR(dst_move_tm);
                        dst_move_tm = NULL;
                        goto lock;
                }
        }
        if (src_move_nr_items) {
                src_move_tm = tree_mod_log_alloc_move(src, src_offset,
                                                      src_offset + nr_items,
                                                      src_move_nr_items);
                if (IS_ERR(src_move_tm)) {
                        ret = PTR_ERR(src_move_tm);
                        src_move_tm = NULL;
                        goto lock;
                }
        }

        tm_list_add = tm_list;
        tm_list_rem = tm_list + nr_items;
        for (i = 0; i < nr_items; i++) {
                tm_list_rem[i] = alloc_tree_mod_elem(src, i + src_offset,
                                                     BTRFS_MOD_LOG_KEY_REMOVE);
                if (!tm_list_rem[i]) {
                        ret = -ENOMEM;
                        goto lock;
                }

                tm_list_add[i] = alloc_tree_mod_elem(dst, i + dst_offset,
                                                     BTRFS_MOD_LOG_KEY_ADD);
                if (!tm_list_add[i]) {
                        ret = -ENOMEM;
                        goto lock;
                }
        }

lock:
        if (tree_mod_dont_log(fs_info, NULL)) {
                /*
                 * Don't error if we failed to allocate memory because we don't
                 * need to log.
                 */
                ret = 0;
                goto free_tms;
        }
        locked = true;

        /*
         * We previously failed to allocate memory and we need to log, so we
         * have to fail.
         */
        if (ret != 0)
                goto free_tms;

        if (dst_move_tm) {
                ret = tree_mod_log_insert(fs_info, dst_move_tm);
                if (ret)
                        goto free_tms;
        }
        for (i = 0; i < nr_items; i++) {
                ret = tree_mod_log_insert(fs_info, tm_list_rem[i]);
                if (ret)
                        goto free_tms;
                ret = tree_mod_log_insert(fs_info, tm_list_add[i]);
                if (ret)
                        goto free_tms;
        }
        if (src_move_tm) {
                ret = tree_mod_log_insert(fs_info, src_move_tm);
                if (ret)
                        goto free_tms;
        }

        write_unlock(&fs_info->tree_mod_log_lock);
        kfree(tm_list);

        return 0;

free_tms:
        if (dst_move_tm && !RB_EMPTY_NODE(&dst_move_tm->node))
                rb_erase(&dst_move_tm->node, &fs_info->tree_mod_log);
        kfree(dst_move_tm);
        if (src_move_tm && !RB_EMPTY_NODE(&src_move_tm->node))
                rb_erase(&src_move_tm->node, &fs_info->tree_mod_log);
        kfree(src_move_tm);
        if (tm_list) {
                for (i = 0; i < nr_items * 2; i++) {
                        if (tm_list[i] && !RB_EMPTY_NODE(&tm_list[i]->node))
                                rb_erase(&tm_list[i]->node, &fs_info->tree_mod_log);
                        kfree(tm_list[i]);
                }
        }
        if (locked)
                write_unlock(&fs_info->tree_mod_log_lock);
        kfree(tm_list);

        return ret;
}
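
/*
 * Worked example for the copy logging above (numbers are illustrative):
 * copying nr_items == 2 from src slots [3, 4] to dst slots [5, 6], with src
 * holding 8 items and dst holding 5 at the time of the call, logs:
 *
 *   - no MOVE on dst, because dst_move_nr_items == 5 - 5 == 0 (nothing after
 *     the destination slots needs shifting),
 *   - a KEY_REMOVE on src for slots 3 and 4, paired with a KEY_ADD on dst
 *     for slots 5 and 6,
 *   - a MOVE on src for the 8 - (3 + 2) == 3 trailing items, recording that
 *     slots [5, 7] move down to [3, 5] to close the gap the caller creates.
 */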

int btrfs_tree_mod_log_free_eb(struct extent_buffer *eb)
{
        struct tree_mod_elem **tm_list = NULL;
        int nritems = 0;
        int i;
        int ret = 0;

        if (!tree_mod_need_log(eb->fs_info, eb))
                return 0;

        nritems = btrfs_header_nritems(eb);
        tm_list = kcalloc(nritems, sizeof(struct tree_mod_elem *), GFP_NOFS);
        if (!tm_list) {
                ret = -ENOMEM;
                goto lock;
        }

        for (i = 0; i < nritems; i++) {
                tm_list[i] = alloc_tree_mod_elem(eb, i,
                                    BTRFS_MOD_LOG_KEY_REMOVE_WHILE_FREEING);
                if (!tm_list[i]) {
                        ret = -ENOMEM;
                        goto lock;
                }
        }

lock:
        if (tree_mod_dont_log(eb->fs_info, eb)) {
                /*
                 * Don't error if we failed to allocate memory because we don't
                 * need to log.
                 */
                ret = 0;
                goto free_tms;
        } else if (ret != 0) {
                /*
                 * We previously failed to allocate memory and we need to log,
                 * so we have to fail.
                 */
                goto out_unlock;
        }

        ret = tree_mod_log_free_eb(eb->fs_info, tm_list, nritems);
out_unlock:
        write_unlock(&eb->fs_info->tree_mod_log_lock);
        if (ret)
                goto free_tms;
        kfree(tm_list);

        return 0;

free_tms:
        if (tm_list) {
                for (i = 0; i < nritems; i++)
                        kfree(tm_list[i]);
                kfree(tm_list);
        }

        return ret;
}

/*
 * Returns the logical address of the oldest predecessor of the given root.
 * Entries older than time_seq are ignored.
 */
static struct tree_mod_elem *tree_mod_log_oldest_root(struct extent_buffer *eb_root,
                                                      u64 time_seq)
{
        struct tree_mod_elem *tm;
        struct tree_mod_elem *found = NULL;
        u64 root_logical = eb_root->start;
        bool looped = false;

        if (!time_seq)
                return NULL;

        /*
         * The very last operation that's logged for a root is the replacement
         * operation (if it is replaced at all). This has the logical address
         * of the *new* root, making it the very first operation that's logged
         * for this root.
         */
        while (1) {
                tm = tree_mod_log_search_oldest(eb_root->fs_info, root_logical,
                                                time_seq);
                if (!looped && !tm)
                        return NULL;
                /*
                 * If there are no tree operations for the oldest root, we
                 * simply return it. This should only happen if that (old)
                 * root is at level 0.
                 */
                if (!tm)
                        break;

                /*
                 * If there's an operation that's not a root replacement, we
                 * found the oldest version of our root. Normally, we'll find a
                 * BTRFS_MOD_LOG_KEY_REMOVE_WHILE_FREEING operation here.
                 */
                if (tm->op != BTRFS_MOD_LOG_ROOT_REPLACE)
                        break;

                found = tm;
                root_logical = tm->old_root.logical;
                looped = true;
        }

        /* If there's no old root to return, return what we found instead. */
        if (!found)
                found = tm;

        return found;
}
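
/*
 * Illustrative chain walk for the helper above: if root A was replaced by B
 * and later B by C, then the log holds a ROOT_REPLACE entry keyed at C's
 * logical address (with old_root pointing to B) and one keyed at B (with
 * old_root pointing to A). Starting from the current root C, the loop hops
 * C -> B -> A and returns the last ROOT_REPLACE entry it saw, whose old_root
 * describes the oldest predecessor covered by time_seq.
 */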

/*
 * tm is a pointer to the first operation to rewind within eb. Then, all
 * previous operations will be rewound (until we reach something older than
 * time_seq).
 */
static void tree_mod_log_rewind(struct btrfs_fs_info *fs_info,
                                struct extent_buffer *eb,
                                u64 time_seq,
                                struct tree_mod_elem *first_tm)
{
        u32 n;
        struct rb_node *next;
        struct tree_mod_elem *tm = first_tm;
        unsigned long o_dst;
        unsigned long o_src;
        unsigned long p_size = sizeof(struct btrfs_key_ptr);
        /*
         * max_slot tracks the maximum valid slot of the rewind eb at every
         * step of the rewind. This is in contrast with 'n' which eventually
         * matches the number of items, but can be wrong during moves or if
         * removes overlap on already valid slots (which is probably separately
         * a bug). We do this to validate the offsets of memmoves for rewinding
         * moves and detect invalid memmoves.
         *
         * Since a rewind eb can start empty, max_slot is a signed integer with
         * a special meaning for -1, which is that no slot is valid to move out
         * of. Any other negative value is invalid.
         */
        int max_slot;
        int move_src_end_slot;
        int move_dst_end_slot;

        n = btrfs_header_nritems(eb);
        max_slot = n - 1;
        read_lock(&fs_info->tree_mod_log_lock);
        while (tm && tm->seq >= time_seq) {
                ASSERT(max_slot >= -1);
                /*
                 * All the operations are recorded with the operator used for
                 * the modification. As we're going backwards, we do the
                 * opposite of each operation here.
                 */
                switch (tm->op) {
                case BTRFS_MOD_LOG_KEY_REMOVE_WHILE_FREEING:
                        BUG_ON(tm->slot < n);
                        fallthrough;
                case BTRFS_MOD_LOG_KEY_REMOVE_WHILE_MOVING:
                case BTRFS_MOD_LOG_KEY_REMOVE:
                        btrfs_set_node_key(eb, &tm->key, tm->slot);
                        btrfs_set_node_blockptr(eb, tm->slot, tm->blockptr);
                        btrfs_set_node_ptr_generation(eb, tm->slot,
                                                      tm->generation);
                        n++;
                        if (tm->slot > max_slot)
                                max_slot = tm->slot;
                        break;
                case BTRFS_MOD_LOG_KEY_REPLACE:
                        BUG_ON(tm->slot >= n);
                        btrfs_set_node_key(eb, &tm->key, tm->slot);
                        btrfs_set_node_blockptr(eb, tm->slot, tm->blockptr);
                        btrfs_set_node_ptr_generation(eb, tm->slot,
                                                      tm->generation);
                        break;
                case BTRFS_MOD_LOG_KEY_ADD:
                        /*
                         * It is possible we could have already removed keys
                         * behind the known max slot, so this will be an
                         * overestimate. In practice, the copy operation
                         * inserts them in increasing order, and overestimating
                         * just means we miss some warnings, so it's OK. It
                         * isn't worth carefully tracking the full array of
                         * valid slots to check against when moving.
                         */
                        if (tm->slot == max_slot)
                                max_slot--;
                        /* If a move operation is needed it's in the log. */
                        n--;
                        break;
                case BTRFS_MOD_LOG_MOVE_KEYS:
                        ASSERT(tm->move.nr_items > 0);
                        move_src_end_slot = tm->move.dst_slot + tm->move.nr_items - 1;
                        move_dst_end_slot = tm->slot + tm->move.nr_items - 1;
                        o_dst = btrfs_node_key_ptr_offset(eb, tm->slot);
                        o_src = btrfs_node_key_ptr_offset(eb, tm->move.dst_slot);
                        if (WARN_ON(move_src_end_slot > max_slot ||
                                    tm->move.nr_items <= 0)) {
                                btrfs_warn(fs_info,
"move from invalid tree mod log slot eb %llu slot %d dst_slot %d nr_items %d seq %llu n %u max_slot %d",
                                           eb->start, tm->slot,
                                           tm->move.dst_slot, tm->move.nr_items,
                                           tm->seq, n, max_slot);
                        }
                        memmove_extent_buffer(eb, o_dst, o_src,
                                              tm->move.nr_items * p_size);
                        max_slot = move_dst_end_slot;
                        break;
                case BTRFS_MOD_LOG_ROOT_REPLACE:
                        /*
                         * This operation is special. For roots, this must be
                         * handled explicitly before rewinding.
                         * For non-roots, this operation may exist if the node
                         * was a root: root A -> child B; then A gets empty and
                         * B is promoted to the new root. In the mod log, we'll
                         * have a root-replace operation for B, a tree block
                         * that is not a root. We simply ignore that operation.
                         */
                        break;
                }
                next = rb_next(&tm->node);
                if (!next)
                        break;
                tm = rb_entry(next, struct tree_mod_elem, node);
                if (tm->logical != first_tm->logical)
                        break;
        }
        read_unlock(&fs_info->tree_mod_log_lock);
        btrfs_set_header_nritems(eb, n);
}
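
/*
 * Worked example for rewinding a MOVE (values are illustrative): a logged
 * entry with slot == 2, move.dst_slot == 0 and move.nr_items == 3 recorded
 * that three items were moved from slots [2, 4] down to slots [0, 2]. The
 * rewind above performs the opposite memmove, copying the three key pointers
 * from slot 0 (the logged destination) back up to slot 2 (the logged
 * source), after which slots [2, 4] hold the old items again and max_slot
 * becomes move_dst_end_slot == 4.
 */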

/*
 * Called with eb read locked. If the buffer cannot be rewound, the same buffer
 * is returned. If rewind operations happen, a fresh buffer is returned. The
 * returned buffer is always read-locked. If the returned buffer is not the
 * input buffer, the lock on the input buffer is released and the input buffer
 * is freed (its refcount is decremented).
 */
struct extent_buffer *btrfs_tree_mod_log_rewind(struct btrfs_fs_info *fs_info,
                                                struct btrfs_path *path,
                                                struct extent_buffer *eb,
                                                u64 time_seq)
{
        struct extent_buffer *eb_rewin;
        struct tree_mod_elem *tm;

        if (!time_seq)
                return eb;

        if (btrfs_header_level(eb) == 0)
                return eb;

        tm = tree_mod_log_search(fs_info, eb->start, time_seq);
        if (!tm)
                return eb;

        if (tm->op == BTRFS_MOD_LOG_KEY_REMOVE_WHILE_FREEING) {
                BUG_ON(tm->slot != 0);
                eb_rewin = alloc_dummy_extent_buffer(fs_info, eb->start);
                if (!eb_rewin) {
                        btrfs_tree_read_unlock(eb);
                        free_extent_buffer(eb);
                        return NULL;
                }
                btrfs_set_header_bytenr(eb_rewin, eb->start);
                btrfs_set_header_backref_rev(eb_rewin,
                                             btrfs_header_backref_rev(eb));
                btrfs_set_header_owner(eb_rewin, btrfs_header_owner(eb));
                btrfs_set_header_level(eb_rewin, btrfs_header_level(eb));
        } else {
                eb_rewin = btrfs_clone_extent_buffer(eb);
                if (!eb_rewin) {
                        btrfs_tree_read_unlock(eb);
                        free_extent_buffer(eb);
                        return NULL;
                }
        }

        btrfs_tree_read_unlock(eb);
        free_extent_buffer(eb);

        btrfs_set_buffer_lockdep_class(btrfs_header_owner(eb_rewin),
                                       eb_rewin, btrfs_header_level(eb_rewin));
        btrfs_tree_read_lock(eb_rewin);
        tree_mod_log_rewind(fs_info, eb_rewin, time_seq, tm);
        WARN_ON(btrfs_header_nritems(eb_rewin) >
                BTRFS_NODEPTRS_PER_BLOCK(fs_info));

        return eb_rewin;
}

/*
 * Rewind the state of @root's root node to the given @time_seq value.
 * If there are no changes, the current root node is returned. If anything
 * changed in between, there's a fresh buffer allocated on which the rewind
 * operations are done. In any case, the returned buffer is read locked.
 * Returns NULL on error (with no locks held).
 */
struct extent_buffer *btrfs_get_old_root(struct btrfs_root *root, u64 time_seq)
{
        struct btrfs_fs_info *fs_info = root->fs_info;
        struct tree_mod_elem *tm;
        struct extent_buffer *eb = NULL;
        struct extent_buffer *eb_root;
        u64 eb_root_owner = 0;
        struct extent_buffer *old;
        struct tree_mod_root *old_root = NULL;
        u64 old_generation = 0;
        u64 logical;
        int level;

        eb_root = btrfs_read_lock_root_node(root);
        tm = tree_mod_log_oldest_root(eb_root, time_seq);
        if (!tm)
                return eb_root;

        if (tm->op == BTRFS_MOD_LOG_ROOT_REPLACE) {
                old_root = &tm->old_root;
                old_generation = tm->generation;
                logical = old_root->logical;
                level = old_root->level;
        } else {
                logical = eb_root->start;
                level = btrfs_header_level(eb_root);
        }

        tm = tree_mod_log_search(fs_info, logical, time_seq);
        if (old_root && tm && tm->op != BTRFS_MOD_LOG_KEY_REMOVE_WHILE_FREEING) {
                struct btrfs_tree_parent_check check = { 0 };

                btrfs_tree_read_unlock(eb_root);
                free_extent_buffer(eb_root);

                check.level = level;
                check.owner_root = root->root_key.objectid;

                old = read_tree_block(fs_info, logical, &check);
                if (WARN_ON(IS_ERR(old) || !extent_buffer_uptodate(old))) {
                        if (!IS_ERR(old))
                                free_extent_buffer(old);
                        btrfs_warn(fs_info,
                                   "failed to read tree block %llu from get_old_root",
                                   logical);
                } else {
                        struct tree_mod_elem *tm2;

                        btrfs_tree_read_lock(old);
                        eb = btrfs_clone_extent_buffer(old);
                        /*
                         * After the lookup for the most recent tree mod operation
                         * above and before we locked and cloned the extent buffer
                         * 'old', a new tree mod log operation may have been added.
                         * So lookup for a more recent one to make sure the number
                         * of mod log operations we replay is consistent with the
                         * number of items we have in the cloned extent buffer,
                         * otherwise we can hit a BUG_ON when rewinding the extent
                         * buffer.
                         */
                        tm2 = tree_mod_log_search(fs_info, logical, time_seq);
                        btrfs_tree_read_unlock(old);
                        free_extent_buffer(old);
                        ASSERT(tm2);
                        ASSERT(tm2 == tm || tm2->seq > tm->seq);
                        if (!tm2 || tm2->seq < tm->seq) {
                                free_extent_buffer(eb);
                                return NULL;
                        }
                        tm = tm2;
                }
        } else if (old_root) {
                eb_root_owner = btrfs_header_owner(eb_root);
                btrfs_tree_read_unlock(eb_root);
                free_extent_buffer(eb_root);
                eb = alloc_dummy_extent_buffer(fs_info, logical);
        } else {
                eb = btrfs_clone_extent_buffer(eb_root);
                btrfs_tree_read_unlock(eb_root);
                free_extent_buffer(eb_root);
        }

        if (!eb)
                return NULL;
        if (old_root) {
                btrfs_set_header_bytenr(eb, eb->start);
                btrfs_set_header_backref_rev(eb, BTRFS_MIXED_BACKREF_REV);
                btrfs_set_header_owner(eb, eb_root_owner);
                btrfs_set_header_level(eb, old_root->level);
                btrfs_set_header_generation(eb, old_generation);
        }
        btrfs_set_buffer_lockdep_class(btrfs_header_owner(eb), eb,
                                       btrfs_header_level(eb));
        btrfs_tree_read_lock(eb);
        if (tm)
                tree_mod_log_rewind(fs_info, eb, time_seq, tm);
        else
                WARN_ON(btrfs_header_level(eb) != 0);
        WARN_ON(btrfs_header_nritems(eb) > BTRFS_NODEPTRS_PER_BLOCK(fs_info));

        return eb;
}
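
/*
 * A minimal usage sketch for time travel through an old root (illustrative
 * only; the real consumer of this interface is the backref walking code):
 *
 *	struct btrfs_seq_list elem = BTRFS_SEQ_LIST_INIT(elem);
 *	struct extent_buffer *eb;
 *	u64 time_seq;
 *
 *	time_seq = btrfs_get_tree_mod_seq(fs_info, &elem);
 *	// ... tree modifications made from now on are logged ...
 *	eb = btrfs_get_old_root(root, time_seq);
 *	if (eb) {
 *		// eb is read locked; inspect the old root node here.
 *		btrfs_tree_read_unlock(eb);
 *		free_extent_buffer(eb);
 *	}
 *	btrfs_put_tree_mod_seq(fs_info, &elem);
 */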

int btrfs_old_root_level(struct btrfs_root *root, u64 time_seq)
{
        struct tree_mod_elem *tm;
        int level;
        struct extent_buffer *eb_root = btrfs_root_node(root);

        tm = tree_mod_log_oldest_root(eb_root, time_seq);
        if (tm && tm->op == BTRFS_MOD_LOG_ROOT_REPLACE)
                level = tm->old_root.level;
        else
                level = btrfs_header_level(eb_root);

        free_extent_buffer(eb_root);

        return level;
}

/*
 * Return the lowest sequence number in the tree modification log.
 *
 * This corresponds to the sequence number of the oldest tree modification log
 * user, which is the lowest sequence number among all existing users. If there
 * are no users it returns 0.
 */
u64 btrfs_tree_mod_log_lowest_seq(struct btrfs_fs_info *fs_info)
{
        u64 ret = 0;

        read_lock(&fs_info->tree_mod_log_lock);
        if (!list_empty(&fs_info->tree_mod_seq_list)) {
                struct btrfs_seq_list *elem;

                elem = list_first_entry(&fs_info->tree_mod_seq_list,
                                        struct btrfs_seq_list, list);
                ret = elem->seq;
        }
        read_unlock(&fs_info->tree_mod_log_lock);

        return ret;
}
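
/*
 * Hypothetical sketch of the kind of check a caller can build on the helper
 * above ('my_seq' is a placeholder for a sequence number the caller obtained
 * earlier): information tagged with a sequence number may only be discarded
 * once no tree mod log user with an older or equal sequence number remains:
 *
 *	u64 oldest = btrfs_tree_mod_log_lowest_seq(fs_info);
 *
 *	if (oldest != 0 && my_seq >= oldest) {
 *		// A user pinned a sequence number not newer than ours, so
 *		// the information must be kept around for now.
 *		return -EBUSY;
 *	}
 */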